From afb95882985199339b236011694284b5701f620d Mon Sep 17 00:00:00 2001
From: Teknium <teknium1@gmail.com>
Date: Tue, 28 Apr 2026 01:15:46 -0700
Subject: [PATCH] fix(computer-use): harden image-rejection fallback +
 AUTHOR_MAP
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Follow-up to #15328's vision-unsupported retry branch in run_agent.py.

_strip_images_from_messages() previously deleted any message whose content
was entirely images. That's fine for synthetic user messages injected for
attachment delivery, but it breaks providers for tool-role messages — the
paired tool_call_id on the preceding assistant message ends up unmatched,
which OpenAI-compatible APIs reject with HTTP 400.

Fix: tool-role messages whose content becomes empty are replaced with a
plaintext placeholder that preserves the tool_call_id linkage. Only
non-tool messages are dropped. Added 10 tests covering the role-alternation
invariants + image-type coverage.

Image-rejection detector: expanded phrase list (image content not
supported / multimodal input / vision input / model does not support
image) and gated on 4xx status so transient 5xx errors never get
misinterpreted as 'server said no to images'. Detection is documented as
best-effort English phrase matching.

AUTHOR_MAP: mapped 3820588+ddupont808@users.noreply.github.com to
ddupont808 so release notes attribute the salvage correctly.
---
 run_agent.py                                  |  48 +++-
 scripts/release.py                            |   1 +
 .../test_image_rejection_fallback.py          | 243 ++++++++++++++++++
 tests/tools/test_registry.py                  |   1 +
 4 files changed, 288 insertions(+), 5 deletions(-)
 create mode 100644 tests/run_agent/test_image_rejection_fallback.py

diff --git a/run_agent.py b/run_agent.py
index 55cb6e4b..153e7f5b 100644
--- a/run_agent.py
+++ b/run_agent.py
@@ -813,6 +813,15 @@ def _strip_images_from_messages(messages: list) -> bool:
     "Only 'text' content type is supported.").  Mutates messages so the
     next API call sends text only.
 
+    Preserves message alternation invariants:
+      * ``tool``-role messages whose content was entirely images are replaced
+        with a plaintext placeholder, NOT deleted — deleting them would leave
+        the paired ``tool_call_id`` on the prior assistant message unmatched,
+        which providers reject with HTTP 400.
+      * Non-tool messages whose content becomes empty are dropped.  In
+        practice this only hits synthetic image-only user messages appended
+        for attachment delivery; real user turns always include text.
+
     Returns True if any image parts were removed.
     """
     found = False
@@ -832,9 +841,13 @@ def _strip_images_from_messages(messages: list) -> bool:
         if len(new_parts) < len(content):
             if new_parts:
                 msg["content"] = new_parts
+            elif msg.get("role") == "tool":
+                # Preserve tool_call_id linkage — providers require every
+                # assistant tool_call to have a matching tool response.
+                msg["content"] = "[image content removed — server does not support images]"
             else:
-                # Entire message was images — drop it (user messages added for
-                # image delivery only, e.g. the deferred injection messages).
+                # Synthetic image-only user/assistant message with no text;
+                # safe to drop.
                 to_delete.append(i)
     for i in reversed(to_delete):
         del messages[i]
@@ -11365,11 +11378,18 @@ class AIAgent:
                             continue
 
                     # ── Image-rejection recovery ──────────────────────────────
-                    # Some providers (mlx-lm, text-only endpoints) reject any
-                    # message that contains image_url content with an error like
+                    # Some providers (mlx-lm, text-only endpoints, text-only
+                    # fallbacks on multimodal models) reject any message that
+                    # contains image_url content with a 4xx error like
                     # "Only 'text' content type is supported."  On first hit,
                     # strip all images from the message list, mark the session
                     # as vision-unsupported, and retry with text only.
+                    #
+                    # Detection is best-effort English phrase matching — a
+                    # locale-translated or heavily-reworded upstream error
+                    # will bypass this guard and fall through to the normal
+                    # error handler.  Expand the phrase list when new
+                    # provider wordings are observed in the wild.
                     _err_body = ""
                     try:
                         _err_body = str(getattr(api_error, "body", None) or
@@ -11377,17 +11397,35 @@ class AIAgent:
                                         str(api_error))
                     except Exception:
                         pass
+                    _err_status = getattr(api_error, "status_code", None)
                     _IMAGE_REJECTION_PHRASES = (
                         "only 'text' content type is supported",
                         "only text content type is supported",
                         "image_url is not supported",
+                        "image content is not supported",
                         "multimodal is not supported",
+                        "multimodal content is not supported",
+                        "multimodal input is not supported",
                         "vision is not supported",
+                        "vision input is not supported",
                         "does not support images",
+                        "does not support image input",
+                        "does not support multimodal",
+                        "does not support vision",
+                        "model does not support image",
                     )
+                    _err_lower = _err_body.lower()
+                    _looks_like_image_rejection = any(
+                        p in _err_lower for p in _IMAGE_REJECTION_PHRASES
+                    )
+                    # 4xx-only gate: never interpret 5xx/timeout as "server
+                    # said no to images" — those are transient and must
+                    # route to the normal retry path.
+                    _status_ok = _err_status is None or (400 <= int(_err_status) < 500)
                     if (
                         getattr(self, "_vision_supported", True)
-                        and any(p in _err_body.lower() for p in _IMAGE_REJECTION_PHRASES)
+                        and _looks_like_image_rejection
+                        and _status_ok
                     ):
                         self._vision_supported = False
                         _imgs_removed = _strip_images_from_messages(messages)
diff --git a/scripts/release.py b/scripts/release.py
index 5af48dca..08d6cc37 100755
--- a/scripts/release.py
+++ b/scripts/release.py
@@ -123,6 +123,7 @@ AUTHOR_MAP = {
     "104278804+Sertug17@users.noreply.github.com": "Sertug17",
     "112503481+caentzminger@users.noreply.github.com": "caentzminger",
     "258577966+voidborne-d@users.noreply.github.com": "voidborne-d",
+    "3820588+ddupont808@users.noreply.github.com": "ddupont808",
     "liusway405@gmail.com": "voidborne-d",
     "xydarcher@uestc.edu.cn": "Readon",
     "sir_even@icloud.com": "sirEven",
diff --git a/tests/run_agent/test_image_rejection_fallback.py b/tests/run_agent/test_image_rejection_fallback.py
new file mode 100644
index 00000000..e52719d9
--- /dev/null
+++ b/tests/run_agent/test_image_rejection_fallback.py
@@ -0,0 +1,243 @@
+"""Tests for the image-rejection fallback in run_agent.
+
+When a server rejects image content (e.g. text-only endpoints), the agent
+strips image parts from message history and retries text-only.  These tests
+verify that stripping preserves the role-alternation invariants providers
+require, and that the phrase detector fires on the expected error bodies.
+"""
+
+from run_agent import _strip_images_from_messages
+
+
+class TestStripImagesPreservesAlternation:
+    """_strip_images_from_messages must not break message role alternation."""
+
+    def test_noop_when_no_images(self):
+        msgs = [
+            {"role": "user", "content": "hello"},
+            {"role": "assistant", "content": "hi"},
+        ]
+        changed = _strip_images_from_messages(msgs)
+        assert changed is False
+        assert msgs == [
+            {"role": "user", "content": "hello"},
+            {"role": "assistant", "content": "hi"},
+        ]
+
+    def test_string_content_untouched(self):
+        """String content passes through — only list content is inspected."""
+        msgs = [{"role": "user", "content": "just text"}]
+        changed = _strip_images_from_messages(msgs)
+        assert changed is False
+        assert msgs[0]["content"] == "just text"
+
+    def test_strips_image_url_part_preserves_text(self):
+        msgs = [{
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "describe"},
+                {"type": "image_url", "image_url": {"url": "data:image/png;base64,abc"}},
+            ],
+        }]
+        changed = _strip_images_from_messages(msgs)
+        assert changed is True
+        assert msgs[0]["content"] == [{"type": "text", "text": "describe"}]
+
+    def test_strips_all_recognized_image_types(self):
+        msgs = [{
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "hi"},
+                {"type": "image_url", "image_url": {}},
+                {"type": "image", "source": {}},
+                {"type": "input_image", "image_url": "http://x"},
+            ],
+        }]
+        changed = _strip_images_from_messages(msgs)
+        assert changed is True
+        assert msgs[0]["content"] == [{"type": "text", "text": "hi"}]
+
+    def test_tool_message_with_all_images_replaced_not_deleted(self):
+        """CRITICAL: tool messages must NEVER be deleted — their tool_call_id
+        pairs with an assistant tool_call and providers reject unmatched IDs.
+        """
+        msgs = [
+            {"role": "user", "content": "take a screenshot"},
+            {
+                "role": "assistant",
+                "content": None,
+                "tool_calls": [{
+                    "id": "call_abc",
+                    "type": "function",
+                    "function": {"name": "computer_use", "arguments": "{}"},
+                }],
+            },
+            {
+                "role": "tool",
+                "tool_call_id": "call_abc",
+                "content": [
+                    {"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}},
+                ],
+            },
+        ]
+        changed = _strip_images_from_messages(msgs)
+        assert changed is True
+        # Length preserved — tool message NOT deleted
+        assert len(msgs) == 3
+        # tool_call_id still present
+        assert msgs[2]["tool_call_id"] == "call_abc"
+        # Content replaced with text placeholder (now a string, not a list)
+        assert isinstance(msgs[2]["content"], str)
+        assert "image content removed" in msgs[2]["content"].lower()
+
+    def test_tool_message_with_mixed_content_keeps_text_parts(self):
+        msgs = [
+            {"role": "user", "content": "screenshot plz"},
+            {
+                "role": "assistant",
+                "content": None,
+                "tool_calls": [{"id": "call_1", "type": "function", "function": {"name": "x", "arguments": "{}"}}],
+            },
+            {
+                "role": "tool",
+                "tool_call_id": "call_1",
+                "content": [
+                    {"type": "text", "text": "Captured 1024x768"},
+                    {"type": "image_url", "image_url": {"url": "data:..."}},
+                ],
+            },
+        ]
+        changed = _strip_images_from_messages(msgs)
+        assert changed is True
+        assert len(msgs) == 3
+        assert msgs[2]["content"] == [{"type": "text", "text": "Captured 1024x768"}]
+        assert msgs[2]["tool_call_id"] == "call_1"
+
+    def test_image_only_user_message_dropped(self):
+        """Synthetic image-only user messages (gateway injection pattern) are
+        safe to drop — no tool_call_id linkage to preserve."""
+        msgs = [
+            {"role": "user", "content": "what's in this?"},
+            {"role": "assistant", "content": "I'll check."},
+            {
+                "role": "user",
+                "content": [{"type": "image_url", "image_url": {"url": "data:..."}}],
+            },
+        ]
+        changed = _strip_images_from_messages(msgs)
+        assert changed is True
+        # Synthetic image-only user message dropped
+        assert len(msgs) == 2
+        assert msgs[-1]["role"] == "assistant"
+
+    def test_multiple_tool_messages_all_preserved(self):
+        """Parallel tool calls: each tool_call_id must retain a paired message."""
+        msgs = [
+            {
+                "role": "assistant",
+                "content": None,
+                "tool_calls": [
+                    {"id": "c1", "type": "function", "function": {"name": "x", "arguments": "{}"}},
+                    {"id": "c2", "type": "function", "function": {"name": "x", "arguments": "{}"}},
+                ],
+            },
+            {
+                "role": "tool",
+                "tool_call_id": "c1",
+                "content": [{"type": "image_url", "image_url": {}}],
+            },
+            {
+                "role": "tool",
+                "tool_call_id": "c2",
+                "content": [{"type": "image_url", "image_url": {}}],
+            },
+        ]
+        changed = _strip_images_from_messages(msgs)
+        assert changed is True
+        tool_msgs = [m for m in msgs if m.get("role") == "tool"]
+        assert len(tool_msgs) == 2
+        assert {m["tool_call_id"] for m in tool_msgs} == {"c1", "c2"}
+
+    def test_returns_false_when_nothing_changed(self):
+        msgs = [
+            {"role": "user", "content": [{"type": "text", "text": "hi"}]},
+            {"role": "assistant", "content": "hello"},
+        ]
+        assert _strip_images_from_messages(msgs) is False
+
+    def test_handles_non_dict_entries_gracefully(self):
+        msgs = [None, "not a dict", {"role": "user", "content": "ok"}]
+        # Must not raise
+        changed = _strip_images_from_messages(msgs)
+        assert changed is False
+
+
+class TestImageRejectionPhraseIsolation:
+    """The image-rejection phrase list must NOT false-match on other
+    image-related error categories (size-too-large, format errors, etc.)
+    so they route to the correct recovery handler (e.g. _try_shrink_image_parts).
+    """
+
+    # Reproduces the phrase list used in run_agent.py's error-handler block.
+    _REJECTION_PHRASES = (
+        "only 'text' content type is supported",
+        "only text content type is supported",
+        "image_url is not supported",
+        "image content is not supported",
+        "multimodal is not supported",
+        "multimodal content is not supported",
+        "multimodal input is not supported",
+        "vision is not supported",
+        "vision input is not supported",
+        "does not support images",
+        "does not support image input",
+        "does not support multimodal",
+        "does not support vision",
+        "model does not support image",
+    )
+
+    def _matches(self, body: str) -> bool:
+        low = body.lower()
+        return any(p in low for p in self._REJECTION_PHRASES)
+
+    def test_anthropic_image_too_large_does_not_trip(self):
+        # From agent/error_classifier.py _IMAGE_TOO_LARGE_PATTERNS —
+        # these must route to image_too_large / _try_shrink_image_parts_in_messages,
+        # NOT to our vision-unsupported fallback.
+        bodies = [
+            "messages.0.content.1.image.source.base64: image exceeds 5 MB maximum",
+            "image too large: 6291456 bytes > 5242880 limit",
+            "image_too_large",
+            "image size exceeds per-request limit",
+        ]
+        for body in bodies:
+            assert self._matches(body) is False, f"false positive on: {body}"
+
+    def test_context_overflow_does_not_trip(self):
+        bodies = [
+            "This model's maximum context length is 200000 tokens.",
+            "Request too large: max tokens per request is 200000",
+            "The input exceeds the context window.",
+        ]
+        for body in bodies:
+            assert self._matches(body) is False, f"false positive on: {body}"
+
+    def test_rate_limit_does_not_trip(self):
+        bodies = [
+            "rate limit reached for requests",
+            "You exceeded your current quota",
+        ]
+        for body in bodies:
+            assert self._matches(body) is False
+
+    def test_real_image_rejection_bodies_trip(self):
+        """Positive cases — real-world error wordings that should trigger."""
+        bodies = [
+            "Only 'text' content type is supported.",
+            "Bad request: multimodal is not supported by this model",
+            "This model does not support images",
+            "vision is not supported on this endpoint",
+            "model does not support image input",
+        ]
+        for body in bodies:
+            assert self._matches(body) is True, f"false negative on: {body}"
diff --git a/tests/tools/test_registry.py b/tests/tools/test_registry.py
index 3c753f64..b8b631df 100644
--- a/tests/tools/test_registry.py
+++ b/tests/tools/test_registry.py
@@ -296,6 +296,7 @@ class TestBuiltinDiscovery:
             "tools.browser_tool",
             "tools.clarify_tool",
             "tools.code_execution_tool",
+            "tools.computer_use_tool",
             "tools.cronjob_tools",
             "tools.delegate_tool",
             "tools.discord_tool",