fix(computer-use): harden image-rejection fallback + AUTHOR_MAP

Follow-up to #15328's vision-unsupported retry branch in run_agent.py. _strip_images_from_messages() previously deleted any message whose content was entirely images. That's fine for synthetic user messages injected for attachment delivery, but it breaks providers for tool-role messages — the paired tool_call_id on the preceding assistant message ends up unmatched, which OpenAI-compatible APIs reject with HTTP 400. Fix: tool-role messages whose content becomes empty are replaced with a plaintext placeholder that preserves the tool_call_id linkage. Only non-tool messages are dropped. Added 10 tests covering the role-alternation invariants + image-type coverage. Image-rejection detector: expanded phrase list (image content not supported / multimodal input / vision input / model does not support image) and gated on 4xx status so transient 5xx errors never get misinterpreted as 'server said no to images'. Detection is documented as best-effort English phrase matching. AUTHOR_MAP: mapped 3820588+ddupont808@users.noreply.github.com to ddupont808 so release notes attribute the salvage correctly.
2026-04-28 01:15:46 -07:00 · 2026-04-28 01:15:46 -07:00 · afb9588298
commit afb9588298
parent b4a8031b2e
4 changed files with 288 additions and 5 deletions
--- a/run_agent.py
+++ b/run_agent.py
@ -813,6 +813,15 @@ def _strip_images_from_messages(messages: list) -> bool:
    "Only 'text' content type is supported.").  Mutates messages so the
    next API call sends text only.
    Preserves message alternation invariants:
      * ``tool``-role messages whose content was entirely images are replaced
        with a plaintext placeholder, NOT deleted — deleting them would leave
        the paired ``tool_call_id`` on the prior assistant message unmatched,
        which providers reject with HTTP 400.
      * Non-tool messages whose content becomes empty are dropped.  In
        practice this only hits synthetic image-only user messages appended
        for attachment delivery; real user turns always include text.
    Returns True if any image parts were removed.
    """
    found = False
@ -832,9 +841,13 @@ def _strip_images_from_messages(messages: list) -> bool:
        if len(new_parts) < len(content):
            if new_parts:
                msg["content"] = new_parts
            elif msg.get("role") == "tool":
                # Preserve tool_call_id linkage — providers require every
                # assistant tool_call to have a matching tool response.
                msg["content"] = "[image content removed — server does not support images]"
            else:
-                # Entire message was images — drop it (user messages added for
+                # Synthetic image-only user/assistant message with no text;
-                # image delivery only, e.g. the deferred injection messages).
+                # safe to drop.
                to_delete.append(i)
    for i in reversed(to_delete):
        del messages[i]
@ -11365,11 +11378,18 @@ class AIAgent:
                            continue
                    # ── Image-rejection recovery ──────────────────────────────
-                    # Some providers (mlx-lm, text-only endpoints) reject any
+                    # Some providers (mlx-lm, text-only endpoints, text-only
-                    # message that contains image_url content with an error like
+                    # fallbacks on multimodal models) reject any message that
                    # contains image_url content with a 4xx error like
                    # "Only 'text' content type is supported."  On first hit,
                    # strip all images from the message list, mark the session
                    # as vision-unsupported, and retry with text only.
                    #
                    # Detection is best-effort English phrase matching — a
                    # locale-translated or heavily-reworded upstream error
                    # will bypass this guard and fall through to the normal
                    # error handler.  Expand the phrase list when new
                    # provider wordings are observed in the wild.
                    _err_body = ""
                    try:
                        _err_body = str(getattr(api_error, "body", None) or
@ -11377,17 +11397,35 @@ class AIAgent:
                                        str(api_error))
                    except Exception:
                        pass
                    _err_status = getattr(api_error, "status_code", None)
                    _IMAGE_REJECTION_PHRASES = (
                        "only 'text' content type is supported",
                        "only text content type is supported",
                        "image_url is not supported",
                        "image content is not supported",
                        "multimodal is not supported",
                        "multimodal content is not supported",
                        "multimodal input is not supported",
                        "vision is not supported",
                        "vision input is not supported",
                        "does not support images",
                        "does not support image input",
                        "does not support multimodal",
                        "does not support vision",
                        "model does not support image",
                    )
                    _err_lower = _err_body.lower()
                    _looks_like_image_rejection = any(
                        p in _err_lower for p in _IMAGE_REJECTION_PHRASES
                    )
                    # 4xx-only gate: never interpret 5xx/timeout as "server
                    # said no to images" — those are transient and must
                    # route to the normal retry path.
                    _status_ok = _err_status is None or (400 <= int(_err_status) < 500)
                    if (
                        getattr(self, "_vision_supported", True)
-                        and any(p in _err_body.lower() for p in _IMAGE_REJECTION_PHRASES)
+                        and _looks_like_image_rejection
                        and _status_ok
                    ):
                        self._vision_supported = False
                        _imgs_removed = _strip_images_from_messages(messages)
--- a/scripts/release.py
+++ b/scripts/release.py
@ -123,6 +123,7 @@ AUTHOR_MAP = {
    "104278804+Sertug17@users.noreply.github.com": "Sertug17",
    "112503481+caentzminger@users.noreply.github.com": "caentzminger",
    "258577966+voidborne-d@users.noreply.github.com": "voidborne-d",
    "3820588+ddupont808@users.noreply.github.com": "ddupont808",
    "liusway405@gmail.com": "voidborne-d",
    "xydarcher@uestc.edu.cn": "Readon",
    "sir_even@icloud.com": "sirEven",
--- a/tests/run_agent/test_image_rejection_fallback.py
+++ b/tests/run_agent/test_image_rejection_fallback.py
@ -0,0 +1,243 @@
 """Tests for the image-rejection fallback in run_agent.
 When a server rejects image content (e.g. text-only endpoints), the agent
 strips image parts from message history and retries text-only.  These tests
 verify that stripping preserves the role-alternation invariants providers
 require, and that the phrase detector fires on the expected error bodies.
 """
 from run_agent import _strip_images_from_messages
 class TestStripImagesPreservesAlternation:
    """_strip_images_from_messages must not break message role alternation."""
    def test_noop_when_no_images(self):
        msgs = [
            {"role": "user", "content": "hello"},
            {"role": "assistant", "content": "hi"},
        ]
        changed = _strip_images_from_messages(msgs)
        assert changed is False
        assert msgs == [
            {"role": "user", "content": "hello"},
            {"role": "assistant", "content": "hi"},
        ]
    def test_string_content_untouched(self):
        """String content passes through — only list content is inspected."""
        msgs = [{"role": "user", "content": "just text"}]
        changed = _strip_images_from_messages(msgs)
        assert changed is False
        assert msgs[0]["content"] == "just text"
    def test_strips_image_url_part_preserves_text(self):
        msgs = [{
            "role": "user",
            "content": [
                {"type": "text", "text": "describe"},
                {"type": "image_url", "image_url": {"url": "data:image/png;base64,abc"}},
            ],
        }]
        changed = _strip_images_from_messages(msgs)
        assert changed is True
        assert msgs[0]["content"] == [{"type": "text", "text": "describe"}]
    def test_strips_all_recognized_image_types(self):
        msgs = [{
            "role": "user",
            "content": [
                {"type": "text", "text": "hi"},
                {"type": "image_url", "image_url": {}},
                {"type": "image", "source": {}},
                {"type": "input_image", "image_url": "http://x"},
            ],
        }]
        changed = _strip_images_from_messages(msgs)
        assert changed is True
        assert msgs[0]["content"] == [{"type": "text", "text": "hi"}]
    def test_tool_message_with_all_images_replaced_not_deleted(self):
        """CRITICAL: tool messages must NEVER be deleted — their tool_call_id
        pairs with an assistant tool_call and providers reject unmatched IDs.
        """
        msgs = [
            {"role": "user", "content": "take a screenshot"},
            {
                "role": "assistant",
                "content": None,
                "tool_calls": [{
                    "id": "call_abc",
                    "type": "function",
                    "function": {"name": "computer_use", "arguments": "{}"},
                }],
            },
            {
                "role": "tool",
                "tool_call_id": "call_abc",
                "content": [
                    {"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}},
                ],
            },
        ]
        changed = _strip_images_from_messages(msgs)
        assert changed is True
        # Length preserved — tool message NOT deleted
        assert len(msgs) == 3
        # tool_call_id still present
        assert msgs[2]["tool_call_id"] == "call_abc"
        # Content replaced with text placeholder (now a string, not a list)
        assert isinstance(msgs[2]["content"], str)
        assert "image content removed" in msgs[2]["content"].lower()
    def test_tool_message_with_mixed_content_keeps_text_parts(self):
        msgs = [
            {"role": "user", "content": "screenshot plz"},
            {
                "role": "assistant",
                "content": None,
                "tool_calls": [{"id": "call_1", "type": "function", "function": {"name": "x", "arguments": "{}"}}],
            },
            {
                "role": "tool",
                "tool_call_id": "call_1",
                "content": [
                    {"type": "text", "text": "Captured 1024x768"},
                    {"type": "image_url", "image_url": {"url": "data:..."}},
                ],
            },
        ]
        changed = _strip_images_from_messages(msgs)
        assert changed is True
        assert len(msgs) == 3
        assert msgs[2]["content"] == [{"type": "text", "text": "Captured 1024x768"}]
        assert msgs[2]["tool_call_id"] == "call_1"
    def test_image_only_user_message_dropped(self):
        """Synthetic image-only user messages (gateway injection pattern) are
        safe to drop — no tool_call_id linkage to preserve."""
        msgs = [
            {"role": "user", "content": "what's in this?"},
            {"role": "assistant", "content": "I'll check."},
            {
                "role": "user",
                "content": [{"type": "image_url", "image_url": {"url": "data:..."}}],
            },
        ]
        changed = _strip_images_from_messages(msgs)
        assert changed is True
        # Synthetic image-only user message dropped
        assert len(msgs) == 2
        assert msgs[-1]["role"] == "assistant"
    def test_multiple_tool_messages_all_preserved(self):
        """Parallel tool calls: each tool_call_id must retain a paired message."""
        msgs = [
            {
                "role": "assistant",
                "content": None,
                "tool_calls": [
                    {"id": "c1", "type": "function", "function": {"name": "x", "arguments": "{}"}},
                    {"id": "c2", "type": "function", "function": {"name": "x", "arguments": "{}"}},
                ],
            },
            {
                "role": "tool",
                "tool_call_id": "c1",
                "content": [{"type": "image_url", "image_url": {}}],
            },
            {
                "role": "tool",
                "tool_call_id": "c2",
                "content": [{"type": "image_url", "image_url": {}}],
            },
        ]
        changed = _strip_images_from_messages(msgs)
        assert changed is True
        tool_msgs = [m for m in msgs if m.get("role") == "tool"]
        assert len(tool_msgs) == 2
        assert {m["tool_call_id"] for m in tool_msgs} == {"c1", "c2"}
    def test_returns_false_when_nothing_changed(self):
        msgs = [
            {"role": "user", "content": [{"type": "text", "text": "hi"}]},
            {"role": "assistant", "content": "hello"},
        ]
        assert _strip_images_from_messages(msgs) is False
    def test_handles_non_dict_entries_gracefully(self):
        msgs = [None, "not a dict", {"role": "user", "content": "ok"}]
        # Must not raise
        changed = _strip_images_from_messages(msgs)
        assert changed is False
 class TestImageRejectionPhraseIsolation:
    """The image-rejection phrase list must NOT false-match on other
    image-related error categories (size-too-large, format errors, etc.)
    so they route to the correct recovery handler (e.g. _try_shrink_image_parts).
    """
    # Reproduces the phrase list used in run_agent.py's error-handler block.
    _REJECTION_PHRASES = (
        "only 'text' content type is supported",
        "only text content type is supported",
        "image_url is not supported",
        "image content is not supported",
        "multimodal is not supported",
        "multimodal content is not supported",
        "multimodal input is not supported",
        "vision is not supported",
        "vision input is not supported",
        "does not support images",
        "does not support image input",
        "does not support multimodal",
        "does not support vision",
        "model does not support image",
    )
    def _matches(self, body: str) -> bool:
        low = body.lower()
        return any(p in low for p in self._REJECTION_PHRASES)
    def test_anthropic_image_too_large_does_not_trip(self):
        # From agent/error_classifier.py _IMAGE_TOO_LARGE_PATTERNS —
        # these must route to image_too_large / _try_shrink_image_parts_in_messages,
        # NOT to our vision-unsupported fallback.
        bodies = [
            "messages.0.content.1.image.source.base64: image exceeds 5 MB maximum",
            "image too large: 6291456 bytes > 5242880 limit",
            "image_too_large",
            "image size exceeds per-request limit",
        ]
        for body in bodies:
            assert self._matches(body) is False, f"false positive on: {body}"
    def test_context_overflow_does_not_trip(self):
        bodies = [
            "This model's maximum context length is 200000 tokens.",
            "Request too large: max tokens per request is 200000",
            "The input exceeds the context window.",
        ]
        for body in bodies:
            assert self._matches(body) is False, f"false positive on: {body}"
    def test_rate_limit_does_not_trip(self):
        bodies = [
            "rate limit reached for requests",
            "You exceeded your current quota",
        ]
        for body in bodies:
            assert self._matches(body) is False
    def test_real_image_rejection_bodies_trip(self):
        """Positive cases — real-world error wordings that should trigger."""
        bodies = [
            "Only 'text' content type is supported.",
            "Bad request: multimodal is not supported by this model",
            "This model does not support images",
            "vision is not supported on this endpoint",
            "model does not support image input",
        ]
        for body in bodies:
            assert self._matches(body) is True, f"false negative on: {body}"
--- a/tests/tools/test_registry.py
+++ b/tests/tools/test_registry.py
@ -296,6 +296,7 @@ class TestBuiltinDiscovery:
            "tools.browser_tool",
            "tools.clarify_tool",
            "tools.code_execution_tool",
            "tools.computer_use_tool",
            "tools.cronjob_tools",
            "tools.delegate_tool",
            "tools.discord_tool",