From afb95882985199339b236011694284b5701f620d Mon Sep 17 00:00:00 2001 From: Teknium Date: Tue, 28 Apr 2026 01:15:46 -0700 Subject: [PATCH] fix(computer-use): harden image-rejection fallback + AUTHOR_MAP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Follow-up to #15328's vision-unsupported retry branch in run_agent.py. _strip_images_from_messages() previously deleted any message whose content was entirely images. That's fine for synthetic user messages injected for attachment delivery, but it breaks providers for tool-role messages — the paired tool_call_id on the preceding assistant message ends up unmatched, which OpenAI-compatible APIs reject with HTTP 400. Fix: tool-role messages whose content becomes empty are replaced with a plaintext placeholder that preserves the tool_call_id linkage. Only non-tool messages are dropped. Added 10 tests covering the role-alternation invariants + image-type coverage. Image-rejection detector: expanded phrase list (image content not supported / multimodal input / vision input / model does not support image) and gated on 4xx status so transient 5xx errors never get misinterpreted as 'server said no to images'. Detection is documented as best-effort English phrase matching. AUTHOR_MAP: mapped 3820588+ddupont808@users.noreply.github.com to ddupont808 so release notes attribute the salvage correctly. --- run_agent.py | 48 +++- scripts/release.py | 1 + .../test_image_rejection_fallback.py | 243 ++++++++++++++++++ tests/tools/test_registry.py | 1 + 4 files changed, 288 insertions(+), 5 deletions(-) create mode 100644 tests/run_agent/test_image_rejection_fallback.py diff --git a/run_agent.py b/run_agent.py index 55cb6e4b..153e7f5b 100644 --- a/run_agent.py +++ b/run_agent.py @@ -813,6 +813,15 @@ def _strip_images_from_messages(messages: list) -> bool: "Only 'text' content type is supported."). Mutates messages so the next API call sends text only. + Preserves message alternation invariants: + * ``tool``-role messages whose content was entirely images are replaced + with a plaintext placeholder, NOT deleted — deleting them would leave + the paired ``tool_call_id`` on the prior assistant message unmatched, + which providers reject with HTTP 400. + * Non-tool messages whose content becomes empty are dropped. In + practice this only hits synthetic image-only user messages appended + for attachment delivery; real user turns always include text. + Returns True if any image parts were removed. """ found = False @@ -832,9 +841,13 @@ def _strip_images_from_messages(messages: list) -> bool: if len(new_parts) < len(content): if new_parts: msg["content"] = new_parts + elif msg.get("role") == "tool": + # Preserve tool_call_id linkage — providers require every + # assistant tool_call to have a matching tool response. + msg["content"] = "[image content removed — server does not support images]" else: - # Entire message was images — drop it (user messages added for - # image delivery only, e.g. the deferred injection messages). + # Synthetic image-only user/assistant message with no text; + # safe to drop. to_delete.append(i) for i in reversed(to_delete): del messages[i] @@ -11365,11 +11378,18 @@ class AIAgent: continue # ── Image-rejection recovery ────────────────────────────── - # Some providers (mlx-lm, text-only endpoints) reject any - # message that contains image_url content with an error like + # Some providers (mlx-lm, text-only endpoints, text-only + # fallbacks on multimodal models) reject any message that + # contains image_url content with a 4xx error like # "Only 'text' content type is supported." On first hit, # strip all images from the message list, mark the session # as vision-unsupported, and retry with text only. + # + # Detection is best-effort English phrase matching — a + # locale-translated or heavily-reworded upstream error + # will bypass this guard and fall through to the normal + # error handler. Expand the phrase list when new + # provider wordings are observed in the wild. _err_body = "" try: _err_body = str(getattr(api_error, "body", None) or @@ -11377,17 +11397,35 @@ class AIAgent: str(api_error)) except Exception: pass + _err_status = getattr(api_error, "status_code", None) _IMAGE_REJECTION_PHRASES = ( "only 'text' content type is supported", "only text content type is supported", "image_url is not supported", + "image content is not supported", "multimodal is not supported", + "multimodal content is not supported", + "multimodal input is not supported", "vision is not supported", + "vision input is not supported", "does not support images", + "does not support image input", + "does not support multimodal", + "does not support vision", + "model does not support image", ) + _err_lower = _err_body.lower() + _looks_like_image_rejection = any( + p in _err_lower for p in _IMAGE_REJECTION_PHRASES + ) + # 4xx-only gate: never interpret 5xx/timeout as "server + # said no to images" — those are transient and must + # route to the normal retry path. + _status_ok = _err_status is None or (400 <= int(_err_status) < 500) if ( getattr(self, "_vision_supported", True) - and any(p in _err_body.lower() for p in _IMAGE_REJECTION_PHRASES) + and _looks_like_image_rejection + and _status_ok ): self._vision_supported = False _imgs_removed = _strip_images_from_messages(messages) diff --git a/scripts/release.py b/scripts/release.py index 5af48dca..08d6cc37 100755 --- a/scripts/release.py +++ b/scripts/release.py @@ -123,6 +123,7 @@ AUTHOR_MAP = { "104278804+Sertug17@users.noreply.github.com": "Sertug17", "112503481+caentzminger@users.noreply.github.com": "caentzminger", "258577966+voidborne-d@users.noreply.github.com": "voidborne-d", + "3820588+ddupont808@users.noreply.github.com": "ddupont808", "liusway405@gmail.com": "voidborne-d", "xydarcher@uestc.edu.cn": "Readon", "sir_even@icloud.com": "sirEven", diff --git a/tests/run_agent/test_image_rejection_fallback.py b/tests/run_agent/test_image_rejection_fallback.py new file mode 100644 index 00000000..e52719d9 --- /dev/null +++ b/tests/run_agent/test_image_rejection_fallback.py @@ -0,0 +1,243 @@ +"""Tests for the image-rejection fallback in run_agent. + +When a server rejects image content (e.g. text-only endpoints), the agent +strips image parts from message history and retries text-only. These tests +verify that stripping preserves the role-alternation invariants providers +require, and that the phrase detector fires on the expected error bodies. +""" + +from run_agent import _strip_images_from_messages + + +class TestStripImagesPreservesAlternation: + """_strip_images_from_messages must not break message role alternation.""" + + def test_noop_when_no_images(self): + msgs = [ + {"role": "user", "content": "hello"}, + {"role": "assistant", "content": "hi"}, + ] + changed = _strip_images_from_messages(msgs) + assert changed is False + assert msgs == [ + {"role": "user", "content": "hello"}, + {"role": "assistant", "content": "hi"}, + ] + + def test_string_content_untouched(self): + """String content passes through — only list content is inspected.""" + msgs = [{"role": "user", "content": "just text"}] + changed = _strip_images_from_messages(msgs) + assert changed is False + assert msgs[0]["content"] == "just text" + + def test_strips_image_url_part_preserves_text(self): + msgs = [{ + "role": "user", + "content": [ + {"type": "text", "text": "describe"}, + {"type": "image_url", "image_url": {"url": "data:image/png;base64,abc"}}, + ], + }] + changed = _strip_images_from_messages(msgs) + assert changed is True + assert msgs[0]["content"] == [{"type": "text", "text": "describe"}] + + def test_strips_all_recognized_image_types(self): + msgs = [{ + "role": "user", + "content": [ + {"type": "text", "text": "hi"}, + {"type": "image_url", "image_url": {}}, + {"type": "image", "source": {}}, + {"type": "input_image", "image_url": "http://x"}, + ], + }] + changed = _strip_images_from_messages(msgs) + assert changed is True + assert msgs[0]["content"] == [{"type": "text", "text": "hi"}] + + def test_tool_message_with_all_images_replaced_not_deleted(self): + """CRITICAL: tool messages must NEVER be deleted — their tool_call_id + pairs with an assistant tool_call and providers reject unmatched IDs. + """ + msgs = [ + {"role": "user", "content": "take a screenshot"}, + { + "role": "assistant", + "content": None, + "tool_calls": [{ + "id": "call_abc", + "type": "function", + "function": {"name": "computer_use", "arguments": "{}"}, + }], + }, + { + "role": "tool", + "tool_call_id": "call_abc", + "content": [ + {"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}}, + ], + }, + ] + changed = _strip_images_from_messages(msgs) + assert changed is True + # Length preserved — tool message NOT deleted + assert len(msgs) == 3 + # tool_call_id still present + assert msgs[2]["tool_call_id"] == "call_abc" + # Content replaced with text placeholder (now a string, not a list) + assert isinstance(msgs[2]["content"], str) + assert "image content removed" in msgs[2]["content"].lower() + + def test_tool_message_with_mixed_content_keeps_text_parts(self): + msgs = [ + {"role": "user", "content": "screenshot plz"}, + { + "role": "assistant", + "content": None, + "tool_calls": [{"id": "call_1", "type": "function", "function": {"name": "x", "arguments": "{}"}}], + }, + { + "role": "tool", + "tool_call_id": "call_1", + "content": [ + {"type": "text", "text": "Captured 1024x768"}, + {"type": "image_url", "image_url": {"url": "data:..."}}, + ], + }, + ] + changed = _strip_images_from_messages(msgs) + assert changed is True + assert len(msgs) == 3 + assert msgs[2]["content"] == [{"type": "text", "text": "Captured 1024x768"}] + assert msgs[2]["tool_call_id"] == "call_1" + + def test_image_only_user_message_dropped(self): + """Synthetic image-only user messages (gateway injection pattern) are + safe to drop — no tool_call_id linkage to preserve.""" + msgs = [ + {"role": "user", "content": "what's in this?"}, + {"role": "assistant", "content": "I'll check."}, + { + "role": "user", + "content": [{"type": "image_url", "image_url": {"url": "data:..."}}], + }, + ] + changed = _strip_images_from_messages(msgs) + assert changed is True + # Synthetic image-only user message dropped + assert len(msgs) == 2 + assert msgs[-1]["role"] == "assistant" + + def test_multiple_tool_messages_all_preserved(self): + """Parallel tool calls: each tool_call_id must retain a paired message.""" + msgs = [ + { + "role": "assistant", + "content": None, + "tool_calls": [ + {"id": "c1", "type": "function", "function": {"name": "x", "arguments": "{}"}}, + {"id": "c2", "type": "function", "function": {"name": "x", "arguments": "{}"}}, + ], + }, + { + "role": "tool", + "tool_call_id": "c1", + "content": [{"type": "image_url", "image_url": {}}], + }, + { + "role": "tool", + "tool_call_id": "c2", + "content": [{"type": "image_url", "image_url": {}}], + }, + ] + changed = _strip_images_from_messages(msgs) + assert changed is True + tool_msgs = [m for m in msgs if m.get("role") == "tool"] + assert len(tool_msgs) == 2 + assert {m["tool_call_id"] for m in tool_msgs} == {"c1", "c2"} + + def test_returns_false_when_nothing_changed(self): + msgs = [ + {"role": "user", "content": [{"type": "text", "text": "hi"}]}, + {"role": "assistant", "content": "hello"}, + ] + assert _strip_images_from_messages(msgs) is False + + def test_handles_non_dict_entries_gracefully(self): + msgs = [None, "not a dict", {"role": "user", "content": "ok"}] + # Must not raise + changed = _strip_images_from_messages(msgs) + assert changed is False + + +class TestImageRejectionPhraseIsolation: + """The image-rejection phrase list must NOT false-match on other + image-related error categories (size-too-large, format errors, etc.) + so they route to the correct recovery handler (e.g. _try_shrink_image_parts). + """ + + # Reproduces the phrase list used in run_agent.py's error-handler block. + _REJECTION_PHRASES = ( + "only 'text' content type is supported", + "only text content type is supported", + "image_url is not supported", + "image content is not supported", + "multimodal is not supported", + "multimodal content is not supported", + "multimodal input is not supported", + "vision is not supported", + "vision input is not supported", + "does not support images", + "does not support image input", + "does not support multimodal", + "does not support vision", + "model does not support image", + ) + + def _matches(self, body: str) -> bool: + low = body.lower() + return any(p in low for p in self._REJECTION_PHRASES) + + def test_anthropic_image_too_large_does_not_trip(self): + # From agent/error_classifier.py _IMAGE_TOO_LARGE_PATTERNS — + # these must route to image_too_large / _try_shrink_image_parts_in_messages, + # NOT to our vision-unsupported fallback. + bodies = [ + "messages.0.content.1.image.source.base64: image exceeds 5 MB maximum", + "image too large: 6291456 bytes > 5242880 limit", + "image_too_large", + "image size exceeds per-request limit", + ] + for body in bodies: + assert self._matches(body) is False, f"false positive on: {body}" + + def test_context_overflow_does_not_trip(self): + bodies = [ + "This model's maximum context length is 200000 tokens.", + "Request too large: max tokens per request is 200000", + "The input exceeds the context window.", + ] + for body in bodies: + assert self._matches(body) is False, f"false positive on: {body}" + + def test_rate_limit_does_not_trip(self): + bodies = [ + "rate limit reached for requests", + "You exceeded your current quota", + ] + for body in bodies: + assert self._matches(body) is False + + def test_real_image_rejection_bodies_trip(self): + """Positive cases — real-world error wordings that should trigger.""" + bodies = [ + "Only 'text' content type is supported.", + "Bad request: multimodal is not supported by this model", + "This model does not support images", + "vision is not supported on this endpoint", + "model does not support image input", + ] + for body in bodies: + assert self._matches(body) is True, f"false negative on: {body}" diff --git a/tests/tools/test_registry.py b/tests/tools/test_registry.py index 3c753f64..b8b631df 100644 --- a/tests/tools/test_registry.py +++ b/tests/tools/test_registry.py @@ -296,6 +296,7 @@ class TestBuiltinDiscovery: "tools.browser_tool", "tools.clarify_tool", "tools.code_execution_tool", + "tools.computer_use_tool", "tools.cronjob_tools", "tools.delegate_tool", "tools.discord_tool",