fix(computer-use): unwrap _multimodal tool results to content list for non-Anthropic providers

Tool handlers (e.g. computer_use capture) return a _multimodal envelope dict when a screenshot is attached. The tool-message builder was passing this raw dict as the `content` field of role:tool messages, which is an illegal format — OpenAI-compatible APIs expect a string or a content-parts list, not a plain Python dict, and would reject it with a 400/422 error. Fix: unwrap _multimodal results to their `content` list ([{type:text,...},{type:image_url,...}]) in both the parallel and sequential tool-call paths. The Anthropic adapter already handles content lists natively; vision-capable OpenAI-compatible servers (mlx-vlm, GPT-4o, etc.) accept image_url parts in tool messages directly. Also add a _vision_supported adaptive fallback: on first image-rejection error ("Only 'text' content type is supported." etc.) the agent strips all image parts from the message history and retries with text only, so text-only endpoints degrade gracefully without crashing the session.
2026-04-24 14:51:08 -07:00 · 2026-04-24 14:51:08 -07:00 · b4a8031b2e
commit b4a8031b2e
parent 413ee1a286
1 changed files with 99 additions and 3 deletions
--- a/run_agent.py
+++ b/run_agent.py
@ -806,6 +806,41 @@ def _sanitize_tools_non_ascii(tools: list) -> bool:
    return _sanitize_structure_non_ascii(tools)


+def _strip_images_from_messages(messages: list) -> bool:
+    """Remove image_url content parts from all messages in-place.
+
+    Called when a server signals it does not support images (e.g.
+    "Only 'text' content type is supported.").  Mutates messages so the
+    next API call sends text only.
+
+    Returns True if any image parts were removed.
+    """
+    found = False
+    to_delete = []
+    for i, msg in enumerate(messages):
+        if not isinstance(msg, dict):
+            continue
+        content = msg.get("content")
+        if not isinstance(content, list):
+            continue
+        new_parts = []
+        for part in content:
+            if isinstance(part, dict) and part.get("type") in ("image_url", "image", "input_image"):
+                found = True
+            else:
+                new_parts.append(part)
+        if len(new_parts) < len(content):
+            if new_parts:
+                msg["content"] = new_parts
+            else:
+                # Entire message was images — drop it (user messages added for
+                # image delivery only, e.g. the deferred injection messages).
+                to_delete.append(i)
+    for i in reversed(to_delete):
+        del messages[i]
+    return found
+
+
 def _sanitize_structure_non_ascii(payload: Any) -> bool:
    """Strip non-ASCII characters from nested dict/list payloads in-place."""
    found = False
@ -9161,9 +9196,22 @@ class AIAgent:
                else:
                    function_result += subdir_hints

+            # Unwrap _multimodal dicts to an OpenAI-style content list so any
+            # vision-capable provider receives [{type:text},{type:image_url}]
+            # rather than a raw Python dict.  The Anthropic adapter already
+            # accepts content lists; vision-capable OpenAI-compatible servers
+            # (mlx-vlm, GPT-4o, …) accept image_url in tool messages natively.
+            # Text-only servers that reject images are handled by the adaptive
+            # _vision_supported recovery in the API retry loop.
+            # String results pass through unchanged.
+            _tool_content = (
+                function_result["content"]
+                if _is_multimodal_tool_result(function_result)
+                else function_result
+            )
            tool_msg = {
                "role": "tool",
-                "content": function_result,
+                "content": _tool_content,
                "tool_call_id": tc.id,
            }
            messages.append(tool_msg)
@ -9535,9 +9583,16 @@ class AIAgent:
                else:
                    function_result += subdir_hints

+            # Unwrap _multimodal dicts to an OpenAI-style content list
+            # (see parallel path for rationale). String results pass through.
+            _tool_content = (
+                function_result["content"]
+                if _is_multimodal_tool_result(function_result)
+                else function_result
+            )
            tool_msg = {
                "role": "tool",
-                "content": function_result,
+                "content": _tool_content,
                "tool_call_id": tool_call.id
            }
            messages.append(tool_msg)
@ -9585,7 +9640,6 @@ class AIAgent:
            self._apply_pending_steer_to_tool_results(messages, num_tools_seq)


-
    def _handle_max_iterations(self, messages: list, api_call_count: int) -> str:
        """Request a summary when max iterations are reached. Returns the final response text."""
        print(f"⚠️  Reached maximum iterations ({self.max_iterations}). Requesting summary...")
@ -9825,6 +9879,11 @@ class AIAgent:
        self._last_content_tools_all_housekeeping = False
        self._mute_post_response = False
        self._unicode_sanitization_passes = 0
+        # True until the server rejects an image_url content part with an error
+        # like "Only 'text' content type is supported."  Set to False on first
+        # rejection and kept False for the rest of the session so we never re-send
+        # images to a text-only endpoint.  Scoped per `_run()` call, not per instance.
+        self._vision_supported = True

        # Pre-turn connection health check: detect and clean up dead TCP
        # connections left over from provider outages or dropped streams.
@ -11305,6 +11364,43 @@ class AIAgent:
                                )
                            continue

+                    # ── Image-rejection recovery ──────────────────────────────
+                    # Some providers (mlx-lm, text-only endpoints) reject any
+                    # message that contains image_url content with an error like
+                    # "Only 'text' content type is supported."  On first hit,
+                    # strip all images from the message list, mark the session
+                    # as vision-unsupported, and retry with text only.
+                    _err_body = ""
+                    try:
+                        _err_body = str(getattr(api_error, "body", None) or
+                                        getattr(api_error, "message", None) or
+                                        str(api_error))
+                    except Exception:
+                        pass
+                    _IMAGE_REJECTION_PHRASES = (
+                        "only 'text' content type is supported",
+                        "only text content type is supported",
+                        "image_url is not supported",
+                        "multimodal is not supported",
+                        "vision is not supported",
+                        "does not support images",
+                    )
+                    if (
+                        getattr(self, "_vision_supported", True)
+                        and any(p in _err_body.lower() for p in _IMAGE_REJECTION_PHRASES)
+                    ):
+                        self._vision_supported = False
+                        _imgs_removed = _strip_images_from_messages(messages)
+                        if isinstance(api_messages, list):
+                            _strip_images_from_messages(api_messages)
+                        self._vprint(
+                            f"{self.log_prefix}⚠️  Server rejected image content — "
+                            f"switching to text-only mode for this session"
+                            + (". Stripped images from history and retrying." if _imgs_removed else "."),
+                            force=True,
+                        )
+                        continue
+
                    status_code = getattr(api_error, "status_code", None)
                    error_context = self._extract_api_error_context(api_error)