fix(computer-use): unwrap _multimodal tool results to content list for non-Anthropic providers

Tool handlers (e.g. computer_use capture) return a _multimodal envelope
dict when a screenshot is attached. The tool-message builder was passing
this raw dict as the `content` field of role:tool messages, which is an
illegal format — OpenAI-compatible APIs expect a string or a content-parts
list, not a plain Python dict, and would reject it with a 400/422 error.

Fix: unwrap _multimodal results to their `content` list
([{type:text,...},{type:image_url,...}]) in both the parallel and
sequential tool-call paths. The Anthropic adapter already handles content
lists natively; vision-capable OpenAI-compatible servers (mlx-vlm,
GPT-4o, etc.) accept image_url parts in tool messages directly.

Also add a _vision_supported adaptive fallback: on first image-rejection
error ("Only 'text' content type is supported." etc.) the agent strips all
image parts from the message history and retries with text only, so
text-only endpoints degrade gracefully without crashing the session.
This commit is contained in:
ddupont 2026-04-24 14:51:08 -07:00 committed by Teknium
parent 413ee1a286
commit b4a8031b2e

View File

@ -806,6 +806,41 @@ def _sanitize_tools_non_ascii(tools: list) -> bool:
return _sanitize_structure_non_ascii(tools)
def _strip_images_from_messages(messages: list) -> bool:
"""Remove image_url content parts from all messages in-place.
Called when a server signals it does not support images (e.g.
"Only 'text' content type is supported."). Mutates messages so the
next API call sends text only.
Returns True if any image parts were removed.
"""
found = False
to_delete = []
for i, msg in enumerate(messages):
if not isinstance(msg, dict):
continue
content = msg.get("content")
if not isinstance(content, list):
continue
new_parts = []
for part in content:
if isinstance(part, dict) and part.get("type") in ("image_url", "image", "input_image"):
found = True
else:
new_parts.append(part)
if len(new_parts) < len(content):
if new_parts:
msg["content"] = new_parts
else:
# Entire message was images — drop it (user messages added for
# image delivery only, e.g. the deferred injection messages).
to_delete.append(i)
for i in reversed(to_delete):
del messages[i]
return found
def _sanitize_structure_non_ascii(payload: Any) -> bool:
"""Strip non-ASCII characters from nested dict/list payloads in-place."""
found = False
@ -9161,9 +9196,22 @@ class AIAgent:
else:
function_result += subdir_hints
# Unwrap _multimodal dicts to an OpenAI-style content list so any
# vision-capable provider receives [{type:text},{type:image_url}]
# rather than a raw Python dict. The Anthropic adapter already
# accepts content lists; vision-capable OpenAI-compatible servers
# (mlx-vlm, GPT-4o, …) accept image_url in tool messages natively.
# Text-only servers that reject images are handled by the adaptive
# _vision_supported recovery in the API retry loop.
# String results pass through unchanged.
_tool_content = (
function_result["content"]
if _is_multimodal_tool_result(function_result)
else function_result
)
tool_msg = {
"role": "tool",
"content": function_result,
"content": _tool_content,
"tool_call_id": tc.id,
}
messages.append(tool_msg)
@ -9535,9 +9583,16 @@ class AIAgent:
else:
function_result += subdir_hints
# Unwrap _multimodal dicts to an OpenAI-style content list
# (see parallel path for rationale). String results pass through.
_tool_content = (
function_result["content"]
if _is_multimodal_tool_result(function_result)
else function_result
)
tool_msg = {
"role": "tool",
"content": function_result,
"content": _tool_content,
"tool_call_id": tool_call.id
}
messages.append(tool_msg)
@ -9585,7 +9640,6 @@ class AIAgent:
self._apply_pending_steer_to_tool_results(messages, num_tools_seq)
def _handle_max_iterations(self, messages: list, api_call_count: int) -> str:
"""Request a summary when max iterations are reached. Returns the final response text."""
print(f"⚠️ Reached maximum iterations ({self.max_iterations}). Requesting summary...")
@ -9825,6 +9879,11 @@ class AIAgent:
self._last_content_tools_all_housekeeping = False
self._mute_post_response = False
self._unicode_sanitization_passes = 0
# True until the server rejects an image content part with an error like
# "Only 'text' content type is supported." Set to False on the first
# rejection and kept False for the remainder of the current `_run()` call so
# we never re-send images to a text-only endpoint during that run. The flag
# is re-initialised here on every `_run()` call; it is not a permanent
# per-instance setting.
self._vision_supported = True
# Pre-turn connection health check: detect and clean up dead TCP
# connections left over from provider outages or dropped streams.
@ -11305,6 +11364,43 @@ class AIAgent:
)
continue
# ── Image-rejection recovery ──────────────────────────────
# Some providers (mlx-lm, text-only endpoints) reject any
# message that contains image_url content with an error like
# "Only 'text' content type is supported." On first hit,
# strip all images from the message list, mark the session
# as vision-unsupported, and retry with text only.
_err_body = ""
try:
_err_body = str(getattr(api_error, "body", None) or
getattr(api_error, "message", None) or
str(api_error))
except Exception:
pass
_IMAGE_REJECTION_PHRASES = (
"only 'text' content type is supported",
"only text content type is supported",
"image_url is not supported",
"multimodal is not supported",
"vision is not supported",
"does not support images",
)
if (
getattr(self, "_vision_supported", True)
and any(p in _err_body.lower() for p in _IMAGE_REJECTION_PHRASES)
):
self._vision_supported = False
_imgs_removed = _strip_images_from_messages(messages)
if isinstance(api_messages, list):
_strip_images_from_messages(api_messages)
self._vprint(
f"{self.log_prefix}⚠️ Server rejected image content — "
f"switching to text-only mode for this session"
+ (". Stripped images from history and retrying." if _imgs_removed else "."),
force=True,
)
continue
status_code = getattr(api_error, "status_code", None)
error_context = self._extract_api_error_context(api_error)