fix(computer-use): unwrap _multimodal tool results to content list for non-Anthropic providers
Tool handlers (e.g. computer_use capture) return a _multimodal envelope
dict when a screenshot is attached. The tool-message builder was passing
this raw dict as the `content` field of role:tool messages, which is an
illegal format — OpenAI-compatible APIs expect a string or a content-parts
list, not a plain Python dict, and would reject it with a 400/422 error.
Fix: unwrap _multimodal results to their `content` list
([{type:text,...},{type:image_url,...}]) in both the parallel and
sequential tool-call paths. The Anthropic adapter already handles content
lists natively; vision-capable OpenAI-compatible servers (mlx-vlm,
GPT-4o, etc.) accept image_url parts in tool messages directly.
Also add a _vision_supported adaptive fallback: on first image-rejection
error ("Only 'text' content type is supported." etc.) the agent strips all
image parts from the message history and retries with text only, so
text-only endpoints degrade gracefully without crashing the session.
This commit is contained in:
parent
413ee1a286
commit
b4a8031b2e
102
run_agent.py
102
run_agent.py
@ -806,6 +806,41 @@ def _sanitize_tools_non_ascii(tools: list) -> bool:
|
||||
return _sanitize_structure_non_ascii(tools)
|
||||
|
||||
|
||||
def _strip_images_from_messages(messages: list) -> bool:
|
||||
"""Remove image_url content parts from all messages in-place.
|
||||
|
||||
Called when a server signals it does not support images (e.g.
|
||||
"Only 'text' content type is supported."). Mutates messages so the
|
||||
next API call sends text only.
|
||||
|
||||
Returns True if any image parts were removed.
|
||||
"""
|
||||
found = False
|
||||
to_delete = []
|
||||
for i, msg in enumerate(messages):
|
||||
if not isinstance(msg, dict):
|
||||
continue
|
||||
content = msg.get("content")
|
||||
if not isinstance(content, list):
|
||||
continue
|
||||
new_parts = []
|
||||
for part in content:
|
||||
if isinstance(part, dict) and part.get("type") in ("image_url", "image", "input_image"):
|
||||
found = True
|
||||
else:
|
||||
new_parts.append(part)
|
||||
if len(new_parts) < len(content):
|
||||
if new_parts:
|
||||
msg["content"] = new_parts
|
||||
else:
|
||||
# Entire message was images — drop it (user messages added for
|
||||
# image delivery only, e.g. the deferred injection messages).
|
||||
to_delete.append(i)
|
||||
for i in reversed(to_delete):
|
||||
del messages[i]
|
||||
return found
|
||||
|
||||
|
||||
def _sanitize_structure_non_ascii(payload: Any) -> bool:
|
||||
"""Strip non-ASCII characters from nested dict/list payloads in-place."""
|
||||
found = False
|
||||
@ -9161,9 +9196,22 @@ class AIAgent:
|
||||
else:
|
||||
function_result += subdir_hints
|
||||
|
||||
# Unwrap _multimodal dicts to an OpenAI-style content list so any
|
||||
# vision-capable provider receives [{type:text},{type:image_url}]
|
||||
# rather than a raw Python dict. The Anthropic adapter already
|
||||
# accepts content lists; vision-capable OpenAI-compatible servers
|
||||
# (mlx-vlm, GPT-4o, …) accept image_url in tool messages natively.
|
||||
# Text-only servers that reject images are handled by the adaptive
|
||||
# _vision_supported recovery in the API retry loop.
|
||||
# String results pass through unchanged.
|
||||
_tool_content = (
|
||||
function_result["content"]
|
||||
if _is_multimodal_tool_result(function_result)
|
||||
else function_result
|
||||
)
|
||||
tool_msg = {
|
||||
"role": "tool",
|
||||
"content": function_result,
|
||||
"content": _tool_content,
|
||||
"tool_call_id": tc.id,
|
||||
}
|
||||
messages.append(tool_msg)
|
||||
@ -9535,9 +9583,16 @@ class AIAgent:
|
||||
else:
|
||||
function_result += subdir_hints
|
||||
|
||||
# Unwrap _multimodal dicts to an OpenAI-style content list
|
||||
# (see parallel path for rationale). String results pass through.
|
||||
_tool_content = (
|
||||
function_result["content"]
|
||||
if _is_multimodal_tool_result(function_result)
|
||||
else function_result
|
||||
)
|
||||
tool_msg = {
|
||||
"role": "tool",
|
||||
"content": function_result,
|
||||
"content": _tool_content,
|
||||
"tool_call_id": tool_call.id
|
||||
}
|
||||
messages.append(tool_msg)
|
||||
@ -9585,7 +9640,6 @@ class AIAgent:
|
||||
self._apply_pending_steer_to_tool_results(messages, num_tools_seq)
|
||||
|
||||
|
||||
|
||||
def _handle_max_iterations(self, messages: list, api_call_count: int) -> str:
|
||||
"""Request a summary when max iterations are reached. Returns the final response text."""
|
||||
print(f"⚠️ Reached maximum iterations ({self.max_iterations}). Requesting summary...")
|
||||
@ -9825,6 +9879,11 @@ class AIAgent:
|
||||
self._last_content_tools_all_housekeeping = False
|
||||
self._mute_post_response = False
|
||||
self._unicode_sanitization_passes = 0
|
||||
# True until the server rejects an image_url content part with an error
|
||||
# like "Only 'text' content type is supported." Set to False on first
|
||||
# rejection and kept False for the rest of the session so we never re-send
|
||||
# images to a text-only endpoint. The flag is stored on the instance but is reset to True here, so it is effectively scoped per `_run()` call.
|
||||
self._vision_supported = True
|
||||
|
||||
# Pre-turn connection health check: detect and clean up dead TCP
|
||||
# connections left over from provider outages or dropped streams.
|
||||
@ -11305,6 +11364,43 @@ class AIAgent:
|
||||
)
|
||||
continue
|
||||
|
||||
# ── Image-rejection recovery ──────────────────────────────
|
||||
# Some providers (mlx-lm, text-only endpoints) reject any
|
||||
# message that contains image_url content with an error like
|
||||
# "Only 'text' content type is supported." On first hit,
|
||||
# strip all images from the message list, mark the session
|
||||
# as vision-unsupported, and retry with text only.
|
||||
_err_body = ""
|
||||
try:
|
||||
_err_body = str(getattr(api_error, "body", None) or
|
||||
getattr(api_error, "message", None) or
|
||||
str(api_error))
|
||||
except Exception:
|
||||
pass
|
||||
_IMAGE_REJECTION_PHRASES = (
|
||||
"only 'text' content type is supported",
|
||||
"only text content type is supported",
|
||||
"image_url is not supported",
|
||||
"multimodal is not supported",
|
||||
"vision is not supported",
|
||||
"does not support images",
|
||||
)
|
||||
if (
|
||||
getattr(self, "_vision_supported", True)
|
||||
and any(p in _err_body.lower() for p in _IMAGE_REJECTION_PHRASES)
|
||||
):
|
||||
self._vision_supported = False
|
||||
_imgs_removed = _strip_images_from_messages(messages)
|
||||
if isinstance(api_messages, list):
|
||||
_strip_images_from_messages(api_messages)
|
||||
self._vprint(
|
||||
f"{self.log_prefix}⚠️ Server rejected image content — "
|
||||
f"switching to text-only mode for this session"
|
||||
+ (". Stripped images from history and retrying." if _imgs_removed else "."),
|
||||
force=True,
|
||||
)
|
||||
continue
|
||||
|
||||
status_code = getattr(api_error, "status_code", None)
|
||||
error_context = self._extract_api_error_context(api_error)
|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user