fix(computer-use): harden image-rejection fallback + AUTHOR_MAP
Follow-up to #15328's vision-unsupported retry branch in run_agent.py. _strip_images_from_messages() previously deleted any message whose content was entirely images. That's fine for synthetic user messages injected for attachment delivery, but it breaks providers for tool-role messages — the paired tool_call_id on the preceding assistant message ends up unmatched, which OpenAI-compatible APIs reject with HTTP 400. Fix: tool-role messages whose content becomes empty are replaced with a plaintext placeholder that preserves the tool_call_id linkage. Only non-tool messages are dropped. Added 10 tests covering the role-alternation invariants + image-type coverage. Image-rejection detector: expanded phrase list (image content not supported / multimodal input / vision input / model does not support image) and gated on 4xx status so transient 5xx errors never get misinterpreted as 'server said no to images'. Detection is documented as best-effort English phrase matching. AUTHOR_MAP: mapped 3820588+ddupont808@users.noreply.github.com to ddupont808 so release notes attribute the salvage correctly.
This commit is contained in:
parent
b4a8031b2e
commit
afb9588298
48
run_agent.py
48
run_agent.py
@ -813,6 +813,15 @@ def _strip_images_from_messages(messages: list) -> bool:
|
|||||||
"Only 'text' content type is supported."). Mutates messages so the
|
"Only 'text' content type is supported."). Mutates messages so the
|
||||||
next API call sends text only.
|
next API call sends text only.
|
||||||
|
|
||||||
|
Preserves message alternation invariants:
|
||||||
|
* ``tool``-role messages whose content was entirely images are replaced
|
||||||
|
with a plaintext placeholder, NOT deleted — deleting them would leave
|
||||||
|
the paired ``tool_call_id`` on the prior assistant message unmatched,
|
||||||
|
which providers reject with HTTP 400.
|
||||||
|
* Non-tool messages whose content becomes empty are dropped. In
|
||||||
|
practice this only hits synthetic image-only user messages appended
|
||||||
|
for attachment delivery; real user turns always include text.
|
||||||
|
|
||||||
Returns True if any image parts were removed.
|
Returns True if any image parts were removed.
|
||||||
"""
|
"""
|
||||||
found = False
|
found = False
|
||||||
@ -832,9 +841,13 @@ def _strip_images_from_messages(messages: list) -> bool:
|
|||||||
if len(new_parts) < len(content):
|
if len(new_parts) < len(content):
|
||||||
if new_parts:
|
if new_parts:
|
||||||
msg["content"] = new_parts
|
msg["content"] = new_parts
|
||||||
|
elif msg.get("role") == "tool":
|
||||||
|
# Preserve tool_call_id linkage — providers require every
|
||||||
|
# assistant tool_call to have a matching tool response.
|
||||||
|
msg["content"] = "[image content removed — server does not support images]"
|
||||||
else:
|
else:
|
||||||
# Entire message was images — drop it (user messages added for
|
# Synthetic image-only user/assistant message with no text;
|
||||||
# image delivery only, e.g. the deferred injection messages).
|
# safe to drop.
|
||||||
to_delete.append(i)
|
to_delete.append(i)
|
||||||
for i in reversed(to_delete):
|
for i in reversed(to_delete):
|
||||||
del messages[i]
|
del messages[i]
|
||||||
@ -11365,11 +11378,18 @@ class AIAgent:
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
# ── Image-rejection recovery ──────────────────────────────
|
# ── Image-rejection recovery ──────────────────────────────
|
||||||
# Some providers (mlx-lm, text-only endpoints) reject any
|
# Some providers (mlx-lm, text-only endpoints, text-only
|
||||||
# message that contains image_url content with an error like
|
# fallbacks on multimodal models) reject any message that
|
||||||
|
# contains image_url content with a 4xx error like
|
||||||
# "Only 'text' content type is supported." On first hit,
|
# "Only 'text' content type is supported." On first hit,
|
||||||
# strip all images from the message list, mark the session
|
# strip all images from the message list, mark the session
|
||||||
# as vision-unsupported, and retry with text only.
|
# as vision-unsupported, and retry with text only.
|
||||||
|
#
|
||||||
|
# Detection is best-effort English phrase matching — a
|
||||||
|
# locale-translated or heavily-reworded upstream error
|
||||||
|
# will bypass this guard and fall through to the normal
|
||||||
|
# error handler. Expand the phrase list when new
|
||||||
|
# provider wordings are observed in the wild.
|
||||||
_err_body = ""
|
_err_body = ""
|
||||||
try:
|
try:
|
||||||
_err_body = str(getattr(api_error, "body", None) or
|
_err_body = str(getattr(api_error, "body", None) or
|
||||||
@ -11377,17 +11397,35 @@ class AIAgent:
|
|||||||
str(api_error))
|
str(api_error))
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
_err_status = getattr(api_error, "status_code", None)
|
||||||
_IMAGE_REJECTION_PHRASES = (
|
_IMAGE_REJECTION_PHRASES = (
|
||||||
"only 'text' content type is supported",
|
"only 'text' content type is supported",
|
||||||
"only text content type is supported",
|
"only text content type is supported",
|
||||||
"image_url is not supported",
|
"image_url is not supported",
|
||||||
|
"image content is not supported",
|
||||||
"multimodal is not supported",
|
"multimodal is not supported",
|
||||||
|
"multimodal content is not supported",
|
||||||
|
"multimodal input is not supported",
|
||||||
"vision is not supported",
|
"vision is not supported",
|
||||||
|
"vision input is not supported",
|
||||||
"does not support images",
|
"does not support images",
|
||||||
|
"does not support image input",
|
||||||
|
"does not support multimodal",
|
||||||
|
"does not support vision",
|
||||||
|
"model does not support image",
|
||||||
)
|
)
|
||||||
|
_err_lower = _err_body.lower()
|
||||||
|
_looks_like_image_rejection = any(
|
||||||
|
p in _err_lower for p in _IMAGE_REJECTION_PHRASES
|
||||||
|
)
|
||||||
|
# 4xx-only gate: never interpret 5xx/timeout as "server
|
||||||
|
# said no to images" — those are transient and must
|
||||||
|
# route to the normal retry path.
|
||||||
|
_status_ok = _err_status is None or (400 <= int(_err_status) < 500)
|
||||||
if (
|
if (
|
||||||
getattr(self, "_vision_supported", True)
|
getattr(self, "_vision_supported", True)
|
||||||
and any(p in _err_body.lower() for p in _IMAGE_REJECTION_PHRASES)
|
and _looks_like_image_rejection
|
||||||
|
and _status_ok
|
||||||
):
|
):
|
||||||
self._vision_supported = False
|
self._vision_supported = False
|
||||||
_imgs_removed = _strip_images_from_messages(messages)
|
_imgs_removed = _strip_images_from_messages(messages)
|
||||||
|
|||||||
@ -123,6 +123,7 @@ AUTHOR_MAP = {
|
|||||||
"104278804+Sertug17@users.noreply.github.com": "Sertug17",
|
"104278804+Sertug17@users.noreply.github.com": "Sertug17",
|
||||||
"112503481+caentzminger@users.noreply.github.com": "caentzminger",
|
"112503481+caentzminger@users.noreply.github.com": "caentzminger",
|
||||||
"258577966+voidborne-d@users.noreply.github.com": "voidborne-d",
|
"258577966+voidborne-d@users.noreply.github.com": "voidborne-d",
|
||||||
|
"3820588+ddupont808@users.noreply.github.com": "ddupont808",
|
||||||
"liusway405@gmail.com": "voidborne-d",
|
"liusway405@gmail.com": "voidborne-d",
|
||||||
"xydarcher@uestc.edu.cn": "Readon",
|
"xydarcher@uestc.edu.cn": "Readon",
|
||||||
"sir_even@icloud.com": "sirEven",
|
"sir_even@icloud.com": "sirEven",
|
||||||
|
|||||||
243
tests/run_agent/test_image_rejection_fallback.py
Normal file
243
tests/run_agent/test_image_rejection_fallback.py
Normal file
@ -0,0 +1,243 @@
|
|||||||
|
"""Tests for the image-rejection fallback in run_agent.
|
||||||
|
|
||||||
|
When a server rejects image content (e.g. text-only endpoints), the agent
|
||||||
|
strips image parts from message history and retries text-only. These tests
|
||||||
|
verify that stripping preserves the role-alternation invariants providers
|
||||||
|
require, and that the phrase detector fires on the expected error bodies.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from run_agent import _strip_images_from_messages
|
||||||
|
|
||||||
|
|
||||||
|
class TestStripImagesPreservesAlternation:
|
||||||
|
"""_strip_images_from_messages must not break message role alternation."""
|
||||||
|
|
||||||
|
def test_noop_when_no_images(self):
|
||||||
|
msgs = [
|
||||||
|
{"role": "user", "content": "hello"},
|
||||||
|
{"role": "assistant", "content": "hi"},
|
||||||
|
]
|
||||||
|
changed = _strip_images_from_messages(msgs)
|
||||||
|
assert changed is False
|
||||||
|
assert msgs == [
|
||||||
|
{"role": "user", "content": "hello"},
|
||||||
|
{"role": "assistant", "content": "hi"},
|
||||||
|
]
|
||||||
|
|
||||||
|
def test_string_content_untouched(self):
|
||||||
|
"""String content passes through — only list content is inspected."""
|
||||||
|
msgs = [{"role": "user", "content": "just text"}]
|
||||||
|
changed = _strip_images_from_messages(msgs)
|
||||||
|
assert changed is False
|
||||||
|
assert msgs[0]["content"] == "just text"
|
||||||
|
|
||||||
|
def test_strips_image_url_part_preserves_text(self):
|
||||||
|
msgs = [{
|
||||||
|
"role": "user",
|
||||||
|
"content": [
|
||||||
|
{"type": "text", "text": "describe"},
|
||||||
|
{"type": "image_url", "image_url": {"url": "data:image/png;base64,abc"}},
|
||||||
|
],
|
||||||
|
}]
|
||||||
|
changed = _strip_images_from_messages(msgs)
|
||||||
|
assert changed is True
|
||||||
|
assert msgs[0]["content"] == [{"type": "text", "text": "describe"}]
|
||||||
|
|
||||||
|
def test_strips_all_recognized_image_types(self):
|
||||||
|
msgs = [{
|
||||||
|
"role": "user",
|
||||||
|
"content": [
|
||||||
|
{"type": "text", "text": "hi"},
|
||||||
|
{"type": "image_url", "image_url": {}},
|
||||||
|
{"type": "image", "source": {}},
|
||||||
|
{"type": "input_image", "image_url": "http://x"},
|
||||||
|
],
|
||||||
|
}]
|
||||||
|
changed = _strip_images_from_messages(msgs)
|
||||||
|
assert changed is True
|
||||||
|
assert msgs[0]["content"] == [{"type": "text", "text": "hi"}]
|
||||||
|
|
||||||
|
def test_tool_message_with_all_images_replaced_not_deleted(self):
|
||||||
|
"""CRITICAL: tool messages must NEVER be deleted — their tool_call_id
|
||||||
|
pairs with an assistant tool_call and providers reject unmatched IDs.
|
||||||
|
"""
|
||||||
|
msgs = [
|
||||||
|
{"role": "user", "content": "take a screenshot"},
|
||||||
|
{
|
||||||
|
"role": "assistant",
|
||||||
|
"content": None,
|
||||||
|
"tool_calls": [{
|
||||||
|
"id": "call_abc",
|
||||||
|
"type": "function",
|
||||||
|
"function": {"name": "computer_use", "arguments": "{}"},
|
||||||
|
}],
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"role": "tool",
|
||||||
|
"tool_call_id": "call_abc",
|
||||||
|
"content": [
|
||||||
|
{"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
]
|
||||||
|
changed = _strip_images_from_messages(msgs)
|
||||||
|
assert changed is True
|
||||||
|
# Length preserved — tool message NOT deleted
|
||||||
|
assert len(msgs) == 3
|
||||||
|
# tool_call_id still present
|
||||||
|
assert msgs[2]["tool_call_id"] == "call_abc"
|
||||||
|
# Content replaced with text placeholder (now a string, not a list)
|
||||||
|
assert isinstance(msgs[2]["content"], str)
|
||||||
|
assert "image content removed" in msgs[2]["content"].lower()
|
||||||
|
|
||||||
|
def test_tool_message_with_mixed_content_keeps_text_parts(self):
|
||||||
|
msgs = [
|
||||||
|
{"role": "user", "content": "screenshot plz"},
|
||||||
|
{
|
||||||
|
"role": "assistant",
|
||||||
|
"content": None,
|
||||||
|
"tool_calls": [{"id": "call_1", "type": "function", "function": {"name": "x", "arguments": "{}"}}],
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"role": "tool",
|
||||||
|
"tool_call_id": "call_1",
|
||||||
|
"content": [
|
||||||
|
{"type": "text", "text": "Captured 1024x768"},
|
||||||
|
{"type": "image_url", "image_url": {"url": "data:..."}},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
]
|
||||||
|
changed = _strip_images_from_messages(msgs)
|
||||||
|
assert changed is True
|
||||||
|
assert len(msgs) == 3
|
||||||
|
assert msgs[2]["content"] == [{"type": "text", "text": "Captured 1024x768"}]
|
||||||
|
assert msgs[2]["tool_call_id"] == "call_1"
|
||||||
|
|
||||||
|
def test_image_only_user_message_dropped(self):
|
||||||
|
"""Synthetic image-only user messages (gateway injection pattern) are
|
||||||
|
safe to drop — no tool_call_id linkage to preserve."""
|
||||||
|
msgs = [
|
||||||
|
{"role": "user", "content": "what's in this?"},
|
||||||
|
{"role": "assistant", "content": "I'll check."},
|
||||||
|
{
|
||||||
|
"role": "user",
|
||||||
|
"content": [{"type": "image_url", "image_url": {"url": "data:..."}}],
|
||||||
|
},
|
||||||
|
]
|
||||||
|
changed = _strip_images_from_messages(msgs)
|
||||||
|
assert changed is True
|
||||||
|
# Synthetic image-only user message dropped
|
||||||
|
assert len(msgs) == 2
|
||||||
|
assert msgs[-1]["role"] == "assistant"
|
||||||
|
|
||||||
|
def test_multiple_tool_messages_all_preserved(self):
|
||||||
|
"""Parallel tool calls: each tool_call_id must retain a paired message."""
|
||||||
|
msgs = [
|
||||||
|
{
|
||||||
|
"role": "assistant",
|
||||||
|
"content": None,
|
||||||
|
"tool_calls": [
|
||||||
|
{"id": "c1", "type": "function", "function": {"name": "x", "arguments": "{}"}},
|
||||||
|
{"id": "c2", "type": "function", "function": {"name": "x", "arguments": "{}"}},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"role": "tool",
|
||||||
|
"tool_call_id": "c1",
|
||||||
|
"content": [{"type": "image_url", "image_url": {}}],
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"role": "tool",
|
||||||
|
"tool_call_id": "c2",
|
||||||
|
"content": [{"type": "image_url", "image_url": {}}],
|
||||||
|
},
|
||||||
|
]
|
||||||
|
changed = _strip_images_from_messages(msgs)
|
||||||
|
assert changed is True
|
||||||
|
tool_msgs = [m for m in msgs if m.get("role") == "tool"]
|
||||||
|
assert len(tool_msgs) == 2
|
||||||
|
assert {m["tool_call_id"] for m in tool_msgs} == {"c1", "c2"}
|
||||||
|
|
||||||
|
def test_returns_false_when_nothing_changed(self):
|
||||||
|
msgs = [
|
||||||
|
{"role": "user", "content": [{"type": "text", "text": "hi"}]},
|
||||||
|
{"role": "assistant", "content": "hello"},
|
||||||
|
]
|
||||||
|
assert _strip_images_from_messages(msgs) is False
|
||||||
|
|
||||||
|
def test_handles_non_dict_entries_gracefully(self):
|
||||||
|
msgs = [None, "not a dict", {"role": "user", "content": "ok"}]
|
||||||
|
# Must not raise
|
||||||
|
changed = _strip_images_from_messages(msgs)
|
||||||
|
assert changed is False
|
||||||
|
|
||||||
|
|
||||||
|
class TestImageRejectionPhraseIsolation:
|
||||||
|
"""The image-rejection phrase list must NOT false-match on other
|
||||||
|
image-related error categories (size-too-large, format errors, etc.)
|
||||||
|
so they route to the correct recovery handler (e.g. _try_shrink_image_parts).
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Reproduces the phrase list used in run_agent.py's error-handler block.
|
||||||
|
_REJECTION_PHRASES = (
|
||||||
|
"only 'text' content type is supported",
|
||||||
|
"only text content type is supported",
|
||||||
|
"image_url is not supported",
|
||||||
|
"image content is not supported",
|
||||||
|
"multimodal is not supported",
|
||||||
|
"multimodal content is not supported",
|
||||||
|
"multimodal input is not supported",
|
||||||
|
"vision is not supported",
|
||||||
|
"vision input is not supported",
|
||||||
|
"does not support images",
|
||||||
|
"does not support image input",
|
||||||
|
"does not support multimodal",
|
||||||
|
"does not support vision",
|
||||||
|
"model does not support image",
|
||||||
|
)
|
||||||
|
|
||||||
|
def _matches(self, body: str) -> bool:
|
||||||
|
low = body.lower()
|
||||||
|
return any(p in low for p in self._REJECTION_PHRASES)
|
||||||
|
|
||||||
|
def test_anthropic_image_too_large_does_not_trip(self):
|
||||||
|
# From agent/error_classifier.py _IMAGE_TOO_LARGE_PATTERNS —
|
||||||
|
# these must route to image_too_large / _try_shrink_image_parts_in_messages,
|
||||||
|
# NOT to our vision-unsupported fallback.
|
||||||
|
bodies = [
|
||||||
|
"messages.0.content.1.image.source.base64: image exceeds 5 MB maximum",
|
||||||
|
"image too large: 6291456 bytes > 5242880 limit",
|
||||||
|
"image_too_large",
|
||||||
|
"image size exceeds per-request limit",
|
||||||
|
]
|
||||||
|
for body in bodies:
|
||||||
|
assert self._matches(body) is False, f"false positive on: {body}"
|
||||||
|
|
||||||
|
def test_context_overflow_does_not_trip(self):
|
||||||
|
bodies = [
|
||||||
|
"This model's maximum context length is 200000 tokens.",
|
||||||
|
"Request too large: max tokens per request is 200000",
|
||||||
|
"The input exceeds the context window.",
|
||||||
|
]
|
||||||
|
for body in bodies:
|
||||||
|
assert self._matches(body) is False, f"false positive on: {body}"
|
||||||
|
|
||||||
|
def test_rate_limit_does_not_trip(self):
|
||||||
|
bodies = [
|
||||||
|
"rate limit reached for requests",
|
||||||
|
"You exceeded your current quota",
|
||||||
|
]
|
||||||
|
for body in bodies:
|
||||||
|
assert self._matches(body) is False
|
||||||
|
|
||||||
|
def test_real_image_rejection_bodies_trip(self):
|
||||||
|
"""Positive cases — real-world error wordings that should trigger."""
|
||||||
|
bodies = [
|
||||||
|
"Only 'text' content type is supported.",
|
||||||
|
"Bad request: multimodal is not supported by this model",
|
||||||
|
"This model does not support images",
|
||||||
|
"vision is not supported on this endpoint",
|
||||||
|
"model does not support image input",
|
||||||
|
]
|
||||||
|
for body in bodies:
|
||||||
|
assert self._matches(body) is True, f"false negative on: {body}"
|
||||||
@ -296,6 +296,7 @@ class TestBuiltinDiscovery:
|
|||||||
"tools.browser_tool",
|
"tools.browser_tool",
|
||||||
"tools.clarify_tool",
|
"tools.clarify_tool",
|
||||||
"tools.code_execution_tool",
|
"tools.code_execution_tool",
|
||||||
|
"tools.computer_use_tool",
|
||||||
"tools.cronjob_tools",
|
"tools.cronjob_tools",
|
||||||
"tools.delegate_tool",
|
"tools.delegate_tool",
|
||||||
"tools.discord_tool",
|
"tools.discord_tool",
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user