fix(surrogates): sanitize reasoning/reasoning_content/reasoning_details fields (#11628)
Byte-level reasoning models (xiaomi/mimo-v2-pro, kimi, glm) can emit lone surrogates in reasoning output. The proactive sanitizer walked content/ name/tool_calls but not extra fields like reasoning or the nested reasoning_details array. Surrogates in those fields survived the proactive pass, crashed json.dumps() in the OpenAI SDK, and the recovery block's _sanitize_messages_surrogates(messages) call also didn't check those fields — so 'found' was False, no retry happened, and after 3 attempts the user saw: API call failed after 3 retries. 'utf-8' codec can't encode characters in position N-M: surrogates not allowed Changes: - _sanitize_messages_surrogates: walk any extra string fields (reasoning, reasoning_content, etc.) and recurse into nested dict/list values (reasoning_details). Mirrors _sanitize_messages_non_ascii coverage added in PR #10537. - _sanitize_structure_surrogates: new recursive walker, mirror of _sanitize_structure_non_ascii but for surrogate recovery. - UnicodeEncodeError recovery block: also sanitize api_messages, api_kwargs, and prefill_messages (not just the canonical messages list — the API-copy carries reasoning_content transformed from reasoning and that's what the SDK actually serializes). Always retry on detected surrogate errors, not only when we found something to strip — gate on error type per PR #10537's pattern. Tests: extended tests/cli/test_surrogate_sanitization.py with coverage for reasoning, reasoning_content, reasoning_details (flat and deeply nested), structure walker, and an integration case that reproduces the exact api_messages shape that was crashing.
This commit is contained in:
parent
1229d8855c
commit
2ff1ef6ae6
105
run_agent.py
105
run_agent.py
@ -353,12 +353,50 @@ def _sanitize_surrogates(text: str) -> str:
|
||||
return text
|
||||
|
||||
|
||||
def _sanitize_structure_surrogates(payload: Any) -> bool:
|
||||
"""Replace surrogate code points in nested dict/list payloads in-place.
|
||||
|
||||
Mirror of ``_sanitize_structure_non_ascii`` but for surrogate recovery.
|
||||
Used to scrub nested structured fields (e.g. ``reasoning_details`` — an
|
||||
array of dicts with ``summary``/``text`` strings) that flat per-field
|
||||
checks don't reach. Returns True if any surrogates were replaced.
|
||||
"""
|
||||
found = False
|
||||
|
||||
def _walk(node):
|
||||
nonlocal found
|
||||
if isinstance(node, dict):
|
||||
for key, value in node.items():
|
||||
if isinstance(value, str):
|
||||
if _SURROGATE_RE.search(value):
|
||||
node[key] = _SURROGATE_RE.sub('\ufffd', value)
|
||||
found = True
|
||||
elif isinstance(value, (dict, list)):
|
||||
_walk(value)
|
||||
elif isinstance(node, list):
|
||||
for idx, value in enumerate(node):
|
||||
if isinstance(value, str):
|
||||
if _SURROGATE_RE.search(value):
|
||||
node[idx] = _SURROGATE_RE.sub('\ufffd', value)
|
||||
found = True
|
||||
elif isinstance(value, (dict, list)):
|
||||
_walk(value)
|
||||
|
||||
_walk(payload)
|
||||
return found
|
||||
|
||||
|
||||
def _sanitize_messages_surrogates(messages: list) -> bool:
|
||||
"""Sanitize surrogate characters from all string content in a messages list.
|
||||
|
||||
Walks message dicts in-place. Returns True if any surrogates were found
|
||||
and replaced, False otherwise. Covers content/text, name, and tool call
|
||||
metadata/arguments so retries don't fail on a non-content field.
|
||||
and replaced, False otherwise. Covers content/text, name, tool call
|
||||
metadata/arguments, AND any additional string or nested structured fields
|
||||
(``reasoning``, ``reasoning_content``, ``reasoning_details``, etc.) so
|
||||
retries don't fail on a non-content field. Byte-level reasoning models
|
||||
(xiaomi/mimo, kimi, glm) can emit lone surrogates in reasoning output
|
||||
that flow through to ``api_messages["reasoning_content"]`` on the next
|
||||
turn and crash json.dumps inside the OpenAI SDK.
|
||||
"""
|
||||
found = False
|
||||
for msg in messages:
|
||||
@ -398,6 +436,21 @@ def _sanitize_messages_surrogates(messages: list) -> bool:
|
||||
if isinstance(fn_args, str) and _SURROGATE_RE.search(fn_args):
|
||||
fn["arguments"] = _SURROGATE_RE.sub('\ufffd', fn_args)
|
||||
found = True
|
||||
# Walk any additional string / nested fields (reasoning,
|
||||
# reasoning_content, reasoning_details, etc.) — surrogates from
|
||||
# byte-level reasoning models (xiaomi/mimo, kimi, glm) can lurk
|
||||
# in these fields and aren't covered by the per-field checks above.
|
||||
# Matches _sanitize_messages_non_ascii's coverage (PR #10537).
|
||||
for key, value in msg.items():
|
||||
if key in {"content", "name", "tool_calls", "role"}:
|
||||
continue
|
||||
if isinstance(value, str):
|
||||
if _SURROGATE_RE.search(value):
|
||||
msg[key] = _SURROGATE_RE.sub('\ufffd', value)
|
||||
found = True
|
||||
elif isinstance(value, (dict, list)):
|
||||
if _sanitize_structure_surrogates(value):
|
||||
found = True
|
||||
return found
|
||||
|
||||
|
||||
@ -9570,13 +9623,51 @@ class AIAgent:
|
||||
if isinstance(api_error, UnicodeEncodeError) and getattr(self, '_unicode_sanitization_passes', 0) < 2:
|
||||
_err_str = str(api_error).lower()
|
||||
_is_ascii_codec = "'ascii'" in _err_str or "ascii" in _err_str
|
||||
# Detect surrogate errors — utf-8 codec refusing to
|
||||
# encode U+D800..U+DFFF. The error text is:
|
||||
# "'utf-8' codec can't encode characters in position
|
||||
# N-M: surrogates not allowed"
|
||||
_is_surrogate_error = (
|
||||
"surrogate" in _err_str
|
||||
or ("'utf-8'" in _err_str and not _is_ascii_codec)
|
||||
)
|
||||
# Sanitize surrogates from both the canonical `messages`
|
||||
# list AND `api_messages` (the API-copy, which may carry
|
||||
# `reasoning_content`/`reasoning_details` transformed
|
||||
# from `reasoning` — fields the canonical list doesn't
|
||||
# have directly). Also clean `api_kwargs` if built and
|
||||
# `prefill_messages` if present. Mirrors the ASCII
|
||||
# codec recovery below.
|
||||
_surrogates_found = _sanitize_messages_surrogates(messages)
|
||||
if _surrogates_found:
|
||||
if isinstance(api_messages, list):
|
||||
if _sanitize_messages_surrogates(api_messages):
|
||||
_surrogates_found = True
|
||||
if isinstance(api_kwargs, dict):
|
||||
if _sanitize_structure_surrogates(api_kwargs):
|
||||
_surrogates_found = True
|
||||
if isinstance(getattr(self, "prefill_messages", None), list):
|
||||
if _sanitize_messages_surrogates(self.prefill_messages):
|
||||
_surrogates_found = True
|
||||
# Gate the retry on the error type, not on whether we
|
||||
# found anything — _force_ascii_payload / the extended
|
||||
# surrogate walker above cover all known paths, but a
|
||||
# new transformed field could still slip through. If
|
||||
# the error was a surrogate encode failure, always let
|
||||
# the retry run; the proactive sanitizer at line ~8781
|
||||
# runs again on the next iteration. Bounded by
|
||||
# _unicode_sanitization_passes < 2 (outer guard).
|
||||
if _surrogates_found or _is_surrogate_error:
|
||||
self._unicode_sanitization_passes += 1
|
||||
self._vprint(
|
||||
f"{self.log_prefix}⚠️ Stripped invalid surrogate characters from messages. Retrying...",
|
||||
force=True,
|
||||
)
|
||||
if _surrogates_found:
|
||||
self._vprint(
|
||||
f"{self.log_prefix}⚠️ Stripped invalid surrogate characters from messages. Retrying...",
|
||||
force=True,
|
||||
)
|
||||
else:
|
||||
self._vprint(
|
||||
f"{self.log_prefix}⚠️ Surrogate encoding error — retrying after full-payload sanitization...",
|
||||
force=True,
|
||||
)
|
||||
continue
|
||||
if _is_ascii_codec:
|
||||
self._force_ascii_payload = True
|
||||
|
||||
@ -2,7 +2,8 @@
|
||||
|
||||
Surrogates (U+D800..U+DFFF) are invalid in UTF-8 and crash json.dumps()
|
||||
inside the OpenAI SDK. They can appear via clipboard paste from rich-text
|
||||
editors like Google Docs.
|
||||
editors like Google Docs, OR from byte-level reasoning models (xiaomi/mimo,
|
||||
kimi, glm) emitting lone halves in reasoning output.
|
||||
"""
|
||||
import json
|
||||
import pytest
|
||||
@ -11,6 +12,7 @@ from unittest.mock import MagicMock, patch
|
||||
from run_agent import (
|
||||
_sanitize_surrogates,
|
||||
_sanitize_messages_surrogates,
|
||||
_sanitize_structure_surrogates,
|
||||
_SURROGATE_RE,
|
||||
)
|
||||
|
||||
@ -109,6 +111,186 @@ class TestSanitizeMessagesSurrogates:
|
||||
assert "\ufffd" in msgs[0]["content"]
|
||||
|
||||
|
||||
class TestReasoningFieldSurrogates:
|
||||
"""Surrogates in reasoning fields (byte-level reasoning models).
|
||||
|
||||
xiaomi/mimo, kimi, glm and similar byte-level tokenizers can emit lone
|
||||
surrogates in reasoning output. These fields are carried through to the
|
||||
API as `reasoning_content` on assistant messages, and must be sanitized
|
||||
or json.dumps() crashes with 'utf-8' codec can't encode surrogates.
|
||||
"""
|
||||
|
||||
def test_reasoning_field_sanitized(self):
|
||||
msgs = [
|
||||
{"role": "assistant", "content": "ok", "reasoning": "thought \udce2 here"},
|
||||
]
|
||||
assert _sanitize_messages_surrogates(msgs) is True
|
||||
assert "\udce2" not in msgs[0]["reasoning"]
|
||||
assert "\ufffd" in msgs[0]["reasoning"]
|
||||
|
||||
def test_reasoning_content_field_sanitized(self):
|
||||
"""api_messages carry `reasoning_content` built from `reasoning`."""
|
||||
msgs = [
|
||||
{"role": "assistant", "content": "ok", "reasoning_content": "thought \udce2 here"},
|
||||
]
|
||||
assert _sanitize_messages_surrogates(msgs) is True
|
||||
assert "\udce2" not in msgs[0]["reasoning_content"]
|
||||
assert "\ufffd" in msgs[0]["reasoning_content"]
|
||||
|
||||
def test_reasoning_details_nested_sanitized(self):
|
||||
"""reasoning_details is a list of dicts with nested string fields."""
|
||||
msgs = [
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": "ok",
|
||||
"reasoning_details": [
|
||||
{"type": "reasoning.summary", "summary": "summary \udce2 text"},
|
||||
{"type": "reasoning.text", "text": "chain \udc00 of thought"},
|
||||
],
|
||||
},
|
||||
]
|
||||
assert _sanitize_messages_surrogates(msgs) is True
|
||||
assert "\udce2" not in msgs[0]["reasoning_details"][0]["summary"]
|
||||
assert "\ufffd" in msgs[0]["reasoning_details"][0]["summary"]
|
||||
assert "\udc00" not in msgs[0]["reasoning_details"][1]["text"]
|
||||
assert "\ufffd" in msgs[0]["reasoning_details"][1]["text"]
|
||||
|
||||
def test_deeply_nested_reasoning_sanitized(self):
|
||||
"""Nested dicts / lists inside extra fields are recursed into."""
|
||||
msgs = [
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": "ok",
|
||||
"reasoning_details": [
|
||||
{
|
||||
"type": "reasoning.encrypted",
|
||||
"content": {
|
||||
"encrypted_content": "opaque",
|
||||
"text_parts": ["part1", "part2 \udce2 part"],
|
||||
},
|
||||
},
|
||||
],
|
||||
},
|
||||
]
|
||||
assert _sanitize_messages_surrogates(msgs) is True
|
||||
assert (
|
||||
msgs[0]["reasoning_details"][0]["content"]["text_parts"][1]
|
||||
== "part2 \ufffd part"
|
||||
)
|
||||
|
||||
def test_reasoning_end_to_end_json_serialization(self):
|
||||
"""After sanitization, the full message dict must serialize clean."""
|
||||
msgs = [
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": "answer",
|
||||
"reasoning_content": "reasoning with \udce2 surrogate",
|
||||
"reasoning_details": [
|
||||
{"summary": "nested \udcb0 surrogate"},
|
||||
],
|
||||
},
|
||||
]
|
||||
_sanitize_messages_surrogates(msgs)
|
||||
# Must round-trip through json + utf-8 encoding without error
|
||||
payload = json.dumps(msgs, ensure_ascii=False).encode("utf-8")
|
||||
assert b"\\" not in payload[:0] # sanity — just ensure we got bytes
|
||||
assert len(payload) > 0
|
||||
|
||||
def test_no_surrogates_returns_false(self):
|
||||
"""Clean reasoning fields don't trigger a modification."""
|
||||
msgs = [
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": "ok",
|
||||
"reasoning": "clean thought",
|
||||
"reasoning_content": "also clean",
|
||||
"reasoning_details": [{"summary": "clean summary"}],
|
||||
},
|
||||
]
|
||||
assert _sanitize_messages_surrogates(msgs) is False
|
||||
|
||||
|
||||
class TestSanitizeStructureSurrogates:
|
||||
"""Test the _sanitize_structure_surrogates() helper for nested payloads."""
|
||||
|
||||
def test_empty_payload(self):
|
||||
assert _sanitize_structure_surrogates({}) is False
|
||||
assert _sanitize_structure_surrogates([]) is False
|
||||
|
||||
def test_flat_dict(self):
|
||||
payload = {"a": "clean", "b": "dirty \udce2 text"}
|
||||
assert _sanitize_structure_surrogates(payload) is True
|
||||
assert payload["a"] == "clean"
|
||||
assert "\ufffd" in payload["b"]
|
||||
|
||||
def test_flat_list(self):
|
||||
payload = ["clean", "dirty \udce2"]
|
||||
assert _sanitize_structure_surrogates(payload) is True
|
||||
assert payload[0] == "clean"
|
||||
assert "\ufffd" in payload[1]
|
||||
|
||||
def test_nested_dict_in_list(self):
|
||||
payload = [{"x": "dirty \udce2"}, {"x": "clean"}]
|
||||
assert _sanitize_structure_surrogates(payload) is True
|
||||
assert "\ufffd" in payload[0]["x"]
|
||||
assert payload[1]["x"] == "clean"
|
||||
|
||||
def test_deeply_nested(self):
|
||||
payload = {
|
||||
"level1": {
|
||||
"level2": [
|
||||
{"level3": "deep \udce2 surrogate"},
|
||||
],
|
||||
},
|
||||
}
|
||||
assert _sanitize_structure_surrogates(payload) is True
|
||||
assert "\ufffd" in payload["level1"]["level2"][0]["level3"]
|
||||
|
||||
def test_clean_payload_returns_false(self):
|
||||
payload = {"a": "clean", "b": [{"c": "also clean"}]}
|
||||
assert _sanitize_structure_surrogates(payload) is False
|
||||
|
||||
def test_non_string_values_ignored(self):
|
||||
payload = {"int": 42, "list": [1, 2, 3], "dict": {"none": None}, "bool": True}
|
||||
assert _sanitize_structure_surrogates(payload) is False
|
||||
# Non-string values survive unchanged
|
||||
assert payload["int"] == 42
|
||||
assert payload["list"] == [1, 2, 3]
|
||||
|
||||
|
||||
class TestApiMessagesSurrogateRecovery:
|
||||
"""Integration: verify the recovery block sanitizes api_messages.
|
||||
|
||||
The bug this guards against: a surrogate in `reasoning_content` on
|
||||
api_messages (transformed from `reasoning` during build) crashes the
|
||||
OpenAI SDK's json.dumps(), and the recovery block previously only
|
||||
sanitized the canonical `messages` list — not `api_messages` — so the
|
||||
next retry would send the same broken payload and fail 3 times.
|
||||
"""
|
||||
|
||||
def test_api_messages_reasoning_content_sanitized(self):
|
||||
"""The extended sanitizer catches reasoning_content in api_messages."""
|
||||
api_messages = [
|
||||
{"role": "system", "content": "sys"},
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": "response",
|
||||
"reasoning_content": "thought \udce2 trail",
|
||||
"tool_calls": [
|
||||
{
|
||||
"id": "call_1",
|
||||
"function": {"name": "tool", "arguments": "{}"},
|
||||
}
|
||||
],
|
||||
},
|
||||
{"role": "tool", "content": "result", "tool_call_id": "call_1"},
|
||||
]
|
||||
assert _sanitize_messages_surrogates(api_messages) is True
|
||||
assert "\udce2" not in api_messages[1]["reasoning_content"]
|
||||
# Full payload must now serialize clean
|
||||
json.dumps(api_messages, ensure_ascii=False).encode("utf-8")
|
||||
|
||||
|
||||
class TestRunConversationSurrogateSanitization:
|
||||
"""Integration: verify run_conversation sanitizes user_message."""
|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user