fix(gateway): persist user message on transient agent failures (#7100)
The #1630 fix introduced a blanket ``agent_failed_early`` transcript skip to prevent context-overflow sessions from looping. That guard also triggers for unrelated transient failures (429 rate limits, read timeouts, connection resets, provider 5xx) which have nothing to do with session size — and it silently drops the user's message, so the agent has no memory of the last turn on retry. Split the failure classification in ``GatewayRunner._run_agent``: * Context-overflow (``compression_exhausted`` flag, explicit context-length phrases, or generic 400 with a long history) → keep the existing skip, preserving the #1630/#9893 fix. * Anything else that failed → persist just the user message so the conversation survives a retry. Use specific multi-word phrases (``context length``, ``token limit``, ``prompt is too long``, etc.) to match ``run_agent.py``'s own classifier; bare ``exceed`` false-positively flagged "rate limit exceeded" as context overflow. Covered by new tests in ``tests/gateway/test_7100_transient_failure_transcript.py`` and the existing #1630 suite still passes.
This commit is contained in:
parent
87f5e1a25a
commit
d1d0ef6dbd
@ -5584,15 +5584,43 @@ class GatewayRunner:
|
||||
# intermediate reasoning) so sessions can be resumed with full context
|
||||
# and transcripts are useful for debugging and training data.
|
||||
#
|
||||
# IMPORTANT: When the agent failed (e.g. context-overflow 400,
|
||||
# compression exhausted), do NOT persist the user's message.
|
||||
# Persisting it would make the session even larger, causing the
|
||||
# same failure on the next attempt — an infinite loop. (#1630, #9893)
|
||||
# IMPORTANT: For context-overflow failures (compression exhausted,
|
||||
# generic 400 on large sessions) we must NOT persist the user's
|
||||
# message — doing so would grow the session further and cause the
|
||||
# same failure on the next attempt, an infinite loop. (#1630, #9893)
|
||||
#
|
||||
# Transient failures (429, timeout, connection error, provider 5xx)
|
||||
# are different: the session is not oversized, and silently dropping
|
||||
# the user message causes severe context loss on retry — the agent
|
||||
# forgets what was just asked. Persist the user turn so the
|
||||
# conversation is preserved. (#7100)
|
||||
agent_failed_early = bool(agent_result.get("failed"))
|
||||
if agent_failed_early:
|
||||
_err_str_for_classify = str(agent_result.get("error", "")).lower()
|
||||
# Use specific multi-word phrases (not bare "exceed" or "token")
|
||||
# to avoid false positives on transient errors like "rate limit
|
||||
# exceeded" or "invalid auth token". Matches run_agent.py's
|
||||
# own context-length classifier.
|
||||
is_context_overflow_failure = agent_failed_early and (
|
||||
bool(agent_result.get("compression_exhausted"))
|
||||
or any(p in _err_str_for_classify for p in (
|
||||
"context length", "context size", "context window",
|
||||
"maximum context", "token limit", "too many tokens",
|
||||
"reduce the length", "exceeds the limit",
|
||||
"request entity too large", "prompt is too long",
|
||||
"payload too large", "input is too long",
|
||||
))
|
||||
or ("400" in _err_str_for_classify and len(history) > 50)
|
||||
)
|
||||
if is_context_overflow_failure:
|
||||
logger.info(
|
||||
"Skipping transcript persistence for failed request in "
|
||||
"session %s to prevent session growth loop.",
|
||||
"Skipping transcript persistence for context-overflow "
|
||||
"failure in session %s to prevent session growth loop.",
|
||||
session_entry.session_id,
|
||||
)
|
||||
elif agent_failed_early:
|
||||
logger.info(
|
||||
"Transient agent failure in session %s — persisting user "
|
||||
"message so conversation context is preserved on retry.",
|
||||
session_entry.session_id,
|
||||
)
|
||||
|
||||
@ -5622,7 +5650,7 @@ class GatewayRunner:
|
||||
# If this is a fresh session (no history), write the full tool
|
||||
# definitions as the first entry so the transcript is self-describing
|
||||
# -- the same list of dicts sent as tools=[...] in the API request.
|
||||
if agent_failed_early:
|
||||
if is_context_overflow_failure:
|
||||
pass # Skip all transcript writes — don't grow a broken session
|
||||
elif not history:
|
||||
tool_defs = agent_result.get("tools", [])
|
||||
@ -5641,10 +5669,21 @@ class GatewayRunner:
|
||||
# Use the filtered history length (history_offset) that was actually
|
||||
# passed to the agent, not len(history) which includes session_meta
|
||||
# entries that were stripped before the agent saw them.
|
||||
if not agent_failed_early:
|
||||
if is_context_overflow_failure:
|
||||
pass # handled above — skip all transcript writes
|
||||
elif agent_failed_early:
|
||||
# Transient failure (429/timeout/5xx): persist only the user
|
||||
# message so the next message can load a transcript that
|
||||
# reflects what was said. Skip the assistant error text since
|
||||
# it's a gateway-generated hint, not model output. (#7100)
|
||||
self.session_store.append_to_transcript(
|
||||
session_entry.session_id,
|
||||
{"role": "user", "content": message_text, "timestamp": ts},
|
||||
)
|
||||
else:
|
||||
history_len = agent_result.get("history_offset", len(history))
|
||||
new_messages = agent_messages[history_len:] if len(agent_messages) > history_len else []
|
||||
|
||||
|
||||
# If no new messages found (edge case), fall back to simple user/assistant
|
||||
if not new_messages:
|
||||
self.session_store.append_to_transcript(
|
||||
|
||||
137
tests/gateway/test_7100_transient_failure_transcript.py
Normal file
137
tests/gateway/test_7100_transient_failure_transcript.py
Normal file
@ -0,0 +1,137 @@
|
||||
"""Tests for #7100 — transient failures (429/timeout) must not drop the
|
||||
user message from the transcript.
|
||||
|
||||
The #1630 fix introduced a blanket skip of transcript writes on any
|
||||
``failed`` agent result. That was correct for context-overflow failures
|
||||
(which would otherwise cause a session-growth loop), but it also caused
|
||||
transient provider failures (rate limits, read timeouts, connection
|
||||
resets) to silently drop the user's message — so the agent had no memory
|
||||
of the last turn on the next attempt.
|
||||
|
||||
The gateway classifier must distinguish:
|
||||
|
||||
* ``compression_exhausted=True`` OR context-keyword errors OR a generic
|
||||
``400`` on a long history → context-overflow → skip transcript
|
||||
* everything else that fails → transient → persist the user message
|
||||
"""
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
def _classify(agent_result: dict, history_len: int) -> tuple[bool, bool]:
|
||||
"""Replicate the gateway classifier from GatewayRunner._run_agent.
|
||||
|
||||
Returns ``(agent_failed_early, is_context_overflow_failure)``.
|
||||
"""
|
||||
agent_failed_early = bool(agent_result.get("failed"))
|
||||
err = str(agent_result.get("error", "")).lower()
|
||||
is_context_overflow_failure = agent_failed_early and (
|
||||
bool(agent_result.get("compression_exhausted"))
|
||||
or any(p in err for p in (
|
||||
"context length", "context size", "context window",
|
||||
"maximum context", "token limit", "too many tokens",
|
||||
"reduce the length", "exceeds the limit",
|
||||
"request entity too large", "prompt is too long",
|
||||
"payload too large", "input is too long",
|
||||
))
|
||||
or ("400" in err and history_len > 50)
|
||||
)
|
||||
return agent_failed_early, is_context_overflow_failure
|
||||
|
||||
|
||||
class TestContextOverflowStillSkipsTranscript:
|
||||
"""#1630 behavior must be preserved for real context-overflow cases."""
|
||||
|
||||
def test_compression_exhausted_is_context_overflow(self):
|
||||
agent_result = {
|
||||
"failed": True,
|
||||
"compression_exhausted": True,
|
||||
"error": "Request payload too large: max compression attempts reached.",
|
||||
}
|
||||
failed, ctx_overflow = _classify(agent_result, history_len=100)
|
||||
assert failed
|
||||
assert ctx_overflow
|
||||
|
||||
def test_explicit_context_length_error_is_context_overflow(self):
|
||||
agent_result = {
|
||||
"failed": True,
|
||||
"error": "prompt is too long: 250000 tokens > 200000 maximum",
|
||||
}
|
||||
failed, ctx_overflow = _classify(agent_result, history_len=10)
|
||||
assert failed
|
||||
assert ctx_overflow
|
||||
|
||||
def test_generic_400_on_large_session_is_context_overflow(self):
|
||||
agent_result = {
|
||||
"failed": True,
|
||||
"error": "error code: 400 - {'type': 'error', 'message': 'Error'}",
|
||||
}
|
||||
failed, ctx_overflow = _classify(agent_result, history_len=100)
|
||||
assert failed
|
||||
assert ctx_overflow
|
||||
|
||||
|
||||
class TestTransientFailureKeepsUserMessage:
|
||||
"""Transient provider failures must NOT skip the transcript — doing so
|
||||
drops the user message and the agent forgets the turn. (#7100)"""
|
||||
|
||||
def test_rate_limit_429_is_not_context_overflow(self):
|
||||
agent_result = {
|
||||
"failed": True,
|
||||
"error": (
|
||||
"API call failed after 3 retries: 429 Too Many Requests "
|
||||
"— rate limit exceeded"
|
||||
),
|
||||
}
|
||||
failed, ctx_overflow = _classify(agent_result, history_len=10)
|
||||
assert failed
|
||||
assert not ctx_overflow
|
||||
|
||||
def test_read_timeout_is_not_context_overflow(self):
|
||||
agent_result = {
|
||||
"failed": True,
|
||||
"error": "ReadTimeout: HTTPSConnectionPool(host='api.z.ai'): Read timed out.",
|
||||
}
|
||||
failed, ctx_overflow = _classify(agent_result, history_len=10)
|
||||
assert failed
|
||||
assert not ctx_overflow
|
||||
|
||||
def test_connection_reset_is_not_context_overflow(self):
|
||||
agent_result = {
|
||||
"failed": True,
|
||||
"error": "ConnectionError: [Errno 54] Connection reset by peer",
|
||||
}
|
||||
failed, ctx_overflow = _classify(agent_result, history_len=10)
|
||||
assert failed
|
||||
assert not ctx_overflow
|
||||
|
||||
def test_provider_500_is_not_context_overflow(self):
|
||||
agent_result = {
|
||||
"failed": True,
|
||||
"error": "API call failed after 3 retries: 500 Internal Server Error",
|
||||
}
|
||||
failed, ctx_overflow = _classify(agent_result, history_len=10)
|
||||
assert failed
|
||||
assert not ctx_overflow
|
||||
|
||||
def test_generic_400_on_short_session_is_not_context_overflow(self):
|
||||
"""A 400 on a short session is a real client error, not context
|
||||
overflow — still not a reason to drop the user turn."""
|
||||
agent_result = {
|
||||
"failed": True,
|
||||
"error": "error code: 400 - invalid model",
|
||||
}
|
||||
failed, ctx_overflow = _classify(agent_result, history_len=5)
|
||||
assert failed
|
||||
assert not ctx_overflow
|
||||
|
||||
|
||||
class TestSuccessfulResultUnaffected:
|
||||
def test_successful_result_neither_failed_nor_overflow(self):
|
||||
agent_result = {
|
||||
"final_response": "Hello!",
|
||||
"messages": [{"role": "assistant", "content": "Hello!"}],
|
||||
}
|
||||
failed, ctx_overflow = _classify(agent_result, history_len=10)
|
||||
assert not failed
|
||||
assert not ctx_overflow
|
||||
Loading…
Reference in New Issue
Block a user