fix(gateway): persist user message on transient agent failures (#7100)

The #1630 fix introduced a blanket ``agent_failed_early`` transcript skip
to prevent context-overflow sessions from looping.  That guard also
triggers for unrelated transient failures (429 rate limits, read
timeouts, connection resets, provider 5xx) which have nothing to do with
session size — and it silently drops the user's message, so the agent
has no memory of the last turn on retry.

Split the failure classification in ``GatewayRunner._run_agent``:

* Context-overflow (``compression_exhausted`` flag, explicit
  context-length phrases, or generic 400 with a long history) → keep
  the existing skip, preserving the #1630/#9893 fix.
* Anything else that failed → persist just the user message so the
  conversation survives a retry.

Use specific multi-word phrases (``context length``, ``token limit``,
``prompt is too long``, etc.) to match ``run_agent.py``'s own
classifier; bare ``exceed`` false-positively flagged "rate limit
exceeded" as context overflow.

Covered by new tests in ``tests/gateway/test_7100_transient_failure_transcript.py``
and the existing #1630 suite still passes.
This commit is contained in:
konsisumer 2026-04-17 21:41:48 +02:00 committed by Teknium
parent 87f5e1a25a
commit d1d0ef6dbd
2 changed files with 186 additions and 10 deletions

View File

@ -5584,15 +5584,43 @@ class GatewayRunner:
# intermediate reasoning) so sessions can be resumed with full context
# and transcripts are useful for debugging and training data.
#
# IMPORTANT: When the agent failed (e.g. context-overflow 400,
# compression exhausted), do NOT persist the user's message.
# Persisting it would make the session even larger, causing the
# same failure on the next attempt — an infinite loop. (#1630, #9893)
# IMPORTANT: For context-overflow failures (compression exhausted,
# generic 400 on large sessions) we must NOT persist the user's
# message — doing so would grow the session further and cause the
# same failure on the next attempt, an infinite loop. (#1630, #9893)
#
# Transient failures (429, timeout, connection error, provider 5xx)
# are different: the session is not oversized, and silently dropping
# the user message causes severe context loss on retry — the agent
# forgets what was just asked. Persist the user turn so the
# conversation is preserved. (#7100)
agent_failed_early = bool(agent_result.get("failed"))
if agent_failed_early:
_err_str_for_classify = str(agent_result.get("error", "")).lower()
# Use specific multi-word phrases (not bare "exceed" or "token")
# to avoid false positives on transient errors like "rate limit
# exceeded" or "invalid auth token". Matches run_agent.py's
# own context-length classifier.
is_context_overflow_failure = agent_failed_early and (
bool(agent_result.get("compression_exhausted"))
or any(p in _err_str_for_classify for p in (
"context length", "context size", "context window",
"maximum context", "token limit", "too many tokens",
"reduce the length", "exceeds the limit",
"request entity too large", "prompt is too long",
"payload too large", "input is too long",
))
or ("400" in _err_str_for_classify and len(history) > 50)
)
if is_context_overflow_failure:
logger.info(
"Skipping transcript persistence for failed request in "
"session %s to prevent session growth loop.",
"Skipping transcript persistence for context-overflow "
"failure in session %s to prevent session growth loop.",
session_entry.session_id,
)
elif agent_failed_early:
logger.info(
"Transient agent failure in session %s — persisting user "
"message so conversation context is preserved on retry.",
session_entry.session_id,
)
@ -5622,7 +5650,7 @@ class GatewayRunner:
# If this is a fresh session (no history), write the full tool
# definitions as the first entry so the transcript is self-describing
# -- the same list of dicts sent as tools=[...] in the API request.
if agent_failed_early:
if is_context_overflow_failure:
pass # Skip all transcript writes — don't grow a broken session
elif not history:
tool_defs = agent_result.get("tools", [])
@ -5641,10 +5669,21 @@ class GatewayRunner:
# Use the filtered history length (history_offset) that was actually
# passed to the agent, not len(history) which includes session_meta
# entries that were stripped before the agent saw them.
if not agent_failed_early:
if is_context_overflow_failure:
pass # handled above — skip all transcript writes
elif agent_failed_early:
# Transient failure (429/timeout/5xx): persist only the user
# message so the next message can load a transcript that
# reflects what was said. Skip the assistant error text since
# it's a gateway-generated hint, not model output. (#7100)
self.session_store.append_to_transcript(
session_entry.session_id,
{"role": "user", "content": message_text, "timestamp": ts},
)
else:
history_len = agent_result.get("history_offset", len(history))
new_messages = agent_messages[history_len:] if len(agent_messages) > history_len else []
# If no new messages found (edge case), fall back to simple user/assistant
if not new_messages:
self.session_store.append_to_transcript(

View File

@ -0,0 +1,137 @@
"""Tests for #7100 — transient failures (429/timeout) must not drop the
user message from the transcript.
The #1630 fix introduced a blanket skip of transcript writes on any
``failed`` agent result. That was correct for context-overflow failures
(which would otherwise cause a session-growth loop), but it also caused
transient provider failures (rate limits, read timeouts, connection
resets) to silently drop the user's message — so the agent had no memory
of the last turn on the next attempt.
The gateway classifier must distinguish:
* ``compression_exhausted=True`` OR context-keyword errors OR a generic
``400`` on a long history context-overflow skip transcript
* everything else that fails transient persist the user message
"""
import pytest
def _classify(agent_result: dict, history_len: int) -> tuple[bool, bool]:
"""Replicate the gateway classifier from GatewayRunner._run_agent.
Returns ``(agent_failed_early, is_context_overflow_failure)``.
"""
agent_failed_early = bool(agent_result.get("failed"))
err = str(agent_result.get("error", "")).lower()
is_context_overflow_failure = agent_failed_early and (
bool(agent_result.get("compression_exhausted"))
or any(p in err for p in (
"context length", "context size", "context window",
"maximum context", "token limit", "too many tokens",
"reduce the length", "exceeds the limit",
"request entity too large", "prompt is too long",
"payload too large", "input is too long",
))
or ("400" in err and history_len > 50)
)
return agent_failed_early, is_context_overflow_failure
class TestContextOverflowStillSkipsTranscript:
"""#1630 behavior must be preserved for real context-overflow cases."""
def test_compression_exhausted_is_context_overflow(self):
agent_result = {
"failed": True,
"compression_exhausted": True,
"error": "Request payload too large: max compression attempts reached.",
}
failed, ctx_overflow = _classify(agent_result, history_len=100)
assert failed
assert ctx_overflow
def test_explicit_context_length_error_is_context_overflow(self):
agent_result = {
"failed": True,
"error": "prompt is too long: 250000 tokens > 200000 maximum",
}
failed, ctx_overflow = _classify(agent_result, history_len=10)
assert failed
assert ctx_overflow
def test_generic_400_on_large_session_is_context_overflow(self):
agent_result = {
"failed": True,
"error": "error code: 400 - {'type': 'error', 'message': 'Error'}",
}
failed, ctx_overflow = _classify(agent_result, history_len=100)
assert failed
assert ctx_overflow
class TestTransientFailureKeepsUserMessage:
"""Transient provider failures must NOT skip the transcript — doing so
drops the user message and the agent forgets the turn. (#7100)"""
def test_rate_limit_429_is_not_context_overflow(self):
agent_result = {
"failed": True,
"error": (
"API call failed after 3 retries: 429 Too Many Requests "
"— rate limit exceeded"
),
}
failed, ctx_overflow = _classify(agent_result, history_len=10)
assert failed
assert not ctx_overflow
def test_read_timeout_is_not_context_overflow(self):
agent_result = {
"failed": True,
"error": "ReadTimeout: HTTPSConnectionPool(host='api.z.ai'): Read timed out.",
}
failed, ctx_overflow = _classify(agent_result, history_len=10)
assert failed
assert not ctx_overflow
def test_connection_reset_is_not_context_overflow(self):
agent_result = {
"failed": True,
"error": "ConnectionError: [Errno 54] Connection reset by peer",
}
failed, ctx_overflow = _classify(agent_result, history_len=10)
assert failed
assert not ctx_overflow
def test_provider_500_is_not_context_overflow(self):
agent_result = {
"failed": True,
"error": "API call failed after 3 retries: 500 Internal Server Error",
}
failed, ctx_overflow = _classify(agent_result, history_len=10)
assert failed
assert not ctx_overflow
def test_generic_400_on_short_session_is_not_context_overflow(self):
"""A 400 on a short session is a real client error, not context
overflow still not a reason to drop the user turn."""
agent_result = {
"failed": True,
"error": "error code: 400 - invalid model",
}
failed, ctx_overflow = _classify(agent_result, history_len=5)
assert failed
assert not ctx_overflow
class TestSuccessfulResultUnaffected:
def test_successful_result_neither_failed_nor_overflow(self):
agent_result = {
"final_response": "Hello!",
"messages": [{"role": "assistant", "content": "Hello!"}],
}
failed, ctx_overflow = _classify(agent_result, history_len=10)
assert not failed
assert not ctx_overflow