fix(gateway): persist user message on transient agent failures (#7100)

The #1630 fix introduced a blanket ``agent_failed_early`` transcript skip to prevent context-overflow sessions from looping. That guard also triggers for unrelated transient failures (429 rate limits, read timeouts, connection resets, provider 5xx) which have nothing to do with session size — and it silently drops the user's message, so the agent has no memory of the last turn on retry. Split the failure classification in ``GatewayRunner._run_agent``: * Context-overflow (``compression_exhausted`` flag, explicit context-length phrases, or generic 400 with a long history) → keep the existing skip, preserving the #1630/#9893 fix. * Anything else that failed → persist just the user message so the conversation survives a retry. Use specific multi-word phrases (``context length``, ``token limit``, ``prompt is too long``, etc.) to match ``run_agent.py``'s own classifier; bare ``exceed`` false-positively flagged "rate limit exceeded" as context overflow. Covered by new tests in ``tests/gateway/test_7100_transient_failure_transcript.py`` and the existing #1630 suite still passes.
2026-04-17 21:41:48 +02:00 · 2026-04-17 21:41:48 +02:00 · d1d0ef6dbd
commit d1d0ef6dbd
parent 87f5e1a25a
2 changed files with 186 additions and 10 deletions
--- a/gateway/run.py
+++ b/gateway/run.py
@ -5584,15 +5584,43 @@ class GatewayRunner:
            # intermediate reasoning) so sessions can be resumed with full context
            # and transcripts are useful for debugging and training data.
            #
-            # IMPORTANT: When the agent failed (e.g. context-overflow 400,
-            # compression exhausted), do NOT persist the user's message.
-            # Persisting it would make the session even larger, causing the
-            # same failure on the next attempt — an infinite loop. (#1630, #9893)
+            # IMPORTANT: For context-overflow failures (compression exhausted,
+            # generic 400 on large sessions) we must NOT persist the user's
+            # message — doing so would grow the session further and cause the
+            # same failure on the next attempt, an infinite loop. (#1630, #9893)
+            #
+            # Transient failures (429, timeout, connection error, provider 5xx)
+            # are different: the session is not oversized, and silently dropping
+            # the user message causes severe context loss on retry — the agent
+            # forgets what was just asked.  Persist the user turn so the
+            # conversation is preserved. (#7100)
            agent_failed_early = bool(agent_result.get("failed"))
-            if agent_failed_early:
+            _err_str_for_classify = str(agent_result.get("error", "")).lower()
+            # Use specific multi-word phrases (not bare "exceed" or "token")
+            # to avoid false positives on transient errors like "rate limit
+            # exceeded" or "invalid auth token". Matches run_agent.py's
+            # own context-length classifier.
+            is_context_overflow_failure = agent_failed_early and (
+                bool(agent_result.get("compression_exhausted"))
+                or any(p in _err_str_for_classify for p in (
+                    "context length", "context size", "context window",
+                    "maximum context", "token limit", "too many tokens",
+                    "reduce the length", "exceeds the limit",
+                    "request entity too large", "prompt is too long",
+                    "payload too large", "input is too long",
+                ))
+                or ("400" in _err_str_for_classify and len(history) > 50)
+            )
+            if is_context_overflow_failure:
                logger.info(
-                    "Skipping transcript persistence for failed request in "
-                    "session %s to prevent session growth loop.",
+                    "Skipping transcript persistence for context-overflow "
+                    "failure in session %s to prevent session growth loop.",
+                    session_entry.session_id,
+                )
+            elif agent_failed_early:
+                logger.info(
+                    "Transient agent failure in session %s — persisting user "
+                    "message so conversation context is preserved on retry.",
                    session_entry.session_id,
                )

@ -5622,7 +5650,7 @@ class GatewayRunner:
            # If this is a fresh session (no history), write the full tool
            # definitions as the first entry so the transcript is self-describing
            # -- the same list of dicts sent as tools=[...] in the API request.
-            if agent_failed_early:
+            if is_context_overflow_failure:
                pass  # Skip all transcript writes — don't grow a broken session
            elif not history:
                tool_defs = agent_result.get("tools", [])
@ -5641,10 +5669,21 @@ class GatewayRunner:
            # Use the filtered history length (history_offset) that was actually
            # passed to the agent, not len(history) which includes session_meta
            # entries that were stripped before the agent saw them.
-            if not agent_failed_early:
+            if is_context_overflow_failure:
+                pass  # handled above — skip all transcript writes
+            elif agent_failed_early:
+                # Transient failure (429/timeout/5xx): persist only the user
+                # message so the next message can load a transcript that
+                # reflects what was said.  Skip the assistant error text since
+                # it's a gateway-generated hint, not model output. (#7100)
+                self.session_store.append_to_transcript(
+                    session_entry.session_id,
+                    {"role": "user", "content": message_text, "timestamp": ts},
+                )
+            else:
                history_len = agent_result.get("history_offset", len(history))
                new_messages = agent_messages[history_len:] if len(agent_messages) > history_len else []
-                
+
                # If no new messages found (edge case), fall back to simple user/assistant
                if not new_messages:
                    self.session_store.append_to_transcript(
--- a/tests/gateway/test_7100_transient_failure_transcript.py
+++ b/tests/gateway/test_7100_transient_failure_transcript.py
@ -0,0 +1,137 @@
+"""Tests for #7100 — transient failures (429/timeout) must not drop the
+user message from the transcript.
+
+The #1630 fix introduced a blanket skip of transcript writes on any
+``failed`` agent result.  That was correct for context-overflow failures
+(which would otherwise cause a session-growth loop), but it also caused
+transient provider failures (rate limits, read timeouts, connection
+resets) to silently drop the user's message — so the agent had no memory
+of the last turn on the next attempt.
+
+The gateway classifier must distinguish:
+
+* ``compression_exhausted=True`` OR context-keyword errors OR a generic
+  ``400`` on a long history  → context-overflow → skip transcript
+* everything else that fails → transient → persist the user message
+"""
+
+import pytest
+
+
+def _classify(agent_result: dict, history_len: int) -> tuple[bool, bool]:
+    """Replicate the gateway classifier from GatewayRunner._run_agent.
+
+    Returns ``(agent_failed_early, is_context_overflow_failure)``.
+    """
+    agent_failed_early = bool(agent_result.get("failed"))
+    err = str(agent_result.get("error", "")).lower()
+    is_context_overflow_failure = agent_failed_early and (
+        bool(agent_result.get("compression_exhausted"))
+        or any(p in err for p in (
+            "context length", "context size", "context window",
+            "maximum context", "token limit", "too many tokens",
+            "reduce the length", "exceeds the limit",
+            "request entity too large", "prompt is too long",
+            "payload too large", "input is too long",
+        ))
+        or ("400" in err and history_len > 50)
+    )
+    return agent_failed_early, is_context_overflow_failure
+
+
+class TestContextOverflowStillSkipsTranscript:
+    """#1630 behavior must be preserved for real context-overflow cases."""
+
+    def test_compression_exhausted_is_context_overflow(self):
+        agent_result = {
+            "failed": True,
+            "compression_exhausted": True,
+            "error": "Request payload too large: max compression attempts reached.",
+        }
+        failed, ctx_overflow = _classify(agent_result, history_len=100)
+        assert failed
+        assert ctx_overflow
+
+    def test_explicit_context_length_error_is_context_overflow(self):
+        agent_result = {
+            "failed": True,
+            "error": "prompt is too long: 250000 tokens > 200000 maximum",
+        }
+        failed, ctx_overflow = _classify(agent_result, history_len=10)
+        assert failed
+        assert ctx_overflow
+
+    def test_generic_400_on_large_session_is_context_overflow(self):
+        agent_result = {
+            "failed": True,
+            "error": "error code: 400 - {'type': 'error', 'message': 'Error'}",
+        }
+        failed, ctx_overflow = _classify(agent_result, history_len=100)
+        assert failed
+        assert ctx_overflow
+
+
+class TestTransientFailureKeepsUserMessage:
+    """Transient provider failures must NOT skip the transcript — doing so
+    drops the user message and the agent forgets the turn. (#7100)"""
+
+    def test_rate_limit_429_is_not_context_overflow(self):
+        agent_result = {
+            "failed": True,
+            "error": (
+                "API call failed after 3 retries: 429 Too Many Requests "
+                "— rate limit exceeded"
+            ),
+        }
+        failed, ctx_overflow = _classify(agent_result, history_len=10)
+        assert failed
+        assert not ctx_overflow
+
+    def test_read_timeout_is_not_context_overflow(self):
+        agent_result = {
+            "failed": True,
+            "error": "ReadTimeout: HTTPSConnectionPool(host='api.z.ai'): Read timed out.",
+        }
+        failed, ctx_overflow = _classify(agent_result, history_len=10)
+        assert failed
+        assert not ctx_overflow
+
+    def test_connection_reset_is_not_context_overflow(self):
+        agent_result = {
+            "failed": True,
+            "error": "ConnectionError: [Errno 54] Connection reset by peer",
+        }
+        failed, ctx_overflow = _classify(agent_result, history_len=10)
+        assert failed
+        assert not ctx_overflow
+
+    def test_provider_500_is_not_context_overflow(self):
+        agent_result = {
+            "failed": True,
+            "error": "API call failed after 3 retries: 500 Internal Server Error",
+        }
+        failed, ctx_overflow = _classify(agent_result, history_len=10)
+        assert failed
+        assert not ctx_overflow
+
+    def test_generic_400_on_short_session_is_not_context_overflow(self):
+        """A 400 on a short session is a real client error, not context
+        overflow — still not a reason to drop the user turn."""
+        agent_result = {
+            "failed": True,
+            "error": "error code: 400 - invalid model",
+        }
+        failed, ctx_overflow = _classify(agent_result, history_len=5)
+        assert failed
+        assert not ctx_overflow
+
+
+class TestSuccessfulResultUnaffected:
+    def test_successful_result_neither_failed_nor_overflow(self):
+        agent_result = {
+            "final_response": "Hello!",
+            "messages": [{"role": "assistant", "content": "Hello!"}],
+        }
+        failed, ctx_overflow = _classify(agent_result, history_len=10)
+        assert not failed
+        assert not ctx_overflow