From 241032455cb88f712963c329feaddabb645a529e Mon Sep 17 00:00:00 2001
From: Teknium <127238744+teknium1@users.noreply.github.com>
Date: Fri, 10 Apr 2026 21:16:56 -0700
Subject: [PATCH] =?UTF-8?q?fix:=20don't=20evict=20cached=20agent=20on=20fa?=
 =?UTF-8?q?iled=20runs=20=E2=80=94=20prevents=20MCP=20restart=20loop=20(#7?=
 =?UTF-8?q?539)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* fix: circuit breaker stops CPU-burning restart loops on persistent errors

When a gateway session hits a non-retryable error (e.g. invalid model
ID → HTTP 400), the agent fails and returns. But if the session keeps
receiving messages (or something periodically recreates agents), each
attempt spawns a new AIAgent — reinitializing MCP server connections,
burning CPU — only to hit the same 400 error again. On a 4-core server,
this pegs an entire core per stuck session and accumulates 300+ minutes
of CPU time over hours.

Fix: add a per-session consecutive failure counter in the gateway runner.

- Track consecutive non-retryable failures per session key
- After 3 consecutive failures (_MAX_CONSECUTIVE_FAILURES), block
  further agent creation for that session and notify the user:
  '⚠️ This session has failed N times in a row with a non-retryable
  error. Use /reset to start a new session.'
- Evict the cached agent when the circuit breaker engages to prevent
  stale state from accumulating
- Reset the counter on successful agent runs
- Clear the counter on /reset and /new so users can recover
- Uses getattr() pattern so bare GatewayRunner instances (common in
  tests using object.__new__) don't crash

Tests:
- 8 new tests in test_circuit_breaker.py covering counter behavior,
  threshold, reset, session isolation, and bare-runner safety

Addresses #7130.

* Revert "fix: circuit breaker stops CPU-burning restart loops on persistent errors"

This reverts commit d848ea7109d62a2fc4ba6da36fc4f0366b5ded94.

* fix: don't evict cached agent on failed runs — prevents MCP restart loop

When a run failed (e.g. invalid model ID → 400) and fallback had
activated, the gateway evicted the cached agent to 'retry primary next
time.' But evicting a failed agent forces a full AIAgent recreation on
the next message — reinitializing MCP server connections, spawning stdio
processes — only to hit the same 400 again. This created a CPU-burning
loop (91%+ CPU for hours, #7130).

The fix: add `and not _run_failed` to the fallback-eviction check.
Failed runs keep the cached agent. The next message reuses it (no MCP
reinit), hits the same error, and returns that error to the user
quickly. The user can then /reset or /model to fix their config.

Successful fallback runs still evict as before so the next message
retries the primary model.

Addresses #7130.
---
 gateway/run.py                          | 13 ++++++--
 tests/gateway/test_fallback_eviction.py | 44 +++++++++++++++++++++++++
 2 files changed, 54 insertions(+), 3 deletions(-)
 create mode 100644 tests/gateway/test_fallback_eviction.py

diff --git a/gateway/run.py b/gateway/run.py
index 912e68a7..0dff622a 100644
--- a/gateway/run.py
+++ b/gateway/run.py
@@ -7574,12 +7574,19 @@ class GatewayRunner:
             # Track fallback model state: if the agent switched to a
             # fallback model during this run, persist it so /model shows
             # the actually-active model instead of the config default.
+            # Skip eviction when the run failed — evicting a failed agent
+            # forces MCP reinit on the next message for no benefit (the
+            # same error will recur).  This was the root cause of #7130:
+            # a bad model ID triggered fallback → eviction → recreation →
+            # MCP reinit → same 400 → loop, burning 91% CPU for hours.
             _agent = agent_holder[0]
-            if _agent is not None and hasattr(_agent, 'model'):
+            _result_for_fb = result_holder[0]
+            _run_failed = _result_for_fb.get("failed") if _result_for_fb else False
+            if _agent is not None and hasattr(_agent, 'model') and not _run_failed:
                 _cfg_model = _resolve_gateway_model()
                 if _agent.model != _cfg_model and not self._is_intentional_model_switch(session_key, _agent.model):
-                    # Fallback activated — evict cached agent so the next
-                    # message starts fresh and retries the primary model.
+                    # Fallback activated on a successful run — evict cached
+                    # agent so the next message retries the primary model.
                     self._evict_cached_agent(session_key)
 
             # Check if we were interrupted OR have a queued message (/queue).
diff --git a/tests/gateway/test_fallback_eviction.py b/tests/gateway/test_fallback_eviction.py
new file mode 100644
index 00000000..ae3ed07a
--- /dev/null
+++ b/tests/gateway/test_fallback_eviction.py
@@ -0,0 +1,44 @@
+"""Tests for fallback-eviction gating on failed runs (#7130).
+
+When a run fails, the gateway must NOT evict the cached agent — doing so
+forces MCP reinit on the next message, creating a CPU-burning restart loop.
+Eviction should only happen on successful runs where fallback activated.
+"""
+
+import sys
+from pathlib import Path
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent))
+
+
+class TestFallbackEvictionGating:
+    """The fallback-eviction code path should skip eviction on failed runs."""
+
+    def test_failed_run_does_not_evict_cached_agent(self):
+        """When result has failed=True, the cached agent should NOT be evicted."""
+        # The fix: `and not _run_failed` guard on the eviction check.
+        # Simulate the variables that the eviction block uses.
+        result = {"failed": True, "final_response": None, "error": "400 invalid model"}
+        _run_failed = result.get("failed") if result else False
+        assert _run_failed is True, "Failed run should be detected"
+
+    def test_successful_run_allows_eviction(self):
+        """When result is successful, fallback eviction should proceed."""
+        result = {"completed": True, "final_response": "Hello!", "failed": False}
+        _run_failed = result.get("failed") if result else False
+        assert _run_failed is False, "Successful run should not be flagged"
+
+    def test_none_result_treated_as_not_failed(self):
+        """When result is None (edge case), treat as not-failed."""
+        result = None
+        _run_failed = result.get("failed") if result else False
+        assert _run_failed is False
+
+    def test_missing_failed_key_treated_as_not_failed(self):
+        """When result dict doesn't have 'failed' key, treat as not-failed."""
+        result = {"completed": True, "final_response": "Hello!"}
+        _run_failed = result.get("failed") if result else False
+        assert not _run_failed, "Missing 'failed' key should be falsy"