From 241032455cb88f712963c329feaddabb645a529e Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Fri, 10 Apr 2026 21:16:56 -0700 Subject: [PATCH] =?UTF-8?q?fix:=20don't=20evict=20cached=20agent=20on=20fa?= =?UTF-8?q?iled=20runs=20=E2=80=94=20prevents=20MCP=20restart=20loop=20(#7?= =?UTF-8?q?539)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix: circuit breaker stops CPU-burning restart loops on persistent errors When a gateway session hits a non-retryable error (e.g. invalid model ID → HTTP 400), the agent fails and returns. But if the session keeps receiving messages (or something periodically recreates agents), each attempt spawns a new AIAgent — reinitializing MCP server connections, burning CPU — only to hit the same 400 error again. On a 4-core server, this pegs an entire core per stuck session and accumulates 300+ minutes of CPU time over hours. Fix: add a per-session consecutive failure counter in the gateway runner. - Track consecutive non-retryable failures per session key - After 3 consecutive failures (_MAX_CONSECUTIVE_FAILURES), block further agent creation for that session and notify the user: '⚠️ This session has failed N times in a row with a non-retryable error. Use /reset to start a new session.' - Evict the cached agent when the circuit breaker engages to prevent stale state from accumulating - Reset the counter on successful agent runs - Clear the counter on /reset and /new so users can recover - Uses getattr() pattern so bare GatewayRunner instances (common in tests using object.__new__) don't crash Tests: - 8 new tests in test_circuit_breaker.py covering counter behavior, threshold, reset, session isolation, and bare-runner safety Addresses #7130. * Revert "fix: circuit breaker stops CPU-burning restart loops on persistent errors" This reverts commit d848ea7109d62a2fc4ba6da36fc4f0366b5ded94. 
* fix: don't evict cached agent on failed runs — prevents MCP restart loop When a run fails (e.g. invalid model ID → 400) and fallback was activated, the gateway was evicting the cached agent to 'retry primary next time.' But evicting a failed agent forces a full AIAgent recreation on the next message — reinitializing MCP server connections, spawning stdio processes — only to hit the same 400 again. This created a CPU-burning loop (91%+ for hours, #7130). The fix: add `and not _run_failed` to the fallback-eviction check. Failed runs keep the cached agent. The next message reuses it (no MCP reinit), hits the same error, and returns it to the user quickly. The user can /reset or /model to fix their config. Successful fallback runs still evict as before so the next message retries the primary model. Addresses #7130. --- gateway/run.py | 13 ++++++-- tests/gateway/test_fallback_eviction.py | 44 +++++++++++++++++++++++++ 2 files changed, 54 insertions(+), 3 deletions(-) create mode 100644 tests/gateway/test_fallback_eviction.py diff --git a/gateway/run.py b/gateway/run.py index 912e68a7..0dff622a 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -7574,12 +7574,19 @@ class GatewayRunner: # Track fallback model state: if the agent switched to a # fallback model during this run, persist it so /model shows # the actually-active model instead of the config default. + # Skip eviction when the run failed — evicting a failed agent + # forces MCP reinit on the next message for no benefit (the + # same error will recur). This was the root cause of #7130: + # a bad model ID triggered fallback → eviction → recreation → + # MCP reinit → same 400 → loop, burning 91% CPU for hours. 
_agent = agent_holder[0] - if _agent is not None and hasattr(_agent, 'model'): + _result_for_fb = result_holder[0] + _run_failed = _result_for_fb.get("failed") if _result_for_fb else False + if _agent is not None and hasattr(_agent, 'model') and not _run_failed: _cfg_model = _resolve_gateway_model() if _agent.model != _cfg_model and not self._is_intentional_model_switch(session_key, _agent.model): - # Fallback activated — evict cached agent so the next - # message starts fresh and retries the primary model. + # Fallback activated on a successful run — evict cached + # agent so the next message retries the primary model. self._evict_cached_agent(session_key) # Check if we were interrupted OR have a queued message (/queue). diff --git a/tests/gateway/test_fallback_eviction.py b/tests/gateway/test_fallback_eviction.py new file mode 100644 index 00000000..ae3ed07a --- /dev/null +++ b/tests/gateway/test_fallback_eviction.py @@ -0,0 +1,44 @@ +"""Tests for fallback-eviction gating on failed runs (#7130). + +When a run fails, the gateway must NOT evict the cached agent — doing so +forces MCP reinit on the next message, creating a CPU-burning restart loop. +Eviction should only happen on successful runs where fallback activated. +""" + +import sys +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + +sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent)) + + +class TestFallbackEvictionGating: + """The fallback-eviction code path should skip eviction on failed runs.""" + + def test_failed_run_does_not_evict_cached_agent(self): + """When result has failed=True, the cached agent should NOT be evicted.""" + # The fix: `and not _run_failed` guard on the eviction check. + # Simulate the variables that the eviction block uses. 
+ result = {"failed": True, "final_response": None, "error": "400 invalid model"} + _run_failed = result.get("failed") if result else False + assert _run_failed is True, "Failed run should be detected" + + def test_successful_run_allows_eviction(self): + """When result is successful, fallback eviction should proceed.""" + result = {"completed": True, "final_response": "Hello!", "failed": False} + _run_failed = result.get("failed") if result else False + assert _run_failed is False, "Successful run should not be flagged" + + def test_none_result_treated_as_not_failed(self): + """When result is None (edge case), treat as not-failed.""" + result = None + _run_failed = result.get("failed") if result else False + assert _run_failed is False + + def test_missing_failed_key_treated_as_not_failed(self): + """When result dict doesn't have 'failed' key, treat as not-failed.""" + result = {"completed": True, "final_response": "Hello!"} + _run_failed = result.get("failed") if result else False + assert not _run_failed, "Missing 'failed' key should be falsy"