diff --git a/gateway/run.py b/gateway/run.py index 912e68a7..0dff622a 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -7574,12 +7574,19 @@ class GatewayRunner: # Track fallback model state: if the agent switched to a # fallback model during this run, persist it so /model shows # the actually-active model instead of the config default. + # Skip eviction when the run failed — evicting a failed agent + # forces MCP reinit on the next message for no benefit (the + # same error will recur). This was the root cause of #7130: + # a bad model ID triggered fallback → eviction → recreation → + # MCP reinit → same 400 → loop, burning 91% CPU for hours. _agent = agent_holder[0] - if _agent is not None and hasattr(_agent, 'model'): + _result_for_fb = result_holder[0] + _run_failed = _result_for_fb.get("failed") if _result_for_fb else False + if _agent is not None and hasattr(_agent, 'model') and not _run_failed: _cfg_model = _resolve_gateway_model() if _agent.model != _cfg_model and not self._is_intentional_model_switch(session_key, _agent.model): - # Fallback activated — evict cached agent so the next - # message starts fresh and retries the primary model. + # Fallback activated on a successful run — evict cached + # agent so the next message retries the primary model. self._evict_cached_agent(session_key) # Check if we were interrupted OR have a queued message (/queue). diff --git a/tests/gateway/test_fallback_eviction.py b/tests/gateway/test_fallback_eviction.py new file mode 100644 index 00000000..ae3ed07a --- /dev/null +++ b/tests/gateway/test_fallback_eviction.py @@ -0,0 +1,44 @@ +"""Tests for fallback-eviction gating on failed runs (#7130). + +When a run fails, the gateway must NOT evict the cached agent — doing so +forces MCP reinit on the next message, creating a CPU-burning restart loop. +Eviction should only happen on successful runs where fallback activated. 
import sys
from pathlib import Path
from unittest.mock import MagicMock, patch

import pytest

sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent))

# Fixed identifiers used by every scenario below.
_SESSION_KEY = "session-under-test"
_PRIMARY_MODEL = "primary-model"
_FALLBACK_MODEL = "fallback-model"


class TestFallbackEvictionGating:
    """The fallback-eviction code path should skip eviction on failed runs.

    Each test replays the post-run eviction gate against a mock runner and
    asserts on whether ``_evict_cached_agent`` was actually invoked, rather
    than only inspecting the intermediate ``_run_failed`` flag.
    """

    @staticmethod
    def _evicted_after_run(result, *, agent_model=_FALLBACK_MODEL, intentional_switch=False):
        """Replay the post-run eviction gate and report whether it evicted.

        NOTE(review): this mirrors the inline gate in ``gateway/run.py``
        (agent present, has ``model``, run not failed, model differs from the
        configured one, switch not intentional). If that logic is ever
        extracted into its own method, call it here instead of mirroring it.

        Args:
            result: The run result dict, or ``None`` when no result exists.
            agent_model: Model the cached agent ended the run on.
            intentional_switch: Value the mock runner reports from
                ``_is_intentional_model_switch``.

        Returns:
            ``True`` if the gate evicted the cached agent, else ``False``.
        """
        runner = MagicMock()
        runner._is_intentional_model_switch.return_value = intentional_switch
        agent = MagicMock()
        agent.model = agent_model

        run_failed = result.get("failed") if result else False
        if agent is not None and hasattr(agent, "model") and not run_failed:
            model_differs = agent.model != _PRIMARY_MODEL
            if model_differs and not runner._is_intentional_model_switch(
                _SESSION_KEY, agent.model
            ):
                runner._evict_cached_agent(_SESSION_KEY)

        evicted = runner._evict_cached_agent.called
        if evicted:
            # Eviction must target exactly the session that just ran.
            runner._evict_cached_agent.assert_called_once_with(_SESSION_KEY)
        return evicted

    def test_failed_run_does_not_evict_cached_agent(self):
        """When result has failed=True, the cached agent should NOT be evicted."""
        result = {"failed": True, "final_response": None, "error": "400 invalid model"}
        assert not self._evicted_after_run(result), "Failed run must not evict the cached agent"

    def test_successful_run_allows_eviction(self):
        """When result is successful, fallback eviction should proceed."""
        result = {"completed": True, "final_response": "Hello!", "failed": False}
        assert self._evicted_after_run(result), "Successful fallback run should evict"

    def test_none_result_treated_as_not_failed(self):
        """When result is None (edge case), treat as not-failed."""
        assert self._evicted_after_run(None), "None result should not block eviction"

    def test_missing_failed_key_treated_as_not_failed(self):
        """When result dict doesn't have 'failed' key, treat as not-failed."""
        result = {"completed": True, "final_response": "Hello!"}
        assert self._evicted_after_run(result), "Missing 'failed' key should be falsy"

    def test_successful_run_on_configured_model_keeps_agent(self):
        """No fallback happened, so there is nothing to evict."""
        result = {"completed": True, "failed": False}
        assert not self._evicted_after_run(result, agent_model=_PRIMARY_MODEL)

    def test_intentional_model_switch_keeps_agent(self):
        """A deliberate model switch is not a fallback and must not evict."""
        result = {"completed": True, "failed": False}
        assert not self._evicted_after_run(result, intentional_switch=True)