From 3ff18ffe1408b37baa1d604dadecd20fe455c55e Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Wed, 15 Apr 2026 22:33:48 -0700 Subject: [PATCH] fix: add circuit breaker to MCP tool handler to prevent retry burn loops (#10447) (#10776) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When an MCP server returns errors consistently (crashed, disconnected, auth expired), the model sees each error and retries the tool call. With no circuit breaker, this burned through all 90 iterations — each one a full LLM API call plus failed MCP call — producing 15-45 minutes of zero useful output while the gateway inactivity timeout never fired (because the agent WAS active, just uselessly). Fix: track consecutive error counts per MCP server. After 3 consecutive failures (connection errors, MCP-level errors, or transport exceptions), the handler short-circuits with a message telling the model to stop retrying and use alternative approaches. The counter resets to 0 on any successful call. Closes #10447 --- tools/mcp_tool.py | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/tools/mcp_tool.py b/tools/mcp_tool.py index 5f450522..a73aa438 100644 --- a/tools/mcp_tool.py +++ b/tools/mcp_tool.py @@ -1166,6 +1166,14 @@ class MCPServerTask: _servers: Dict[str, MCPServerTask] = {} +# Circuit breaker: consecutive error counts per server. After +# _CIRCUIT_BREAKER_THRESHOLD consecutive failures, the handler returns +# a "server unreachable" message that tells the model to stop retrying, +# preventing the 90-iteration burn loop described in #10447. +# Reset to 0 on any successful call. +_server_error_counts: Dict[str, int] = {} +_CIRCUIT_BREAKER_THRESHOLD = 3 + # Dedicated event loop running in a background daemon thread. _mcp_loop: Optional[asyncio.AbstractEventLoop] = None _mcp_thread: Optional[threading.Thread] = None @@ -1356,9 +1364,23 @@ def _make_tool_handler(server_name: str, tool_name: str, tool_timeout: float): """ def _handler(args: dict, **kwargs) -> str: + # Circuit breaker: if this server has failed too many times + # consecutively, short-circuit with a clear message so the model + # stops retrying and uses alternative approaches (#10447). + if _server_error_counts.get(server_name, 0) >= _CIRCUIT_BREAKER_THRESHOLD: + return json.dumps({ + "error": ( + f"MCP server '{server_name}' is unreachable after " + f"{_CIRCUIT_BREAKER_THRESHOLD} consecutive failures. " + f"Do NOT retry this tool — use alternative approaches " + f"or ask the user to check the MCP server." + ) + }, ensure_ascii=False) + with _lock: server = _servers.get(server_name) if not server or not server.session: + _server_error_counts[server_name] = _server_error_counts.get(server_name, 0) + 1 return json.dumps({ "error": f"MCP server '{server_name}' is not connected" }, ensure_ascii=False) @@ -1399,10 +1421,21 @@ def _make_tool_handler(server_name: str, tool_name: str, tool_timeout: float): return json.dumps({"result": text_result}, ensure_ascii=False) try: - return _run_on_mcp_loop(_call(), timeout=tool_timeout) + result = _run_on_mcp_loop(_call(), timeout=tool_timeout) + # Check if the MCP tool itself returned an error + try: + parsed = json.loads(result) + if "error" in parsed: + _server_error_counts[server_name] = _server_error_counts.get(server_name, 0) + 1 + else: + _server_error_counts[server_name] = 0 # success — reset + except (json.JSONDecodeError, TypeError): + _server_error_counts[server_name] = 0 # non-JSON = success + return result except InterruptedError: return _interrupted_call_result() except Exception as exc: + _server_error_counts[server_name] = _server_error_counts.get(server_name, 0) + 1 logger.error( "MCP tool %s/%s call failed: %s", server_name, tool_name, exc,