From 813dbd9b40bc9a00bd364cabe139ba8e8ac6f312 Mon Sep 17 00:00:00 2001 From: Devzo Date: Tue, 7 Apr 2026 23:27:50 +0200 Subject: [PATCH] fix(codex): route auth failures to fallback provider chain MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix two related paths where a Codex auth failure silently bypassed the fallback chain instead of switching to the next provider: 1. cli.py — _ensure_runtime_credentials() calls resolve_runtime_provider() before each turn. When the provider is explicitly configured (not "auto"), an AuthError from token refresh was re-raised and printed as a bold-red error, returning False before the agent ever started. The fallback chain was never tried. Fix: on AuthError, iterate fallback_providers and switch to the first one that resolves successfully. 2. run_agent.py — inside the codex_responses validity gate (inner retry loop), a response whose status was in {"failed","cancelled"} but whose output items were non-empty was treated as valid and broke out of the retry loop, reaching _normalize_codex_response() outside the fallback machinery. That function raises RuntimeError on status="failed", which propagates to the outer except with no fallback logic. Fix: detect terminal statuses before the output_items check and set response_invalid=True so the existing fallback chain fires normally.
--- cli.py | 31 +++++++++++++++++++++++++++- run_agent.py | 57 ++++++++++++++++++++++++++++++++++------------------ 2 files changed, 68 insertions(+), 20 deletions(-) diff --git a/cli.py b/cli.py index 10315d6c..ea43cdf5 100644 --- a/cli.py +++ b/cli.py @@ -3083,6 +3083,8 @@ class HermesCLI: format_runtime_provider_error, ) + _primary_exc = None + runtime = None try: runtime = resolve_runtime_provider( requested=self.requested_provider, @@ -3090,7 +3092,34 @@ class HermesCLI: explicit_base_url=self._explicit_base_url, ) except Exception as exc: - message = format_runtime_provider_error(exc) + _primary_exc = exc + + # Primary provider auth failed — try fallback providers before giving up. + if runtime is None and _primary_exc is not None: + from hermes_cli.auth import AuthError + if isinstance(_primary_exc, AuthError): + _fb_chain = self._fallback_model if isinstance(self._fallback_model, list) else [] + for _fb in _fb_chain: + _fb_provider = (_fb.get("provider") or "").strip().lower() + _fb_model = (_fb.get("model") or "").strip() + if not _fb_provider or not _fb_model: + continue + try: + runtime = resolve_runtime_provider(requested=_fb_provider) + logger.warning( + "Primary provider auth failed (%s). Falling through to fallback: %s/%s", + _primary_exc, _fb_provider, _fb_model, + ) + _cprint(f"⚠️ Primary auth failed — switching to fallback: {_fb_provider} / {_fb_model}") + self.requested_provider = _fb_provider + self.model = _fb_model + _primary_exc = None + break + except Exception: + continue + + if runtime is None: + message = format_runtime_provider_error(_primary_exc) if _primary_exc else "Provider resolution failed." 
ChatConsole().print(f"[bold red]{message}[/]") return False diff --git a/run_agent.py b/run_agent.py index e68e8f54..8994f206 100644 --- a/run_agent.py +++ b/run_agent.py @@ -9532,28 +9532,47 @@ class AIAgent: response_invalid = True error_details.append("response is None") else: - # output_text fallback: stream backfill may have failed - # but normalize can still recover from output_text - _out_text = getattr(response, "output_text", None) - _out_text_stripped = _out_text.strip() if isinstance(_out_text, str) else "" - if _out_text_stripped: - logger.debug( - "Codex response.output is empty but output_text is present " - "(%d chars); deferring to normalization.", - len(_out_text_stripped), + # Provider returned a terminal failure (e.g. quota exhaustion). + # Treat as invalid so the fallback chain is triggered instead of + # letting the error bubble up outside the retry/fallback loop. + _codex_resp_status = str(getattr(response, "status", "") or "").strip().lower() + if _codex_resp_status in {"failed", "cancelled"}: + _codex_error_obj = getattr(response, "error", None) + _codex_error_msg = ( + _codex_error_obj.get("message") if isinstance(_codex_error_obj, dict) + else str(_codex_error_obj) if _codex_error_obj + else f"Responses API returned status '{_codex_resp_status}'" ) - else: - _resp_status = getattr(response, "status", None) - _resp_incomplete = getattr(response, "incomplete_details", None) - logger.warning( - "Codex response.output is empty after stream backfill " - "(status=%s, incomplete_details=%s, model=%s). %s", - _resp_status, _resp_incomplete, - getattr(response, "model", None), - f"api_mode={self.api_mode} provider={self.provider}", + logging.warning( + "Codex response status='%s' (error=%s). Routing to fallback. 
%s", + _codex_resp_status, _codex_error_msg, + self._client_log_context(), ) response_invalid = True - error_details.append("response.output is empty") + error_details.append(f"response.status={_codex_resp_status}: {_codex_error_msg}") + else: + # output_text fallback: stream backfill may have failed + # but normalize can still recover from output_text + _out_text = getattr(response, "output_text", None) + _out_text_stripped = _out_text.strip() if isinstance(_out_text, str) else "" + if _out_text_stripped: + logger.debug( + "Codex response.output is empty but output_text is present " + "(%d chars); deferring to normalization.", + len(_out_text_stripped), + ) + else: + _resp_status = getattr(response, "status", None) + _resp_incomplete = getattr(response, "incomplete_details", None) + logger.warning( + "Codex response.output is empty after stream backfill " + "(status=%s, incomplete_details=%s, model=%s). %s", + _resp_status, _resp_incomplete, + getattr(response, "model", None), + f"api_mode={self.api_mode} provider={self.provider}", + ) + response_invalid = True + error_details.append("response.output is empty") elif self.api_mode == "anthropic_messages": _tv = self._get_transport() if not _tv.validate_response(response):