fix(codex): route auth failures to fallback provider chain

Two related paths where Codex auth failures silently bypassed the
fallback chain instead of switching to the next provider:

1. cli.py — _ensure_runtime_credentials() calls resolve_runtime_provider()
   before each turn. When provider is explicitly configured (not "auto"),
   an AuthError from token refresh is re-raised and printed as a bold-red
   error, returning False before the agent ever starts. The fallback chain
   was never tried. Fix: on AuthError, iterate fallback_providers and
   switch to the first one that resolves successfully.

2. run_agent.py — inside the codex_responses validity gate (inner retry
   loop), response.status in {"failed","cancelled"} with non-empty output
   items was treated as a valid response and broke out of the retry loop,
   reaching _normalize_codex_response() outside the fallback machinery.
   That function raises RuntimeError on status="failed", which propagates
   to the outer except with no fallback logic. Fix: detect terminal status
   codes before the output_items check and set response_invalid=True so
   the existing fallback chain fires normally.
This commit is contained in:
Devzo 2026-04-07 23:27:50 +02:00 committed by Teknium
parent f76df30e08
commit 813dbd9b40
2 changed files with 68 additions and 20 deletions

31
cli.py
View File

@ -3083,6 +3083,8 @@ class HermesCLI:
format_runtime_provider_error,
)
_primary_exc = None
runtime = None
try:
runtime = resolve_runtime_provider(
requested=self.requested_provider,
@ -3090,7 +3092,34 @@ class HermesCLI:
explicit_base_url=self._explicit_base_url,
)
except Exception as exc:
message = format_runtime_provider_error(exc)
_primary_exc = exc
# Primary provider auth failed — try fallback providers before giving up.
if runtime is None and _primary_exc is not None:
from hermes_cli.auth import AuthError
if isinstance(_primary_exc, AuthError):
_fb_chain = self._fallback_model if isinstance(self._fallback_model, list) else []
for _fb in _fb_chain:
_fb_provider = (_fb.get("provider") or "").strip().lower()
_fb_model = (_fb.get("model") or "").strip()
if not _fb_provider or not _fb_model:
continue
try:
runtime = resolve_runtime_provider(requested=_fb_provider)
logger.warning(
"Primary provider auth failed (%s). Falling through to fallback: %s/%s",
_primary_exc, _fb_provider, _fb_model,
)
_cprint(f"⚠️ Primary auth failed — switching to fallback: {_fb_provider} / {_fb_model}")
self.requested_provider = _fb_provider
self.model = _fb_model
_primary_exc = None
break
except Exception:
continue
if runtime is None:
message = format_runtime_provider_error(_primary_exc) if _primary_exc else "Provider resolution failed."
ChatConsole().print(f"[bold red]{message}[/]")
return False

View File

@ -9532,28 +9532,47 @@ class AIAgent:
response_invalid = True
error_details.append("response is None")
else:
# output_text fallback: stream backfill may have failed
# but normalize can still recover from output_text
_out_text = getattr(response, "output_text", None)
_out_text_stripped = _out_text.strip() if isinstance(_out_text, str) else ""
if _out_text_stripped:
logger.debug(
"Codex response.output is empty but output_text is present "
"(%d chars); deferring to normalization.",
len(_out_text_stripped),
# Provider returned a terminal failure (e.g. quota exhaustion).
# Treat as invalid so the fallback chain is triggered instead of
# letting the error bubble up outside the retry/fallback loop.
_codex_resp_status = str(getattr(response, "status", "") or "").strip().lower()
if _codex_resp_status in {"failed", "cancelled"}:
_codex_error_obj = getattr(response, "error", None)
_codex_error_msg = (
_codex_error_obj.get("message") if isinstance(_codex_error_obj, dict)
else str(_codex_error_obj) if _codex_error_obj
else f"Responses API returned status '{_codex_resp_status}'"
)
else:
_resp_status = getattr(response, "status", None)
_resp_incomplete = getattr(response, "incomplete_details", None)
logger.warning(
"Codex response.output is empty after stream backfill "
"(status=%s, incomplete_details=%s, model=%s). %s",
_resp_status, _resp_incomplete,
getattr(response, "model", None),
f"api_mode={self.api_mode} provider={self.provider}",
logging.warning(
"Codex response status='%s' (error=%s). Routing to fallback. %s",
_codex_resp_status, _codex_error_msg,
self._client_log_context(),
)
response_invalid = True
error_details.append("response.output is empty")
error_details.append(f"response.status={_codex_resp_status}: {_codex_error_msg}")
else:
# output_text fallback: stream backfill may have failed
# but normalize can still recover from output_text
_out_text = getattr(response, "output_text", None)
_out_text_stripped = _out_text.strip() if isinstance(_out_text, str) else ""
if _out_text_stripped:
logger.debug(
"Codex response.output is empty but output_text is present "
"(%d chars); deferring to normalization.",
len(_out_text_stripped),
)
else:
_resp_status = getattr(response, "status", None)
_resp_incomplete = getattr(response, "incomplete_details", None)
logger.warning(
"Codex response.output is empty after stream backfill "
"(status=%s, incomplete_details=%s, model=%s). %s",
_resp_status, _resp_incomplete,
getattr(response, "model", None),
f"api_mode={self.api_mode} provider={self.provider}",
)
response_invalid = True
error_details.append("response.output is empty")
elif self.api_mode == "anthropic_messages":
_tv = self._get_transport()
if not _tv.validate_response(response):