fix: credential pool 401 recovery rotates to next credential after failed refresh (#4300)

When an OAuth token refresh fails on a 401 error, the pool recovery
would return 'not recovered' without trying the next credential in the
pool. This meant users who added a second valid credential via
'hermes auth add' would never see it used when the primary credential
was dead.

Now: try refresh first (handles expired tokens quickly), and if that
fails, rotate to the next available credential — same as 429/402
already did.

Adds three tests covering 401 refresh success, refresh-fail-then-rotate,
and refresh-fail-with-no-remaining-credentials.
This commit is contained in:
Teknium 2026-03-31 12:02:29 -07:00 committed by GitHub
parent 143b74ec00
commit 161acb0086
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 72 additions and 0 deletions

View File

@ -3862,6 +3862,13 @@ class AIAgent:
logger.info(f"Credential 401 — refreshed pool entry {getattr(refreshed, 'id', '?')}")
self._swap_credential(refreshed)
return True, has_retried_429
# Refresh failed — rotate to next credential instead of giving up.
# The failed entry is already marked exhausted by try_refresh_current().
next_entry = pool.mark_exhausted_and_rotate(status_code=401)
if next_entry is not None:
logger.info(f"Credential 401 (refresh failed) — rotated to pool entry {getattr(next_entry, 'id', '?')}")
self._swap_credential(next_entry)
return True, False
return False, has_retried_429

View File

@ -1848,6 +1848,71 @@ class TestCredentialPoolRecovery:
agent._swap_credential.assert_called_once_with(next_entry)
def test_recover_with_pool_refreshes_on_401(self, agent):
"""401 with successful refresh should swap to refreshed credential."""
refreshed_entry = SimpleNamespace(label="refreshed-primary", id="abc")
class _Pool:
def try_refresh_current(self):
return refreshed_entry
agent._credential_pool = _Pool()
agent._swap_credential = MagicMock()
recovered, retry_same = agent._recover_with_credential_pool(
status_code=401,
has_retried_429=False,
)
assert recovered is True
agent._swap_credential.assert_called_once_with(refreshed_entry)
def test_recover_with_pool_rotates_on_401_when_refresh_fails(self, agent):
"""401 with failed refresh should rotate to next credential."""
next_entry = SimpleNamespace(label="secondary", id="def")
class _Pool:
def try_refresh_current(self):
return None # refresh failed
def mark_exhausted_and_rotate(self, *, status_code):
assert status_code == 401
return next_entry
agent._credential_pool = _Pool()
agent._swap_credential = MagicMock()
recovered, retry_same = agent._recover_with_credential_pool(
status_code=401,
has_retried_429=False,
)
assert recovered is True
assert retry_same is False
agent._swap_credential.assert_called_once_with(next_entry)
def test_recover_with_pool_401_refresh_fails_no_more_credentials(self, agent):
"""401 with failed refresh and no other credentials returns not recovered."""
class _Pool:
def try_refresh_current(self):
return None
def mark_exhausted_and_rotate(self, *, status_code):
return None # no more credentials
agent._credential_pool = _Pool()
agent._swap_credential = MagicMock()
recovered, retry_same = agent._recover_with_credential_pool(
status_code=401,
has_retried_429=False,
)
assert recovered is False
agent._swap_credential.assert_not_called()
class TestMaxTokensParam:
"""Verify _max_tokens_param returns the correct key for each provider."""