From 161acb0086274e30c806e6abfbcbe0d3a8740873 Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Tue, 31 Mar 2026 12:02:29 -0700 Subject: [PATCH] fix: credential pool 401 recovery rotates to next credential after failed refresh (#4300) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When an OAuth token refresh fails on a 401 error, the pool recovery would return 'not recovered' without trying the next credential in the pool. This meant users who added a second valid credential via 'hermes auth add' would never see it used when the primary credential was dead. Now: try refresh first (handles expired tokens quickly), and if that fails, rotate to the next available credential — same as 429/402 already did. Adds three tests covering 401 refresh success, refresh-fail-then-rotate, and refresh-fail-with-no-remaining-credentials. --- run_agent.py | 7 +++++ tests/test_run_agent.py | 65 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 72 insertions(+) diff --git a/run_agent.py b/run_agent.py index 717c26b4..3cfcc12a 100644 --- a/run_agent.py +++ b/run_agent.py @@ -3862,6 +3862,13 @@ class AIAgent: logger.info(f"Credential 401 — refreshed pool entry {getattr(refreshed, 'id', '?')}") self._swap_credential(refreshed) return True, has_retried_429 + # Refresh failed — rotate to next credential instead of giving up. + # The failed entry is already marked exhausted by try_refresh_current(). + next_entry = pool.mark_exhausted_and_rotate(status_code=401) + if next_entry is not None: + logger.info(f"Credential 401 (refresh failed) — rotated to pool entry {getattr(next_entry, 'id', '?')}") + self._swap_credential(next_entry) + return True, False return False, has_retried_429 diff --git a/tests/test_run_agent.py b/tests/test_run_agent.py index aa74164a..99905bb5 100644 --- a/tests/test_run_agent.py +++ b/tests/test_run_agent.py @@ -1848,6 +1848,71 @@ class TestCredentialPoolRecovery: agent._swap_credential.assert_called_once_with(next_entry) + def test_recover_with_pool_refreshes_on_401(self, agent): + """401 with successful refresh should swap to refreshed credential.""" + refreshed_entry = SimpleNamespace(label="refreshed-primary", id="abc") + + class _Pool: + def try_refresh_current(self): + return refreshed_entry + + agent._credential_pool = _Pool() + agent._swap_credential = MagicMock() + + recovered, retry_same = agent._recover_with_credential_pool( + status_code=401, + has_retried_429=False, + ) + + assert recovered is True + agent._swap_credential.assert_called_once_with(refreshed_entry) + + def test_recover_with_pool_rotates_on_401_when_refresh_fails(self, agent): + """401 with failed refresh should rotate to next credential.""" + next_entry = SimpleNamespace(label="secondary", id="def") + + class _Pool: + def try_refresh_current(self): + return None # refresh failed + + def mark_exhausted_and_rotate(self, *, status_code): + assert status_code == 401 + return next_entry + + agent._credential_pool = _Pool() + agent._swap_credential = MagicMock() + + recovered, retry_same = agent._recover_with_credential_pool( + status_code=401, + has_retried_429=False, + ) + + assert recovered is True + assert retry_same is False + agent._swap_credential.assert_called_once_with(next_entry) + + def test_recover_with_pool_401_refresh_fails_no_more_credentials(self, agent): + """401 with failed refresh and no other credentials returns not recovered.""" + + class _Pool: + def try_refresh_current(self): + return None + + def mark_exhausted_and_rotate(self, *, status_code): + return None # no more credentials + + agent._credential_pool = _Pool() + agent._swap_credential = MagicMock() + + recovered, retry_same = agent._recover_with_credential_pool( + status_code=401, + has_retried_429=False, + ) + + assert recovered is False + agent._swap_credential.assert_not_called() + + class TestMaxTokensParam: """Verify _max_tokens_param returns the correct key for each provider."""