Merge pull request #2549 from Molecule-AI/fix/orphan-sweeper-skip-external-runtime

fix(orphan-sweeper): exclude runtime='external' from stale-token revoke
This commit is contained in:
Hongming Wang 2026-05-03 07:52:43 +00:00 committed by GitHub
commit 67f3e49e42
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 22 additions and 8 deletions

View File

@ -413,12 +413,22 @@ func sweepStaleTokensWithoutContainer(ctx context.Context, reaper OrphanReaper)
// `"5m0s"` mismatch with Postgres interval grammar; passing seconds
// as an int keeps the binding portable.
graceSeconds := int(staleTokenGrace.Seconds())
// `runtime != 'external'` is load-bearing: external workspaces have NO
// local container by design (the agent runs off-host), so the
// "no live container" predicate below would match every external
// workspace and revoke its token. The token is the off-host agent's
// only authentication credential — revoking breaks the entire
// external-runtime feature. Discovered 2026-05-03 when a fresh
// external workspace had its token silently revoked ~6 minutes after
// creation by this sweep, killing the operator's MCP heartbeat and
// inbox poll with `HTTP 401 — token may be revoked`.
rows, qErr := db.DB.QueryContext(ctx, `
SELECT DISTINCT t.workspace_id::text
FROM workspace_auth_tokens t
JOIN workspaces w ON w.id = t.workspace_id
WHERE t.revoked_at IS NULL
AND w.status NOT IN ('removed', 'provisioning')
AND w.runtime != 'external'
AND COALESCE(t.last_used_at, t.created_at) < now() - make_interval(secs => $2)
AND (
cardinality($1::text[]) = 0

View File

@ -20,11 +20,13 @@ import (
// individual tests don't have to spell out a query they're not actually
// asserting against.
//
// The regex is anchored at the start of the query AND requires the
// status-filter to keep us from accidentally matching a future query
// that opens with the same column name. R3 from the review.
// The regex is anchored at the start of the query AND requires both the
// status-filter (R3 from the review) and the runtime-filter (2026-05-03
// fix for external workspaces being incorrectly swept), to keep us from
// accidentally matching a future query that opens with the same column
// name OR a regression that drops one of the load-bearing predicates.
func expectStaleTokenSweepNoOp(mock sqlmock.Sqlmock) {
mock.ExpectQuery(`(?s)^\s*SELECT DISTINCT t\.workspace_id::text\s+FROM workspace_auth_tokens.*status NOT IN \('removed', 'provisioning'\)`).
mock.ExpectQuery(`(?s)^\s*SELECT DISTINCT t\.workspace_id::text\s+FROM workspace_auth_tokens.*status NOT IN \('removed', 'provisioning'\).*runtime != 'external'`).
WillReturnRows(sqlmock.NewRows([]string{"workspace_id"}))
}
@ -486,9 +488,11 @@ func TestSweepOnce_StaleTokenRevokeFiresWhenNoContainer(t *testing.T) {
// Third-pass query returns the orphaned workspace.
// Tight regex pins the safety guards: status-filter excludes
// 'removed' and 'provisioning' (R2 + the C1 fix), and the
// staleness predicate appears in the SELECT.
mock.ExpectQuery(`(?s)^\s*SELECT DISTINCT t\.workspace_id::text\s+FROM workspace_auth_tokens.*status NOT IN \('removed', 'provisioning'\).*COALESCE\(t\.last_used_at, t\.created_at\) < now\(\) - make_interval`).
// 'removed' and 'provisioning' (R2 + the C1 fix), runtime filter
// excludes 'external' (2026-05-03 fix — the sweep was incorrectly
// targeting external workspaces which have no container by design),
// and the staleness predicate appears in the SELECT.
mock.ExpectQuery(`(?s)^\s*SELECT DISTINCT t\.workspace_id::text\s+FROM workspace_auth_tokens.*status NOT IN \('removed', 'provisioning'\).*runtime != 'external'.*COALESCE\(t\.last_used_at, t\.created_at\) < now\(\) - make_interval`).
WillReturnRows(sqlmock.NewRows([]string{"workspace_id"}).
AddRow(orphanedID))
@ -544,7 +548,7 @@ func TestSweepOnce_StaleTokenRevokeFailureBailsLoop(t *testing.T) {
// Third-pass returns two stale-token workspaces; the first revoke
// errors. Loop must bail without attempting the second.
mock.ExpectQuery(`(?s)^\s*SELECT DISTINCT t\.workspace_id::text\s+FROM workspace_auth_tokens.*status NOT IN \('removed', 'provisioning'\)`).
mock.ExpectQuery(`(?s)^\s*SELECT DISTINCT t\.workspace_id::text\s+FROM workspace_auth_tokens.*status NOT IN \('removed', 'provisioning'\).*runtime != 'external'`).
WillReturnRows(sqlmock.NewRows([]string{"workspace_id"}).
AddRow("aaaa1111-0000-0000-0000-000000000000").
AddRow("bbbb2222-0000-0000-0000-000000000000"))