forked from molecule-ai/molecule-core
fix(orphan-sweeper): exclude runtime='external' from stale-token revoke
The Docker-mode orphan sweeper was incorrectly targeting external runtime workspaces, revoking their auth tokens ~6 minutes after creation (one sweep cycle past the 5-min grace). External workspaces have NO local container by design — their agent runs off-host. The "no live container" predicate the sweep uses to detect wiped-volume orphans matches every external workspace unconditionally, which was killing the only auth credential the off-host agent has. Reproducer: create runtime=external workspace, paste the auth token into molecule-mcp / curl, wait 5 minutes. Next request returns `HTTP 401 — token may be revoked`. Platform log shows `Orphan sweeper: revoking stale tokens for workspace <id> (no live container; volume likely wiped)`. Fix: add `AND w.runtime != 'external'` to the sweep's SELECT. The existing test regexes (third-pass query expectations + the shared expectStaleTokenSweepNoOp helper) are tightened to require the new predicate, so a regression that drops it fails CI immediately. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
350495f032
commit
be271aef8b
@ -413,12 +413,22 @@ func sweepStaleTokensWithoutContainer(ctx context.Context, reaper OrphanReaper)
|
||||
// `"5m0s"` mismatch with Postgres interval grammar; passing seconds
|
||||
// as an int keeps the binding portable.
|
||||
graceSeconds := int(staleTokenGrace.Seconds())
|
||||
// `runtime != 'external'` is load-bearing: external workspaces have NO
|
||||
// local container by design (the agent runs off-host), so the
|
||||
// "no live container" predicate below would match every external
|
||||
// workspace and revoke its token. The token is the off-host agent's
|
||||
// only authentication credential — revoking breaks the entire
|
||||
// external-runtime feature. Discovered 2026-05-03 when a fresh
|
||||
// external workspace had its token silently revoked ~6 minutes after
|
||||
// creation by this sweep, killing the operator's MCP heartbeat and
|
||||
// inbox poll with `HTTP 401 — token may be revoked`.
|
||||
rows, qErr := db.DB.QueryContext(ctx, `
|
||||
SELECT DISTINCT t.workspace_id::text
|
||||
FROM workspace_auth_tokens t
|
||||
JOIN workspaces w ON w.id = t.workspace_id
|
||||
WHERE t.revoked_at IS NULL
|
||||
AND w.status NOT IN ('removed', 'provisioning')
|
||||
AND w.runtime != 'external'
|
||||
AND COALESCE(t.last_used_at, t.created_at) < now() - make_interval(secs => $2)
|
||||
AND (
|
||||
cardinality($1::text[]) = 0
|
||||
|
||||
@ -20,11 +20,13 @@ import (
|
||||
// individual tests don't have to spell out a query they're not actually
|
||||
// asserting against.
|
||||
//
|
||||
// The regex is anchored at the start of the query AND requires the
|
||||
// status-filter to keep us from accidentally matching a future query
|
||||
// that opens with the same column name. R3 from the review.
|
||||
// The regex is anchored at the start of the query AND requires both the
|
||||
// status-filter (R3 from the review) and the runtime-filter (2026-05-03
|
||||
// fix for external workspaces being incorrectly swept), to keep us from
|
||||
// accidentally matching a future query that opens with the same column
|
||||
// name OR a regression that drops one of the load-bearing predicates.
|
||||
func expectStaleTokenSweepNoOp(mock sqlmock.Sqlmock) {
|
||||
mock.ExpectQuery(`(?s)^\s*SELECT DISTINCT t\.workspace_id::text\s+FROM workspace_auth_tokens.*status NOT IN \('removed', 'provisioning'\)`).
|
||||
mock.ExpectQuery(`(?s)^\s*SELECT DISTINCT t\.workspace_id::text\s+FROM workspace_auth_tokens.*status NOT IN \('removed', 'provisioning'\).*runtime != 'external'`).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"workspace_id"}))
|
||||
}
|
||||
|
||||
@ -486,9 +488,11 @@ func TestSweepOnce_StaleTokenRevokeFiresWhenNoContainer(t *testing.T) {
|
||||
|
||||
// Third-pass query returns the orphaned workspace.
|
||||
// Tight regex pins the safety guards: status-filter excludes
|
||||
// 'removed' and 'provisioning' (R2 + the C1 fix), and the
|
||||
// staleness predicate appears in the SELECT.
|
||||
mock.ExpectQuery(`(?s)^\s*SELECT DISTINCT t\.workspace_id::text\s+FROM workspace_auth_tokens.*status NOT IN \('removed', 'provisioning'\).*COALESCE\(t\.last_used_at, t\.created_at\) < now\(\) - make_interval`).
|
||||
// 'removed' and 'provisioning' (R2 + the C1 fix), runtime filter
|
||||
// excludes 'external' (2026-05-03 fix — the sweep was incorrectly
|
||||
// targeting external workspaces which have no container by design),
|
||||
// and the staleness predicate appears in the SELECT.
|
||||
mock.ExpectQuery(`(?s)^\s*SELECT DISTINCT t\.workspace_id::text\s+FROM workspace_auth_tokens.*status NOT IN \('removed', 'provisioning'\).*runtime != 'external'.*COALESCE\(t\.last_used_at, t\.created_at\) < now\(\) - make_interval`).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"workspace_id"}).
|
||||
AddRow(orphanedID))
|
||||
|
||||
@ -544,7 +548,7 @@ func TestSweepOnce_StaleTokenRevokeFailureBailsLoop(t *testing.T) {
|
||||
|
||||
// Third-pass returns two stale-token workspaces; the first revoke
|
||||
// errors. Loop must bail without attempting the second.
|
||||
mock.ExpectQuery(`(?s)^\s*SELECT DISTINCT t\.workspace_id::text\s+FROM workspace_auth_tokens.*status NOT IN \('removed', 'provisioning'\)`).
|
||||
mock.ExpectQuery(`(?s)^\s*SELECT DISTINCT t\.workspace_id::text\s+FROM workspace_auth_tokens.*status NOT IN \('removed', 'provisioning'\).*runtime != 'external'`).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"workspace_id"}).
|
||||
AddRow("aaaa1111-0000-0000-0000-000000000000").
|
||||
AddRow("bbbb2222-0000-0000-0000-000000000000"))
|
||||
|
||||
Loading…
Reference in New Issue
Block a user