From be271aef8b44bd22688e788d7c0e9d237be2958c Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Sun, 3 May 2026 00:49:37 -0700 Subject: [PATCH] fix(orphan-sweeper): exclude runtime='external' from stale-token revoke MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Docker-mode orphan sweeper was incorrectly targeting external runtime workspaces, revoking their auth tokens ~6 minutes after creation (one sweep cycle past the 5-min grace). External workspaces have NO local container by design — their agent runs off-host. The "no live container" predicate the sweep uses to detect wiped-volume orphans matches every external workspace unconditionally, which was killing the only auth credential the off-host agent has. Reproducer: create runtime=external workspace, paste the auth token into molecule-mcp / curl, wait 5 minutes. Next request returns `HTTP 401 — token may be revoked`. Platform log shows `Orphan sweeper: revoking stale tokens for workspace (no live container; volume likely wiped)`. Fix: add `AND w.runtime != 'external'` to the sweep's SELECT. The existing test regexes (third-pass query expectations + the shared expectStaleTokenSweepNoOp helper) are tightened to require the new predicate, so a regression that drops it fails CI immediately. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../internal/registry/orphan_sweeper.go | 10 ++++++++++ .../internal/registry/orphan_sweeper_test.go | 20 +++++++++++-------- 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/workspace-server/internal/registry/orphan_sweeper.go b/workspace-server/internal/registry/orphan_sweeper.go index 85c07d96..578e29b5 100644 --- a/workspace-server/internal/registry/orphan_sweeper.go +++ b/workspace-server/internal/registry/orphan_sweeper.go @@ -413,12 +413,22 @@ func sweepStaleTokensWithoutContainer(ctx context.Context, reaper OrphanReaper) // `"5m0s"` mismatch with Postgres interval grammar; passing seconds // as an int keeps the binding portable. graceSeconds := int(staleTokenGrace.Seconds()) + // `runtime != 'external'` is load-bearing: external workspaces have NO + // local container by design (the agent runs off-host), so the + // "no live container" predicate below would match every external + // workspace and revoke its token. The token is the off-host agent's + // only authentication credential — revoking breaks the entire + // external-runtime feature. Discovered 2026-05-03 when a fresh + // external workspace had its token silently revoked ~6 minutes after + // creation by this sweep, killing the operator's MCP heartbeat and + // inbox poll with `HTTP 401 — token may be revoked`. rows, qErr := db.DB.QueryContext(ctx, ` SELECT DISTINCT t.workspace_id::text FROM workspace_auth_tokens t JOIN workspaces w ON w.id = t.workspace_id WHERE t.revoked_at IS NULL AND w.status NOT IN ('removed', 'provisioning') + AND w.runtime != 'external' AND COALESCE(t.last_used_at, t.created_at) < now() - make_interval(secs => $2) AND ( cardinality($1::text[]) = 0 diff --git a/workspace-server/internal/registry/orphan_sweeper_test.go b/workspace-server/internal/registry/orphan_sweeper_test.go index 9ce0c292..8a3136f5 100644 --- a/workspace-server/internal/registry/orphan_sweeper_test.go +++ b/workspace-server/internal/registry/orphan_sweeper_test.go @@ -20,11 +20,13 @@ import ( // individual tests don't have to spell out a query they're not actually // asserting against. // -// The regex is anchored at the start of the query AND requires the -// status-filter to keep us from accidentally matching a future query -// that opens with the same column name. R3 from the review. +// The regex is anchored at the start of the query AND requires both the +// status-filter (R3 from the review) and the runtime-filter (2026-05-03 +// fix for external workspaces being incorrectly swept), to keep us from +// accidentally matching a future query that opens with the same column +// name OR a regression that drops one of the load-bearing predicates. func expectStaleTokenSweepNoOp(mock sqlmock.Sqlmock) { - mock.ExpectQuery(`(?s)^\s*SELECT DISTINCT t\.workspace_id::text\s+FROM workspace_auth_tokens.*status NOT IN \('removed', 'provisioning'\)`). + mock.ExpectQuery(`(?s)^\s*SELECT DISTINCT t\.workspace_id::text\s+FROM workspace_auth_tokens.*status NOT IN \('removed', 'provisioning'\).*runtime != 'external'`). WillReturnRows(sqlmock.NewRows([]string{"workspace_id"})) } @@ -486,9 +488,11 @@ func TestSweepOnce_StaleTokenRevokeFiresWhenNoContainer(t *testing.T) { // Third-pass query returns the orphaned workspace. // Tight regex pins the safety guards: status-filter excludes - // 'removed' and 'provisioning' (R2 + the C1 fix), and the - // staleness predicate appears in the SELECT. - mock.ExpectQuery(`(?s)^\s*SELECT DISTINCT t\.workspace_id::text\s+FROM workspace_auth_tokens.*status NOT IN \('removed', 'provisioning'\).*COALESCE\(t\.last_used_at, t\.created_at\) < now\(\) - make_interval`). + // 'removed' and 'provisioning' (R2 + the C1 fix), runtime filter + // excludes 'external' (2026-05-03 fix — the sweep was incorrectly + // targeting external workspaces which have no container by design), + // and the staleness predicate appears in the SELECT. + mock.ExpectQuery(`(?s)^\s*SELECT DISTINCT t\.workspace_id::text\s+FROM workspace_auth_tokens.*status NOT IN \('removed', 'provisioning'\).*runtime != 'external'.*COALESCE\(t\.last_used_at, t\.created_at\) < now\(\) - make_interval`). WillReturnRows(sqlmock.NewRows([]string{"workspace_id"}). AddRow(orphanedID)) @@ -544,7 +548,7 @@ func TestSweepOnce_StaleTokenRevokeFailureBailsLoop(t *testing.T) { // Third-pass returns two stale-token workspaces; the first revoke // errors. Loop must bail without attempting the second. - mock.ExpectQuery(`(?s)^\s*SELECT DISTINCT t\.workspace_id::text\s+FROM workspace_auth_tokens.*status NOT IN \('removed', 'provisioning'\)`). + mock.ExpectQuery(`(?s)^\s*SELECT DISTINCT t\.workspace_id::text\s+FROM workspace_auth_tokens.*status NOT IN \('removed', 'provisioning'\).*runtime != 'external'`). WillReturnRows(sqlmock.NewRows([]string{"workspace_id"}). AddRow("aaaa1111-0000-0000-0000-000000000000"). AddRow("bbbb2222-0000-0000-0000-000000000000"))