From cb12601414b69fae306a6802c23c1f495f5113cd Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Sat, 25 Apr 2026 23:32:48 -0700 Subject: [PATCH] fix(platform): make Provisioner.Stop return real errors so cleanup gates fire MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Review caught a critical issue with 12c49183: the headline "skip RemoveVolume when Stop fails" guarantee was dead code. `Provisioner.Stop` unconditionally `return nil`'d after logging the underlying ContainerRemove error, so the new `if err := h.provisioner.Stop(...); err != nil { skip volume }` guard in workspace_crud.go AND the same guard in the orphan sweeper could never fire. RemoveVolume always ran, predictably failing with "volume in use" when Stop hadn't actually killed the container — which is the exact production bug the commit claimed to fix. Now Stop: - returns nil on successful remove (no change) - returns nil when the container is already gone (uses the existing isContainerNotFound helper — that's the cleanup post-condition, not a failure) - returns the wrapped Docker error otherwise (daemon timeout, ctx cancellation, socket EOF — anything that means the container might still be alive) Audited every Provisioner.Stop caller in the tree (team.go, workspace_restart.go ×4, workspace.go) — all of them already discard the return value, so the widened error surface is purely opt-in for the new cleanup paths and breaks no existing behaviour. Other review-driven fixes in this commit: - workspace_crud.go: detached `broadcaster.RecordAndBroadcast` from the request ctx too. RecordAndBroadcast does INSERT INTO structure_events + Redis Publish; if the canvas hangs up, a request-ctx-bound INSERT can be cancelled mid-write and the WORKSPACE_REMOVED event never lands, leaving other WS clients ignorant of the cascade. - orphan_sweeper.go: added isLikelyWorkspaceID guard before turning Docker container prefixes into SQL LIKE patterns. The Docker name filter is a SUBSTRING match (not prefix), so non-workspace containers like `my-ws-tool` slip through; the in-loop HasPrefix in provisioner trims most, but the in-sweeper alphabet check (hex + dashes only) is the second line of defence and also blocks SQL LIKE wildcards (`_`, `%`) from reaching the query. Two new tests pin this — TestSweepOnce_FiltersNonWorkspacePrefixes and TestIsLikelyWorkspaceID with 10 alphabet cases. - provisioner.go: comment added to ListWorkspaceContainerIDPrefixes flagging the substring/HasPrefix relationship as load-bearing. Verified: full Go test suite passes; all 8 sweeper tests pass (2 new for the LIKE-pattern guard); existing dispatch / delete / provisioner tests unchanged. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../internal/handlers/workspace_crud.go | 11 +++- .../internal/provisioner/provisioner.go | 38 +++++++++--- .../internal/registry/orphan_sweeper.go | 41 ++++++++++++- .../internal/registry/orphan_sweeper_test.go | 61 +++++++++++++++++++ 4 files changed, 138 insertions(+), 13 deletions(-) diff --git a/workspace-server/internal/handlers/workspace_crud.go b/workspace-server/internal/handlers/workspace_crud.go index 6d7c79a2..4534c8ea 100644 --- a/workspace-server/internal/handlers/workspace_crud.go +++ b/workspace-server/internal/handlers/workspace_crud.go @@ -426,13 +426,20 @@ func (h *WorkspaceHandler) Delete(c *gin.Context) { for _, descID := range descendantIDs { stopAndRemove(descID) db.ClearWorkspaceKeys(cleanupCtx, descID) - h.broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_REMOVED", descID, map[string]interface{}{}) + // Detach broadcaster ctx for the same reason as the cleanup + // above — RecordAndBroadcast does an INSERT INTO + // structure_events + Redis Publish. If the canvas hangs up, + // a request-ctx-bound INSERT can be cancelled mid-write, + // leaving other WS clients ignorant of the cascade. The DB + // row is already 'removed' so it's recoverable, but the + // inconsistency is avoidable. + h.broadcaster.RecordAndBroadcast(cleanupCtx, "WORKSPACE_REMOVED", descID, map[string]interface{}{}) } stopAndRemove(id) db.ClearWorkspaceKeys(cleanupCtx, id) - h.broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_REMOVED", id, map[string]interface{}{ + h.broadcaster.RecordAndBroadcast(cleanupCtx, "WORKSPACE_REMOVED", id, map[string]interface{}{ "cascade_deleted": len(descendantIDs), }) diff --git a/workspace-server/internal/provisioner/provisioner.go b/workspace-server/internal/provisioner/provisioner.go index 412859b3..a5216789 100644 --- a/workspace-server/internal/provisioner/provisioner.go +++ b/workspace-server/internal/provisioner/provisioner.go @@ -168,6 +168,12 @@ func (p *Provisioner) ListWorkspaceContainerIDPrefixes(ctx context.Context) ([]s // Container names from the API include a leading slash: // "/ws-abc123def456". Strip both the slash and our prefix // to recover the 12-char workspace ID. + // + // The Docker name filter is a SUBSTRING match (not a prefix + // match), so something like "my-ws-thing" would also be + // returned. The HasPrefix check below is load-bearing: + // without it those false positives would flow into the + // orphan sweeper's DB query as bogus LIKE patterns. for _, name := range c.Names { n := strings.TrimPrefix(name, "/") if !strings.HasPrefix(n, containerNamePrefix) { @@ -873,19 +879,35 @@ func (p *Provisioner) RemoveVolume(ctx context.Context, workspaceID string) erro // restart policy: if we ContainerStop first, the restart policy can // respawn the container before ContainerRemove runs, leaving a zombie // that re-registers via heartbeat after deletion. +// +// Returns nil on success AND on "container does not exist" (the cleanup +// goal is achieved either way). Returns the underlying Docker error +// only when the daemon actually failed to remove a live container — +// callers that follow Stop with RemoveVolume MUST check the return +// and skip volume removal on a real error, otherwise the volume +// removal will fail with "volume in use" because the container is +// still alive. func (p *Provisioner) Stop(ctx context.Context, workspaceID string) error { name := ContainerName(workspaceID) // Force-remove kills and removes in one atomic operation, bypassing - // the restart policy entirely. If the container doesn't exist, the - // error is harmless. - if err := p.cli.ContainerRemove(ctx, name, container.RemoveOptions{Force: true}); err != nil { - // Container may already be gone — log but don't fail. - log.Printf("Provisioner: force-remove warning for %s: %v", name, err) + // the restart policy entirely. + err := p.cli.ContainerRemove(ctx, name, container.RemoveOptions{Force: true}) + if err == nil { + log.Printf("Provisioner: stopped and removed container %s", name) + return nil } - - log.Printf("Provisioner: stopped and removed container %s", name) - return nil + if isContainerNotFound(err) { + // Container was already gone — the post-condition we want is + // satisfied. Don't surface as an error. + log.Printf("Provisioner: container %s already gone (no-op)", name) + return nil + } + // Real failure: daemon timeout, socket EOF, ctx cancellation, etc. + // Caller (workspace_crud.stopAndRemove, orphan_sweeper.sweepOnce) + // must propagate this so they can skip the follow-up RemoveVolume. + log.Printf("Provisioner: force-remove failed for %s: %v", name, err) + return fmt.Errorf("force-remove %s: %w", name, err) } // IsRunning checks if a workspace container is currently running. diff --git a/workspace-server/internal/registry/orphan_sweeper.go b/workspace-server/internal/registry/orphan_sweeper.go index 6fbbc0fd..293e5384 100644 --- a/workspace-server/internal/registry/orphan_sweeper.go +++ b/workspace-server/internal/registry/orphan_sweeper.go @@ -36,6 +36,29 @@ type OrphanReaper interface { RemoveVolume(ctx context.Context, workspaceID string) error } +// isLikelyWorkspaceID accepts strings shaped like a UUID prefix — +// hex chars and `-` only. Workspace IDs are full UUIDs and the +// container-name truncation keeps the hex prefix intact, so any +// container name that doesn't match this is by definition not one +// of ours and should be skipped. Also doubles as a SQL LIKE +// wildcard guard (rejects `_` and `%`). +func isLikelyWorkspaceID(s string) bool { + if s == "" { + return false + } + for _, r := range s { + switch { + case r >= '0' && r <= '9': + case r >= 'a' && r <= 'f': + case r >= 'A' && r <= 'F': + case r == '-': + default: + return false + } + } + return true +} + // OrphanSweepInterval is the cadence of the reconcile loop. 60s // matches the heartbeat cadence (30s) × 2 — a single missed cleanup // surfaces within ~90s end-to-end (canvas delete → next sweep tick → @@ -97,13 +120,25 @@ func sweepOnce(parent context.Context, reaper OrphanReaper) { // statistically negligible). Use a single IN-style query so // the cost is one round-trip regardless of leak count. // - // LIKE patterns built from prefixes go through pq.Array → no - // SQL injection vector (prefixes come from Docker, not user - // input, but defence in depth). + // Defence: drop any prefix whose contents fall outside the + // hex-and-dash UUID alphabet. Workspace IDs are UUIDs, so + // container names follow ws-<12 hex chars>. Anything else is + // either a non-workspace container that slipped past the + // substring-match Docker filter (workspace-runner, etc.) or a + // malformed entry — neither should be turned into a LIKE + // pattern. Also blocks SQL LIKE wildcards (`_` and `%`) from + // reaching the query, even though Docker's container-name + // validation would already have rejected them upstream. likes := make([]string, 0, len(prefixes)) for _, p := range prefixes { + if !isLikelyWorkspaceID(p) { + continue + } likes = append(likes, p+"%") } + if len(likes) == 0 { + return + } rows, err := db.DB.QueryContext(ctx, ` SELECT id::text FROM workspaces diff --git a/workspace-server/internal/registry/orphan_sweeper_test.go b/workspace-server/internal/registry/orphan_sweeper_test.go index 99eea4a9..ec17a5c1 100644 --- a/workspace-server/internal/registry/orphan_sweeper_test.go +++ b/workspace-server/internal/registry/orphan_sweeper_test.go @@ -175,6 +175,67 @@ func TestSweepOnce_VolumeRemoveErrorIsNonFatal(t *testing.T) { } } +// TestSweepOnce_FiltersNonWorkspacePrefixes — the Docker name filter +// is a SUBSTRING match so containers like "my-ws-thing" can slip +// through. The HasPrefix check in the provisioner trims those, but +// the in-sweeper isLikelyWorkspaceID guard is the second line of +// defence: anything outside the UUID alphabet (hex + dashes) is +// rejected before being turned into a SQL LIKE pattern. Locks in +// that no DB query fires when every prefix is filtered out. +func TestSweepOnce_FiltersNonWorkspacePrefixes(t *testing.T) { + mock := setupTestDB(t) + setupTestRedis(t) + + reaper := &fakeReaper{ + listResponse: []string{ + "not_a_uuid_at_all", // underscore not in UUID alphabet + "contains%wildcard", // SQL LIKE wildcard — must not reach the query + "contains_wildcard", // SQL LIKE single-char wildcard + "", // empty + "valid-but-non-workspace-name", // dash + lowercase letters that aren't hex + }, + } + + // No DB query expected — every prefix is rejected before the + // query builds, so we short-circuit. sqlmock fails on any + // unexpected query. + sweepOnce(context.Background(), reaper) + + if len(reaper.stopCalls) != 0 { + t.Errorf("Stop must not fire when all prefixes filtered; got %v", reaper.stopCalls) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet sqlmock expectations: %v", err) + } +} + +// TestIsLikelyWorkspaceID — pin the alphabet directly. This is the +// guard that prevents SQL LIKE wildcards (`%`, `_`) from reaching +// the sweeper's query. +func TestIsLikelyWorkspaceID(t *testing.T) { + cases := []struct { + in string + want bool + }{ + {"abc123def456", true}, + {"abcdef-1234-5678-90ab-cdef00112233", true}, + {"ABC123DEF456", true}, // uppercase hex still allowed + {"", false}, + {"abc_123", false}, // underscore (SQL LIKE single-char wildcard) + {"abc%123", false}, // percent (SQL LIKE multi-char wildcard) + {"hello world", false}, // space, non-hex letters + {"valid-but-not", false}, // 'l', 't', 'n' aren't hex + {"abc 123", false}, + {".../escape", false}, + } + for _, tc := range cases { + got := isLikelyWorkspaceID(tc.in) + if got != tc.want { + t.Errorf("isLikelyWorkspaceID(%q) = %v, want %v", tc.in, got, tc.want) + } + } +} + // TestStartOrphanSweeper_NilReaperIsNoOp — tolerance for the // nil-provisioner path used by some test harnesses. func TestStartOrphanSweeper_NilReaperIsNoOp(t *testing.T) {