fix(platform): make Provisioner.Stop return real errors so cleanup gates fire

Review caught a critical issue with 12c49183: the headline "skip
RemoveVolume when Stop fails" guarantee was dead code. `Provisioner.Stop`
unconditionally `return nil`'d after logging the underlying
ContainerRemove error, so the new `if err := h.provisioner.Stop(...);
err != nil { skip volume }` guard in workspace_crud.go AND the same
guard in the orphan sweeper could never fire. RemoveVolume always
ran, predictably failing with "volume in use" when Stop hadn't
actually killed the container — which is the exact production bug
the commit claimed to fix.

Now Stop:
  - returns nil on successful remove (no change)
  - returns nil when the container is already gone (uses the existing
    isContainerNotFound helper — that's the cleanup post-condition,
    not a failure)
  - returns the wrapped Docker error otherwise (daemon timeout, ctx
    cancellation, socket EOF — anything that means the container
    might still be alive)

Audited every Provisioner.Stop caller in the tree (team.go,
workspace_restart.go ×4, workspace.go) — all of them already
discard the return value, so the widened error surface is purely
opt-in for the new cleanup paths and breaks no existing behaviour.

Other review-driven fixes in this commit:

- workspace_crud.go: detached `broadcaster.RecordAndBroadcast` from
  the request ctx too. RecordAndBroadcast does INSERT INTO
  structure_events + Redis Publish; if the canvas hangs up, a
  request-ctx-bound INSERT can be cancelled mid-write and the
  WORKSPACE_REMOVED event never lands, leaving other WS clients
  ignorant of the cascade.

- orphan_sweeper.go: added isLikelyWorkspaceID guard before turning
  Docker container prefixes into SQL LIKE patterns. The Docker
  name filter is a SUBSTRING match (not prefix), so non-workspace
  containers like `my-ws-tool` slip through; the in-loop HasPrefix
  in provisioner trims most, but the in-sweeper alphabet check
  (hex + dashes only) is the second line of defence and also
  blocks SQL LIKE wildcards (`_`, `%`) from reaching the query.
  Two new tests pin this — TestSweepOnce_FiltersNonWorkspacePrefixes
  and TestIsLikelyWorkspaceID with 10 alphabet cases.

- provisioner.go: comment added to ListWorkspaceContainerIDPrefixes
  flagging the substring/HasPrefix relationship as load-bearing.

Verified: full Go test suite passes; all 8 sweeper tests pass
(2 new for the LIKE-pattern guard); existing dispatch / delete /
provisioner tests unchanged.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Hongming Wang 2026-04-25 23:32:48 -07:00
parent 12c4918318
commit cb12601414
4 changed files with 138 additions and 13 deletions

View File

@ -426,13 +426,20 @@ func (h *WorkspaceHandler) Delete(c *gin.Context) {
for _, descID := range descendantIDs {
stopAndRemove(descID)
db.ClearWorkspaceKeys(cleanupCtx, descID)
h.broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_REMOVED", descID, map[string]interface{}{})
// Detach broadcaster ctx for the same reason as the cleanup
// above — RecordAndBroadcast does an INSERT INTO
// structure_events + Redis Publish. If the canvas hangs up,
// a request-ctx-bound INSERT can be cancelled mid-write,
// leaving other WS clients ignorant of the cascade. The DB
// row is already 'removed' so it's recoverable, but the
// inconsistency is avoidable.
h.broadcaster.RecordAndBroadcast(cleanupCtx, "WORKSPACE_REMOVED", descID, map[string]interface{}{})
}
stopAndRemove(id)
db.ClearWorkspaceKeys(cleanupCtx, id)
h.broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_REMOVED", id, map[string]interface{}{
h.broadcaster.RecordAndBroadcast(cleanupCtx, "WORKSPACE_REMOVED", id, map[string]interface{}{
"cascade_deleted": len(descendantIDs),
})

View File

@ -168,6 +168,12 @@ func (p *Provisioner) ListWorkspaceContainerIDPrefixes(ctx context.Context) ([]s
// Container names from the API include a leading slash:
// "/ws-abc123def456". Strip both the slash and our prefix
// to recover the 12-char workspace ID.
//
// The Docker name filter is a SUBSTRING match (not a prefix
// match), so something like "my-ws-thing" would also be
// returned. The HasPrefix check below is load-bearing:
// without it those false positives would flow into the
// orphan sweeper's DB query as bogus LIKE patterns.
for _, name := range c.Names {
n := strings.TrimPrefix(name, "/")
if !strings.HasPrefix(n, containerNamePrefix) {
@ -873,19 +879,35 @@ func (p *Provisioner) RemoveVolume(ctx context.Context, workspaceID string) erro
// restart policy: if we ContainerStop first, the restart policy can
// respawn the container before ContainerRemove runs, leaving a zombie
// that re-registers via heartbeat after deletion.
//
// Returns nil on success AND on "container does not exist" (the cleanup
// goal is achieved either way). Returns the underlying Docker error
// only when the daemon actually failed to remove a live container —
// callers that follow Stop with RemoveVolume MUST check the return
// and skip volume removal on a real error, otherwise the volume
// removal will fail with "volume in use" because the container is
// still alive.
func (p *Provisioner) Stop(ctx context.Context, workspaceID string) error {
name := ContainerName(workspaceID)
// Force-remove kills and removes in one atomic operation, bypassing
// the restart policy entirely. If the container doesn't exist, the
// error is harmless.
if err := p.cli.ContainerRemove(ctx, name, container.RemoveOptions{Force: true}); err != nil {
// Container may already be gone — log but don't fail.
log.Printf("Provisioner: force-remove warning for %s: %v", name, err)
// the restart policy entirely.
err := p.cli.ContainerRemove(ctx, name, container.RemoveOptions{Force: true})
if err == nil {
log.Printf("Provisioner: stopped and removed container %s", name)
return nil
}
log.Printf("Provisioner: stopped and removed container %s", name)
return nil
if isContainerNotFound(err) {
// Container was already gone — the post-condition we want is
// satisfied. Don't surface as an error.
log.Printf("Provisioner: container %s already gone (no-op)", name)
return nil
}
// Real failure: daemon timeout, socket EOF, ctx cancellation, etc.
// Caller (workspace_crud.stopAndRemove, orphan_sweeper.sweepOnce)
// must propagate this so they can skip the follow-up RemoveVolume.
log.Printf("Provisioner: force-remove failed for %s: %v", name, err)
return fmt.Errorf("force-remove %s: %w", name, err)
}
// IsRunning checks if a workspace container is currently running.

View File

@ -36,6 +36,29 @@ type OrphanReaper interface {
RemoveVolume(ctx context.Context, workspaceID string) error
}
// isLikelyWorkspaceID accepts strings shaped like a UUID prefix —
// hex chars and `-` only. Workspace IDs are full UUIDs and the
// container-name truncation keeps the hex prefix intact, so any
// container name that doesn't match this is by definition not one
// of ours and should be skipped. Also doubles as a SQL LIKE
// wildcard guard (rejects `_` and `%`).
func isLikelyWorkspaceID(s string) bool {
if s == "" {
return false
}
for _, r := range s {
switch {
case r >= '0' && r <= '9':
case r >= 'a' && r <= 'f':
case r >= 'A' && r <= 'F':
case r == '-':
default:
return false
}
}
return true
}
// OrphanSweepInterval is the cadence of the reconcile loop. 60s
// matches the heartbeat cadence (30s) × 2 — a single missed cleanup
// surfaces within ~90s end-to-end (canvas delete → next sweep tick →
@ -97,13 +120,25 @@ func sweepOnce(parent context.Context, reaper OrphanReaper) {
// statistically negligible). Use a single IN-style query so
// the cost is one round-trip regardless of leak count.
//
// LIKE patterns built from prefixes go through pq.Array → no
// SQL injection vector (prefixes come from Docker, not user
// input, but defence in depth).
// Defence: drop any prefix whose contents fall outside the
// hex-and-dash UUID alphabet. Workspace IDs are UUIDs, so
// container names follow ws-<12 hex chars>. Anything else is
// either a non-workspace container that slipped past the
// substring-match Docker filter (workspace-runner, etc.) or a
// malformed entry — neither should be turned into a LIKE
// pattern. Also blocks SQL LIKE wildcards (`_` and `%`) from
// reaching the query, even though Docker's container-name
// validation would already have rejected them upstream.
likes := make([]string, 0, len(prefixes))
for _, p := range prefixes {
if !isLikelyWorkspaceID(p) {
continue
}
likes = append(likes, p+"%")
}
if len(likes) == 0 {
return
}
rows, err := db.DB.QueryContext(ctx, `
SELECT id::text
FROM workspaces

View File

@ -175,6 +175,67 @@ func TestSweepOnce_VolumeRemoveErrorIsNonFatal(t *testing.T) {
}
}
// TestSweepOnce_FiltersNonWorkspacePrefixes — the Docker name filter
// is a SUBSTRING match so containers like "my-ws-thing" can slip
// through. The HasPrefix check in the provisioner trims those, but
// the in-sweeper isLikelyWorkspaceID guard is the second line of
// defence: anything outside the UUID alphabet (hex + dashes) is
// rejected before being turned into a SQL LIKE pattern. Locks in
// that no DB query fires when every prefix is filtered out.
func TestSweepOnce_FiltersNonWorkspacePrefixes(t *testing.T) {
mock := setupTestDB(t)
setupTestRedis(t)
reaper := &fakeReaper{
listResponse: []string{
"not_a_uuid_at_all", // underscore not in UUID alphabet
"contains%wildcard", // SQL LIKE wildcard — must not reach the query
"contains_wildcard", // SQL LIKE single-char wildcard
"", // empty
"valid-but-non-workspace-name", // dash + lowercase letters that aren't hex
},
}
// No DB query expected — every prefix is rejected before the
// query builds, so we short-circuit. sqlmock fails on any
// unexpected query.
sweepOnce(context.Background(), reaper)
if len(reaper.stopCalls) != 0 {
t.Errorf("Stop must not fire when all prefixes filtered; got %v", reaper.stopCalls)
}
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("unmet sqlmock expectations: %v", err)
}
}
// TestIsLikelyWorkspaceID — pin the alphabet directly. This is the
// guard that prevents SQL LIKE wildcards (`%`, `_`) from reaching
// the sweeper's query.
func TestIsLikelyWorkspaceID(t *testing.T) {
cases := []struct {
in string
want bool
}{
{"abc123def456", true},
{"abcdef-1234-5678-90ab-cdef00112233", true},
{"ABC123DEF456", true}, // uppercase hex still allowed
{"", false},
{"abc_123", false}, // underscore (SQL LIKE single-char wildcard)
{"abc%123", false}, // percent (SQL LIKE multi-char wildcard)
{"hello world", false}, // space, non-hex letters
{"valid-but-not", false}, // 'l', 't', 'n' aren't hex
{"abc 123", false},
{".../escape", false},
}
for _, tc := range cases {
got := isLikelyWorkspaceID(tc.in)
if got != tc.want {
t.Errorf("isLikelyWorkspaceID(%q) = %v, want %v", tc.in, got, tc.want)
}
}
}
// TestStartOrphanSweeper_NilReaperIsNoOp — tolerance for the
// nil-provisioner path used by some test harnesses.
func TestStartOrphanSweeper_NilReaperIsNoOp(t *testing.T) {