fix(platform): make Provisioner.Stop return real errors so cleanup gates fire
Review caught a critical issue with 12c49183: the headline "skip
RemoveVolume when Stop fails" guarantee was dead code. `Provisioner.Stop`
unconditionally `return nil`'d after logging the underlying
ContainerRemove error, so the new `if err := h.provisioner.Stop(...);
err != nil { skip volume }` guard in workspace_crud.go AND the same
guard in the orphan sweeper could never fire. RemoveVolume always
ran, predictably failing with "volume in use" when Stop hadn't
actually killed the container — which is the exact production bug
the commit claimed to fix.
Now Stop:
- returns nil on successful remove (no change)
- returns nil when the container is already gone (uses the existing
isContainerNotFound helper — that's the cleanup post-condition,
not a failure)
- returns the wrapped Docker error otherwise (daemon timeout, ctx
cancellation, socket EOF — anything that means the container
might still be alive)
Audited every Provisioner.Stop caller in the tree (team.go,
workspace_restart.go ×4, workspace.go) — all of them already
discard the return value, so the widened error surface is purely
opt-in for the new cleanup paths and breaks no existing behaviour.
Other review-driven fixes in this commit:
- workspace_crud.go: detached `broadcaster.RecordAndBroadcast` from
the request ctx too. RecordAndBroadcast does INSERT INTO
structure_events + Redis Publish; if the canvas hangs up, a
request-ctx-bound INSERT can be cancelled mid-write and the
WORKSPACE_REMOVED event never lands, leaving other WS clients
ignorant of the cascade.
- orphan_sweeper.go: added isLikelyWorkspaceID guard before turning
Docker container prefixes into SQL LIKE patterns. The Docker
name filter is a SUBSTRING match (not prefix), so non-workspace
containers like `my-ws-tool` slip through; the in-loop HasPrefix
in provisioner trims most, but the in-sweeper alphabet check
(hex + dashes only) is the second line of defence and also
blocks SQL LIKE wildcards (`_`, `%`) from reaching the query.
Two new tests pin this — TestSweepOnce_FiltersNonWorkspacePrefixes
and TestIsLikelyWorkspaceID with 10 alphabet cases.
- provisioner.go: comment added to ListWorkspaceContainerIDPrefixes
flagging the substring/HasPrefix relationship as load-bearing.
Verified: full Go test suite passes; all 8 sweeper tests pass
(2 new for the LIKE-pattern guard); existing dispatch / delete /
provisioner tests unchanged.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
12c4918318
commit
cb12601414
@ -426,13 +426,20 @@ func (h *WorkspaceHandler) Delete(c *gin.Context) {
|
||||
for _, descID := range descendantIDs {
|
||||
stopAndRemove(descID)
|
||||
db.ClearWorkspaceKeys(cleanupCtx, descID)
|
||||
h.broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_REMOVED", descID, map[string]interface{}{})
|
||||
// Detach broadcaster ctx for the same reason as the cleanup
|
||||
// above — RecordAndBroadcast does an INSERT INTO
|
||||
// structure_events + Redis Publish. If the canvas hangs up,
|
||||
// a request-ctx-bound INSERT can be cancelled mid-write,
|
||||
// leaving other WS clients ignorant of the cascade. The DB
|
||||
// row is already 'removed' so it's recoverable, but the
|
||||
// inconsistency is avoidable.
|
||||
h.broadcaster.RecordAndBroadcast(cleanupCtx, "WORKSPACE_REMOVED", descID, map[string]interface{}{})
|
||||
}
|
||||
|
||||
stopAndRemove(id)
|
||||
db.ClearWorkspaceKeys(cleanupCtx, id)
|
||||
|
||||
h.broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_REMOVED", id, map[string]interface{}{
|
||||
h.broadcaster.RecordAndBroadcast(cleanupCtx, "WORKSPACE_REMOVED", id, map[string]interface{}{
|
||||
"cascade_deleted": len(descendantIDs),
|
||||
})
|
||||
|
||||
|
||||
@ -168,6 +168,12 @@ func (p *Provisioner) ListWorkspaceContainerIDPrefixes(ctx context.Context) ([]s
|
||||
// Container names from the API include a leading slash:
|
||||
// "/ws-abc123def456". Strip both the slash and our prefix
|
||||
// to recover the 12-char workspace ID.
|
||||
//
|
||||
// The Docker name filter is a SUBSTRING match (not a prefix
|
||||
// match), so something like "my-ws-thing" would also be
|
||||
// returned. The HasPrefix check below is load-bearing:
|
||||
// without it those false positives would flow into the
|
||||
// orphan sweeper's DB query as bogus LIKE patterns.
|
||||
for _, name := range c.Names {
|
||||
n := strings.TrimPrefix(name, "/")
|
||||
if !strings.HasPrefix(n, containerNamePrefix) {
|
||||
@ -873,19 +879,35 @@ func (p *Provisioner) RemoveVolume(ctx context.Context, workspaceID string) erro
|
||||
// restart policy: if we ContainerStop first, the restart policy can
|
||||
// respawn the container before ContainerRemove runs, leaving a zombie
|
||||
// that re-registers via heartbeat after deletion.
|
||||
//
|
||||
// Returns nil on success AND on "container does not exist" (the cleanup
|
||||
// goal is achieved either way). Returns the underlying Docker error
|
||||
// only when the daemon actually failed to remove a live container —
|
||||
// callers that follow Stop with RemoveVolume MUST check the return
|
||||
// and skip volume removal on a real error, otherwise the volume
|
||||
// removal will fail with "volume in use" because the container is
|
||||
// still alive.
|
||||
func (p *Provisioner) Stop(ctx context.Context, workspaceID string) error {
|
||||
name := ContainerName(workspaceID)
|
||||
|
||||
// Force-remove kills and removes in one atomic operation, bypassing
|
||||
// the restart policy entirely. If the container doesn't exist, the
|
||||
// error is harmless.
|
||||
if err := p.cli.ContainerRemove(ctx, name, container.RemoveOptions{Force: true}); err != nil {
|
||||
// Container may already be gone — log but don't fail.
|
||||
log.Printf("Provisioner: force-remove warning for %s: %v", name, err)
|
||||
// the restart policy entirely.
|
||||
err := p.cli.ContainerRemove(ctx, name, container.RemoveOptions{Force: true})
|
||||
if err == nil {
|
||||
log.Printf("Provisioner: stopped and removed container %s", name)
|
||||
return nil
|
||||
}
|
||||
|
||||
log.Printf("Provisioner: stopped and removed container %s", name)
|
||||
return nil
|
||||
if isContainerNotFound(err) {
|
||||
// Container was already gone — the post-condition we want is
|
||||
// satisfied. Don't surface as an error.
|
||||
log.Printf("Provisioner: container %s already gone (no-op)", name)
|
||||
return nil
|
||||
}
|
||||
// Real failure: daemon timeout, socket EOF, ctx cancellation, etc.
|
||||
// Caller (workspace_crud.stopAndRemove, orphan_sweeper.sweepOnce)
|
||||
// must propagate this so they can skip the follow-up RemoveVolume.
|
||||
log.Printf("Provisioner: force-remove failed for %s: %v", name, err)
|
||||
return fmt.Errorf("force-remove %s: %w", name, err)
|
||||
}
|
||||
|
||||
// IsRunning checks if a workspace container is currently running.
|
||||
|
||||
@ -36,6 +36,29 @@ type OrphanReaper interface {
|
||||
RemoveVolume(ctx context.Context, workspaceID string) error
|
||||
}
|
||||
|
||||
// isLikelyWorkspaceID accepts strings shaped like a UUID prefix —
|
||||
// hex chars and `-` only. Workspace IDs are full UUIDs and the
|
||||
// container-name truncation keeps the hex prefix intact, so any
|
||||
// container name that doesn't match this is by definition not one
|
||||
// of ours and should be skipped. Also doubles as a SQL LIKE
|
||||
// wildcard guard (rejects `_` and `%`).
|
||||
func isLikelyWorkspaceID(s string) bool {
|
||||
if s == "" {
|
||||
return false
|
||||
}
|
||||
for _, r := range s {
|
||||
switch {
|
||||
case r >= '0' && r <= '9':
|
||||
case r >= 'a' && r <= 'f':
|
||||
case r >= 'A' && r <= 'F':
|
||||
case r == '-':
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// OrphanSweepInterval is the cadence of the reconcile loop. 60s
|
||||
// matches the heartbeat cadence (30s) × 2 — a single missed cleanup
|
||||
// surfaces within ~90s end-to-end (canvas delete → next sweep tick →
|
||||
@ -97,13 +120,25 @@ func sweepOnce(parent context.Context, reaper OrphanReaper) {
|
||||
// statistically negligible). Use a single IN-style query so
|
||||
// the cost is one round-trip regardless of leak count.
|
||||
//
|
||||
// LIKE patterns built from prefixes go through pq.Array → no
|
||||
// SQL injection vector (prefixes come from Docker, not user
|
||||
// input, but defence in depth).
|
||||
// Defence: drop any prefix whose contents fall outside the
|
||||
// hex-and-dash UUID alphabet. Workspace IDs are UUIDs, so
|
||||
// container names follow ws-<12 hex chars>. Anything else is
|
||||
// either a non-workspace container that slipped past the
|
||||
// substring-match Docker filter (workspace-runner, etc.) or a
|
||||
// malformed entry — neither should be turned into a LIKE
|
||||
// pattern. Also blocks SQL LIKE wildcards (`_` and `%`) from
|
||||
// reaching the query, even though Docker's container-name
|
||||
// validation would already have rejected them upstream.
|
||||
likes := make([]string, 0, len(prefixes))
|
||||
for _, p := range prefixes {
|
||||
if !isLikelyWorkspaceID(p) {
|
||||
continue
|
||||
}
|
||||
likes = append(likes, p+"%")
|
||||
}
|
||||
if len(likes) == 0 {
|
||||
return
|
||||
}
|
||||
rows, err := db.DB.QueryContext(ctx, `
|
||||
SELECT id::text
|
||||
FROM workspaces
|
||||
|
||||
@ -175,6 +175,67 @@ func TestSweepOnce_VolumeRemoveErrorIsNonFatal(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestSweepOnce_FiltersNonWorkspacePrefixes — the Docker name filter
|
||||
// is a SUBSTRING match so containers like "my-ws-thing" can slip
|
||||
// through. The HasPrefix check in the provisioner trims those, but
|
||||
// the in-sweeper isLikelyWorkspaceID guard is the second line of
|
||||
// defence: anything outside the UUID alphabet (hex + dashes) is
|
||||
// rejected before being turned into a SQL LIKE pattern. Locks in
|
||||
// that no DB query fires when every prefix is filtered out.
|
||||
func TestSweepOnce_FiltersNonWorkspacePrefixes(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
setupTestRedis(t)
|
||||
|
||||
reaper := &fakeReaper{
|
||||
listResponse: []string{
|
||||
"not_a_uuid_at_all", // underscore not in UUID alphabet
|
||||
"contains%wildcard", // SQL LIKE wildcard — must not reach the query
|
||||
"contains_wildcard", // SQL LIKE single-char wildcard
|
||||
"", // empty
|
||||
"valid-but-non-workspace-name", // dash + lowercase letters that aren't hex
|
||||
},
|
||||
}
|
||||
|
||||
// No DB query expected — every prefix is rejected before the
|
||||
// query builds, so we short-circuit. sqlmock fails on any
|
||||
// unexpected query.
|
||||
sweepOnce(context.Background(), reaper)
|
||||
|
||||
if len(reaper.stopCalls) != 0 {
|
||||
t.Errorf("Stop must not fire when all prefixes filtered; got %v", reaper.stopCalls)
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Errorf("unmet sqlmock expectations: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestIsLikelyWorkspaceID — pin the alphabet directly. This is the
|
||||
// guard that prevents SQL LIKE wildcards (`%`, `_`) from reaching
|
||||
// the sweeper's query.
|
||||
func TestIsLikelyWorkspaceID(t *testing.T) {
|
||||
cases := []struct {
|
||||
in string
|
||||
want bool
|
||||
}{
|
||||
{"abc123def456", true},
|
||||
{"abcdef-1234-5678-90ab-cdef00112233", true},
|
||||
{"ABC123DEF456", true}, // uppercase hex still allowed
|
||||
{"", false},
|
||||
{"abc_123", false}, // underscore (SQL LIKE single-char wildcard)
|
||||
{"abc%123", false}, // percent (SQL LIKE multi-char wildcard)
|
||||
{"hello world", false}, // space, non-hex letters
|
||||
{"valid-but-not", false}, // 'l', 't', 'n' aren't hex
|
||||
{"abc 123", false},
|
||||
{".../escape", false},
|
||||
}
|
||||
for _, tc := range cases {
|
||||
got := isLikelyWorkspaceID(tc.in)
|
||||
if got != tc.want {
|
||||
t.Errorf("isLikelyWorkspaceID(%q) = %v, want %v", tc.in, got, tc.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestStartOrphanSweeper_NilReaperIsNoOp — tolerance for the
|
||||
// nil-provisioner path used by some test harnesses.
|
||||
func TestStartOrphanSweeper_NilReaperIsNoOp(t *testing.T) {
|
||||
|
||||
Loading…
Reference in New Issue
Block a user