diff --git a/workspace-server/internal/provisioner/isrunning_test.go b/workspace-server/internal/provisioner/isrunning_test.go index 0f5c587c..3d217301 100644 --- a/workspace-server/internal/provisioner/isrunning_test.go +++ b/workspace-server/internal/provisioner/isrunning_test.go @@ -49,3 +49,48 @@ func TestIsContainerNotFound(t *testing.T) { }) } } + +// isRemovalInProgress decides whether Stop() treats Docker's "already +// being removed" race as success (the container WILL be gone) versus +// surfacing a 500 to the caller. False negative on cascade-delete +// breaks the UX ("workspace marked removed, but stop call(s) failed — +// please retry" when the workspace is, in fact, removed). False +// positive would silently swallow a different daemon error and skip +// the volume cleanup. Both directions matter — pin the truth table. +func TestIsRemovalInProgress(t *testing.T) { + cases := []struct { + name string + err error + want bool + }{ + {"nil", nil, false}, + {"docker race message", + errors.New(`Error response from daemon: removal of container ws-eeb99b5d-607 is already in progress`), + true}, + {"docker race without ws prefix", + errors.New(`removal of container abc123 is already in progress`), + true}, + // "already in progress" alone is too generic — would false- + // positive on e.g. "image pull is already in progress". Both + // substrings must be present. + {"unrelated already in progress", + errors.New(`image pull is already in progress`), + false}, + {"not-found is NOT removal-in-progress", + errors.New(`Error response from daemon: No such container: ws-abc`), + false}, + {"context deadline", + errors.New("context deadline exceeded"), + false}, + {"empty string", + errors.New(""), + false}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + if got := isRemovalInProgress(tc.err); got != tc.want { + t.Errorf("isRemovalInProgress(%q) = %v, want %v", tc.err, got, tc.want) + } + }) + } +} diff --git a/workspace-server/internal/provisioner/provisioner.go b/workspace-server/internal/provisioner/provisioner.go index e9171d77..dd94b08d 100644 --- a/workspace-server/internal/provisioner/provisioner.go +++ b/workspace-server/internal/provisioner/provisioner.go @@ -983,6 +983,17 @@ func (p *Provisioner) Stop(ctx context.Context, workspaceID string) error { log.Printf("Provisioner: container %s already gone (no-op)", name) return nil } + if isRemovalInProgress(err) { + // Another concurrent caller (orphan sweeper, sibling cascade + // delete, manual `docker rm -f`) is already removing this + // container. The post-condition is the same as success: the + // container WILL be gone shortly. Surfacing this as a 500 on + // cascade-delete causes UI confusion ("workspace marked + // removed, but stop call(s) failed — please retry") even + // though retrying would just race the same in-flight removal. + log.Printf("Provisioner: container %s removal already in progress (no-op)", name) + return nil + } // Real failure: daemon timeout, socket EOF, ctx cancellation, etc. // Caller (workspace_crud.stopAndRemove, orphan_sweeper.sweepOnce) // must propagate this so they can skip the follow-up RemoveVolume. @@ -1048,6 +1059,29 @@ func isContainerNotFound(err error) bool { strings.Contains(s, "not found") } +// isRemovalInProgress detects the race where Docker is already removing +// the container in response to a concurrent call. Symptom observed +// during cascade-delete of a 7-workspace org: two of the seven returned +// +// Error response from daemon: removal of container ws-xxx is already in progress +// +// because the platform's deletion fanout fired Stop() on every workspace +// in parallel and the orphan sweeper happened to also reap two of them +// at the same instant. The post-condition is identical to a successful +// removal — the container WILL be gone — so callers should treat this +// as a no-op rather than a real failure. +// +// String-match for the same reason as isContainerNotFound: docker/docker +// surfaces this as a plain error string, no typed predicate. The CLI +// itself relies on the message text. +func isRemovalInProgress(err error) bool { + if err == nil { + return false + } + return strings.Contains(err.Error(), "removal of container") && + strings.Contains(err.Error(), "already in progress") +} + // DockerClient returns the underlying Docker client for sharing with other handlers. func (p *Provisioner) DockerClient() *client.Client { return p.cli