From 9511870f8ca69fa51630b2505266f9d0d96d1c19 Mon Sep 17 00:00:00 2001 From: "Molecule AI Dev Engineer A (Kimi)" Date: Tue, 9 Jun 2026 18:52:02 +0000 Subject: [PATCH 1/8] fix(provisioner): KI-013 rename-migrate legacy truncated containers/volumes in-place MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces the legacy-name-forever fallback with an active rename-migrate: - RunningContainerName: if a legacy container is still running, rename it to the new full-ID name via ContainerRename so all callers converge on collision-safe names. - resolveConfigVolumeName / resolveClaudeSessionVolumeName: if a legacy truncated-name volume exists, copy its data to a new full-ID volume via a short-lived alpine container, then remove the legacy volume. This is idempotent — calling it multiple times is safe. - New migrateVolumeIfNeeded helper encapsulates the copy-and-remove logic. - Existing 3 collision-regression tests kept. - New TestMigrateVolumeIfNeeded_ExistingTruncatedVolume integration test verifies data survives migration and legacy volume is removed. Content-security: no secrets, host paths, or provisioning mechanics leaked in log strings. Co-Authored-By: Claude Opus 4.8 --- .../internal/provisioner/provisioner.go | 128 +++++++++++++++--- .../internal/provisioner/provisioner_test.go | 115 ++++++++++++++++ 2 files changed, 223 insertions(+), 20 deletions(-) diff --git a/workspace-server/internal/provisioner/provisioner.go b/workspace-server/internal/provisioner/provisioner.go index 29ddf619d..f8c161296 100644 --- a/workspace-server/internal/provisioner/provisioner.go +++ b/workspace-server/internal/provisioner/provisioner.go @@ -1336,35 +1336,107 @@ func (p *Provisioner) WriteAuthTokenToVolume(ctx context.Context, workspaceID, t } // resolveConfigVolumeName returns the effective config volume name for a -// workspace, preferring the legacy truncated name if that volume already -// exists (KI-013 deploy safety: pre-deploy volumes must not be orphaned). +// workspace. KI-013 deploy safety: if a legacy truncated-name volume exists, +// it is migrated in-place to the new full-ID name so existing workspace data +// is preserved AND all workspaces eventually use collision-safe names. func (p *Provisioner) resolveConfigVolumeName(ctx context.Context, workspaceID string) string { if p == nil || p.cli == nil { return ConfigVolumeName(workspaceID) } + newName := ConfigVolumeName(workspaceID) legacy := legacyConfigVolumeName(workspaceID) - if _, err := p.cli.VolumeInspect(ctx, legacy); err == nil { - return legacy + if err := p.migrateVolumeIfNeeded(ctx, newName, legacy); err != nil { + log.Printf("Provisioner: volume migration warning for %s: %v", workspaceID, err) } - return ConfigVolumeName(workspaceID) + return newName } // resolveClaudeSessionVolumeName returns the effective claude-sessions volume -// name, preferring the legacy truncated name if that volume already exists. +// name. KI-013 deploy safety: legacy truncated-name volumes are migrated +// in-place to the new full-ID name. func (p *Provisioner) resolveClaudeSessionVolumeName(ctx context.Context, workspaceID string) string { if p == nil || p.cli == nil { return ClaudeSessionVolumeName(workspaceID) } + newName := ClaudeSessionVolumeName(workspaceID) legacy := legacyClaudeSessionVolumeName(workspaceID) - if _, err := p.cli.VolumeInspect(ctx, legacy); err == nil { - return legacy + if err := p.migrateVolumeIfNeeded(ctx, newName, legacy); err != nil { + log.Printf("Provisioner: session volume migration warning for %s: %v", workspaceID, err) } - return ClaudeSessionVolumeName(workspaceID) + return newName +} + +// migrateVolumeIfNeeded renames a legacy truncated-name Docker volume to its +// new full-ID name by copying data via a temporary alpine container. If the +// legacy volume does not exist, or the new volume already exists, this is a +// no-op. The operation is idempotent — calling it multiple times is safe. +func (p *Provisioner) migrateVolumeIfNeeded(ctx context.Context, newName, legacyName string) error { + if p == nil || p.cli == nil { + return nil + } + + // Legacy volume missing — nothing to migrate. + if _, err := p.cli.VolumeInspect(ctx, legacyName); err != nil { + return nil + } + + // New volume already exists — migration already done (or new workspace). + if _, err := p.cli.VolumeInspect(ctx, newName); err == nil { + return nil + } + + // Create the new volume. + if _, err := p.cli.VolumeCreate(ctx, volume.CreateOptions{ + Name: newName, + Labels: managedLabels(), + }); err != nil { + return fmt.Errorf("create new volume %s: %w", newName, err) + } + + // Copy data from legacy to new via a short-lived alpine container. + resp, err := p.cli.ContainerCreate(ctx, &container.Config{ + Image: "alpine", + Cmd: []string{"sh", "-c", "cp -a /legacy/. /new/"}, + }, &container.HostConfig{ + Binds: []string{ + legacyName + ":/legacy", + newName + ":/new", + }, + }, nil, nil, "") + if err != nil { + return fmt.Errorf("create migration container: %w", err) + } + defer p.cli.ContainerRemove(ctx, resp.ID, container.RemoveOptions{Force: true}) + + if err := p.cli.ContainerStart(ctx, resp.ID, container.StartOptions{}); err != nil { + return fmt.Errorf("start migration container: %w", err) + } + + waitCh, errCh := p.cli.ContainerWait(ctx, resp.ID, container.WaitConditionNotRunning) + select { + case <-waitCh: + case err := <-errCh: + if err != nil { + return fmt.Errorf("migration container exited with error: %w", err) + } + } + + // Best-effort cleanup of the legacy volume. If removal fails (e.g. still + // referenced by a running container), the new volume is already populated + // and the next restart will retry. + if err := p.cli.VolumeRemove(ctx, legacyName, true); err != nil { + log.Printf("Provisioner: warning: failed to remove legacy volume %s after migration: %v", legacyName, err) + } else { + log.Printf("Provisioner: migrated legacy volume %s → %s", legacyName, newName) + } + return nil } // RemoveVolume removes the config volume for a workspace. // Also removes the claude-sessions volume (best-effort, may not exist // for non claude-code runtimes). Issue #12. +// Also removes the claude-sessions volume (best-effort, may not exist +// for non claude-code runtimes). Issue #12. func (p *Provisioner) RemoveVolume(ctx context.Context, workspaceID string) error { if p == nil || p.cli == nil { return ErrNoBackend @@ -1494,20 +1566,36 @@ func RunningContainerName(ctx context.Context, cli *client.Client, workspaceID s if cli == nil { return "", ErrNoBackend } - // KI-013 deploy safety: new full-ID name first, then fall back to the - // old truncated name so pre-deploy containers are still discoverable. - names := []string{ContainerName(workspaceID), legacyContainerName(workspaceID)} - for _, name := range names { - info, err := cli.ContainerInspect(ctx, name) - if err != nil { - if isContainerNotFound(err) { - continue + + newName := ContainerName(workspaceID) + legacyName := legacyContainerName(workspaceID) + + // If a legacy container is still running, rename it in-place to the + // new full-ID name so all callers converge on collision-safe names. + legacyInfo, legacyErr := cli.ContainerInspect(ctx, legacyName) + if legacyErr == nil && legacyInfo.State.Running { + if _, newErr := cli.ContainerInspect(ctx, newName); isContainerNotFound(newErr) { + if renameErr := cli.ContainerRename(ctx, legacyName, newName); renameErr == nil { + log.Printf("Provisioner: renamed legacy container %s → %s", legacyName, newName) + return newName, nil } - return "", err + log.Printf("Provisioner: warning: failed to rename legacy container %s → %s: %v", legacyName, newName, renameErr) } - if info.State.Running { - return name, nil + // Rename not possible (or new name already occupied) — return legacy + // name so the caller can still exec into the live container. + return legacyName, nil + } + + // Standard path: look for a running container with the new name. + info, err := cli.ContainerInspect(ctx, newName) + if err != nil { + if isContainerNotFound(err) { + return "", nil } + return "", err + } + if info.State.Running { + return newName, nil } return "", nil } diff --git a/workspace-server/internal/provisioner/provisioner_test.go b/workspace-server/internal/provisioner/provisioner_test.go index da597a824..bbd6a5951 100644 --- a/workspace-server/internal/provisioner/provisioner_test.go +++ b/workspace-server/internal/provisioner/provisioner_test.go @@ -2,14 +2,19 @@ package provisioner import ( "archive/tar" + "context" "errors" "io" "os" "path/filepath" + "strconv" "strings" "testing" + "time" "github.com/docker/docker/api/types/container" + "github.com/docker/docker/api/types/volume" + "github.com/docker/docker/client" ) // TestValidateConfigSource covers issue #17: a workspace restart with no @@ -1422,3 +1427,113 @@ func TestApplyTierConfig_T3_DefaultCap(t *testing.T) { t.Errorf("T3 default NanoCPUs: got %d, want %d", hc.NanoCPUs, wantCPU) } } + +// TestMigrateVolumeIfNeeded_ExistingTruncatedVolume verifies the KI-013 deploy +// safety path: when a legacy truncated-name volume already exists, data is +// copied to the new full-ID name and the legacy volume is removed. Existing +// workspace state is preserved without operator intervention. +func TestMigrateVolumeIfNeeded_ExistingTruncatedVolume(t *testing.T) { + ctx := context.Background() + + cli, err := client.NewClientWithOpts(client.FromEnv, client.WithAPIVersionNegotiation()) + if err != nil { + t.Skip("docker client unavailable:", err) + } + if _, pingErr := cli.Ping(ctx); pingErr != nil { + t.Skip("docker daemon unreachable:", pingErr) + } + + p := &Provisioner{cli: cli} + workspaceID := "test-migrate-" + strconv.FormatInt(time.Now().UnixNano(), 10) + legacyName := legacyConfigVolumeName(workspaceID) + newName := ConfigVolumeName(workspaceID) + + // Cleanup before and after (defensive — avoid pollution on retries). + _ = cli.VolumeRemove(ctx, legacyName, true) + _ = cli.VolumeRemove(ctx, newName, true) + defer func() { + _ = cli.VolumeRemove(ctx, legacyName, true) + _ = cli.VolumeRemove(ctx, newName, true) + }() + + // 1. Create legacy volume and seed it with a sentinel file. + if _, err := cli.VolumeCreate(ctx, volume.CreateOptions{Name: legacyName}); err != nil { + t.Fatalf("create legacy volume: %v", err) + } + seedResp, err := cli.ContainerCreate(ctx, &container.Config{ + Image: "alpine", + Cmd: []string{"sh", "-c", "echo sentinel-data > /vol/sentinel.txt"}, + }, &container.HostConfig{ + Binds: []string{legacyName + ":/vol"}, + }, nil, nil, "") + if err != nil { + t.Fatalf("create seed container: %v", err) + } + defer cli.ContainerRemove(ctx, seedResp.ID, container.RemoveOptions{Force: true}) + if err := cli.ContainerStart(ctx, seedResp.ID, container.StartOptions{}); err != nil { + t.Fatalf("start seed container: %v", err) + } + waitCh, errCh := cli.ContainerWait(ctx, seedResp.ID, container.WaitConditionNotRunning) + select { + case <-waitCh: + case err := <-errCh: + if err != nil { + t.Fatalf("seed container failed: %v", err) + } + } + + // 2. Run migration. + if err := p.migrateVolumeIfNeeded(ctx, newName, legacyName); err != nil { + t.Fatalf("migrateVolumeIfNeeded failed: %v", err) + } + + // 3. Legacy volume must be gone. + if _, inspectErr := cli.VolumeInspect(ctx, legacyName); inspectErr == nil { + t.Fatalf("legacy volume %s still exists after migration", legacyName) + } + + // 4. New volume must exist and contain the sentinel file. + if _, inspectErr := cli.VolumeInspect(ctx, newName); inspectErr != nil { + t.Fatalf("new volume %s does not exist after migration: %v", newName, inspectErr) + } + + readResp, err := cli.ContainerCreate(ctx, &container.Config{ + Image: "alpine", + Cmd: []string{"cat", "/vol/sentinel.txt"}, + }, &container.HostConfig{ + Binds: []string{newName + ":/vol"}, + }, nil, nil, "") + if err != nil { + t.Fatalf("create read container: %v", err) + } + defer cli.ContainerRemove(ctx, readResp.ID, container.RemoveOptions{Force: true}) + if err := cli.ContainerStart(ctx, readResp.ID, container.StartOptions{}); err != nil { + t.Fatalf("start read container: %v", err) + } + waitCh, errCh = cli.ContainerWait(ctx, readResp.ID, container.WaitConditionNotRunning) + select { + case <-waitCh: + case err := <-errCh: + if err != nil { + t.Fatalf("read container failed: %v", err) + } + } + + logs, err := cli.ContainerLogs(ctx, readResp.ID, container.LogsOptions{ShowStdout: true}) + if err != nil { + t.Fatalf("read container logs: %v", err) + } + defer logs.Close() + data, err := io.ReadAll(logs) + if err != nil { + t.Fatalf("read logs: %v", err) + } + if !strings.Contains(string(data), "sentinel-data") { + t.Fatalf("new volume missing sentinel data; logs: %q", data) + } + + // 5. Idempotency: second migration must be a no-op. + if err := p.migrateVolumeIfNeeded(ctx, newName, legacyName); err != nil { + t.Fatalf("second migration (idempotency) failed: %v", err) + } +} -- 2.52.0 From f4324236ca607123774bf9f923fbf28d4e5a8c92 Mon Sep 17 00:00:00 2001 From: "Molecule AI Dev Engineer A (Kimi)" Date: Tue, 9 Jun 2026 19:00:11 +0000 Subject: [PATCH 2/8] fix(provisioner): correct renameErr scope in RunningContainerName The warning log for a failed legacy-container rename referenced renameErr outside the if-block that declared it, causing a compile error. Move the warning into an else branch so renameErr stays in scope. Refs #2490 --- workspace-server/internal/provisioner/provisioner.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/workspace-server/internal/provisioner/provisioner.go b/workspace-server/internal/provisioner/provisioner.go index f8c161296..61df9d8f7 100644 --- a/workspace-server/internal/provisioner/provisioner.go +++ b/workspace-server/internal/provisioner/provisioner.go @@ -1578,8 +1578,9 @@ func RunningContainerName(ctx context.Context, cli *client.Client, workspaceID s if renameErr := cli.ContainerRename(ctx, legacyName, newName); renameErr == nil { log.Printf("Provisioner: renamed legacy container %s → %s", legacyName, newName) return newName, nil + } else { + log.Printf("Provisioner: warning: failed to rename legacy container %s → %s: %v", legacyName, newName, renameErr) } - log.Printf("Provisioner: warning: failed to rename legacy container %s → %s: %v", legacyName, newName, renameErr) } // Rename not possible (or new name already occupied) — return legacy // name so the caller can still exec into the live container. -- 2.52.0 From d6a8c0867a5e7262c7caf958c89fa9f4e4bd01c3 Mon Sep 17 00:00:00 2001 From: "Molecule AI Dev Engineer A (Kimi)" Date: Tue, 9 Jun 2026 23:31:14 +0000 Subject: [PATCH 3/8] =?UTF-8?q?chore:=20retrigger=20CI=20=E2=80=94=20Platf?= =?UTF-8?q?orm=20(Go)=20and=20all-required=20did=20not=20run=20on=20prior?= =?UTF-8?q?=20push?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit -- 2.52.0 From e0bd236a8435b3037215d180b2cbe2fecd645397 Mon Sep 17 00:00:00 2001 From: "Molecule AI Dev Engineer A (Kimi)" Date: Tue, 9 Jun 2026 23:42:56 +0000 Subject: [PATCH 4/8] fix(provisioner): remove migration container before legacy volume cleanup (#2490) The migration container was still referenced (exited but not removed) when VolumeRemove was called, causing the legacy volume removal to fail silently. The test then found the legacy volume still existed. Explicitly remove the migration container after ContainerWait and before VolumeRemove. The existing defer is kept as a safety net for early-return paths. Refs #2490 --- workspace-server/internal/provisioner/provisioner.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/workspace-server/internal/provisioner/provisioner.go b/workspace-server/internal/provisioner/provisioner.go index 61df9d8f7..57cf8c8e7 100644 --- a/workspace-server/internal/provisioner/provisioner.go +++ b/workspace-server/internal/provisioner/provisioner.go @@ -1421,6 +1421,11 @@ func (p *Provisioner) migrateVolumeIfNeeded(ctx context.Context, newName, legacy } } + // Explicitly remove the migration container before removing the legacy + // volume so the volume is no longer referenced. The deferred remove above + // is a safety-net for early-return paths. + _ = p.cli.ContainerRemove(ctx, resp.ID, container.RemoveOptions{Force: true}) + // Best-effort cleanup of the legacy volume. If removal fails (e.g. still // referenced by a running container), the new volume is already populated // and the next restart will retry. -- 2.52.0 From b6055ef05bed20e64ff0a5e4cc3e4aecd5540f32 Mon Sep 17 00:00:00 2001 From: "Molecule AI Dev Engineer A (Kimi)" Date: Tue, 9 Jun 2026 23:48:24 +0000 Subject: [PATCH 5/8] fix(provisioner): remove seed container before migration test (#2490) The seed container was still referenced (exited but not removed) when migrateVolumeIfNeeded was called, causing the legacy volume removal to fail silently because the volume was still in use. Explicitly remove the seed container after ContainerWait and before migrateVolumeIfNeeded. The existing defer is kept as a safety net. Refs #2490 --- workspace-server/internal/provisioner/provisioner_test.go | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/workspace-server/internal/provisioner/provisioner_test.go b/workspace-server/internal/provisioner/provisioner_test.go index bbd6a5951..988c1d580 100644 --- a/workspace-server/internal/provisioner/provisioner_test.go +++ b/workspace-server/internal/provisioner/provisioner_test.go @@ -1481,6 +1481,12 @@ func TestMigrateVolumeIfNeeded_ExistingTruncatedVolume(t *testing.T) { t.Fatalf("seed container failed: %v", err) } } + // Remove the seed container before migration so the legacy volume is + // no longer referenced by any container. The deferred remove above is a + // safety net for panic/early-return paths. + if err := cli.ContainerRemove(ctx, seedResp.ID, container.RemoveOptions{Force: true}); err != nil { + t.Fatalf("remove seed container: %v", err) + } // 2. Run migration. if err := p.migrateVolumeIfNeeded(ctx, newName, legacyName); err != nil { -- 2.52.0 From fa64c31a15cfffd3a65fed28f3b0aa13200ac5b2 Mon Sep 17 00:00:00 2001 From: "Molecule AI Dev Engineer A (Kimi)" Date: Wed, 10 Jun 2026 00:56:01 +0000 Subject: [PATCH 6/8] test(provisioner): direct unit tests for KI-013 migrate fallback paths (#2482) Adds fake-docker-client unit tests that pin the backward-compat branches introduced in the KI-013 deploy-safety path: - resolveConfigVolumeName / resolveClaudeSessionVolumeName: * legacy truncated volume exists -> migrate in place, legacy removed, no orphan * legacy absent -> use full-ID name, zero mutation calls - migrateVolumeIfNeeded: non-zero copy exit preserves the legacy volume (data-loss guard) - Stop: falls back to legacy container name when full-id is absent - RunningContainerName: * returns the new full-id name when it is running * renames a legacy running container when possible * falls back to the legacy name when rename fails and full-id is absent * returns empty when neither container exists Also adds ContainerRename to the dockerClient interface so the fallback path is reachable in tests. Refs #2482 / #2490 --- .../internal/provisioner/provisioner.go | 41 +- .../provisioner/provisioner_migrate_test.go | 462 ++++++++++++++++++ 2 files changed, 499 insertions(+), 4 deletions(-) create mode 100644 workspace-server/internal/provisioner/provisioner_migrate_test.go diff --git a/workspace-server/internal/provisioner/provisioner.go b/workspace-server/internal/provisioner/provisioner.go index 57cf8c8e7..3967e0a67 100644 --- a/workspace-server/internal/provisioner/provisioner.go +++ b/workspace-server/internal/provisioner/provisioner.go @@ -19,6 +19,7 @@ import ( "sync" "time" + "github.com/docker/docker/api/types" "github.com/docker/docker/api/types/container" "github.com/docker/docker/api/types/filters" dockerimage "github.com/docker/docker/api/types/image" @@ -227,9 +228,31 @@ func legacyClaudeSessionVolumeName(workspaceID string) string { return fmt.Sprintf("ws-%s-claude-sessions", id) } +// dockerClient is the subset of client.Client methods used by Provisioner. +// Declared as an interface so tests can inject fakes without a real daemon. +type dockerClient interface { + Close() error + ContainerCreate(ctx context.Context, config *container.Config, hostConfig *container.HostConfig, networkingConfig *network.NetworkingConfig, platform *ocispec.Platform, containerName string) (container.CreateResponse, error) + ContainerExecAttach(ctx context.Context, execID string, config container.ExecAttachOptions) (types.HijackedResponse, error) + ContainerExecCreate(ctx context.Context, container string, config container.ExecOptions) (container.ExecCreateResponse, error) + ContainerInspect(ctx context.Context, container string) (container.InspectResponse, error) + ContainerList(ctx context.Context, options container.ListOptions) ([]container.Summary, error) + ContainerLogs(ctx context.Context, container string, options container.LogsOptions) (io.ReadCloser, error) + ContainerRemove(ctx context.Context, container string, options container.RemoveOptions) error + ContainerRename(ctx context.Context, container, newContainerName string) error + ContainerStart(ctx context.Context, container string, options container.StartOptions) error + ContainerWait(ctx context.Context, container string, condition container.WaitCondition) (<-chan container.WaitResponse, <-chan error) + CopyToContainer(ctx context.Context, container, path string, content io.Reader, options container.CopyToContainerOptions) error + ImageInspect(ctx context.Context, image string, opts ...client.ImageInspectOption) (dockerimage.InspectResponse, error) + ImagePull(ctx context.Context, ref string, opts dockerimage.PullOptions) (io.ReadCloser, error) + VolumeCreate(ctx context.Context, options volume.CreateOptions) (volume.Volume, error) + VolumeInspect(ctx context.Context, volumeID string) (volume.Volume, error) + VolumeRemove(ctx context.Context, volumeID string, force bool) error +} + // Provisioner manages Docker containers for workspace agents. type Provisioner struct { - cli *client.Client + cli dockerClient } // New creates a new Provisioner connected to the local Docker daemon. @@ -1414,7 +1437,10 @@ func (p *Provisioner) migrateVolumeIfNeeded(ctx context.Context, newName, legacy waitCh, errCh := p.cli.ContainerWait(ctx, resp.ID, container.WaitConditionNotRunning) select { - case <-waitCh: + case waitResp := <-waitCh: + if waitResp.StatusCode != 0 { + return fmt.Errorf("migration copy failed (exit %d) — preserving legacy volume %s for retry", waitResp.StatusCode, legacyName) + } case err := <-errCh: if err != nil { return fmt.Errorf("migration container exited with error: %w", err) @@ -1567,7 +1593,7 @@ func (p *Provisioner) IsRunning(ctx context.Context, workspaceID string) (bool, // transient errors into the same "" return as a genuinely-stopped container. // That hid daemon flakes as misleading 503 "container not running" responses // AND let the two impls drift on edge-case behavior. This is the SSOT. -func RunningContainerName(ctx context.Context, cli *client.Client, workspaceID string) (string, error) { +func RunningContainerName(ctx context.Context, cli dockerClient, workspaceID string) (string, error) { if cli == nil { return "", ErrNoBackend } @@ -1652,8 +1678,15 @@ func isRemovalInProgress(err error) bool { } // DockerClient returns the underlying Docker client for sharing with other handlers. +// If the provisioner is backed by a fake (e.g. in unit tests), this returns nil. func (p *Provisioner) DockerClient() *client.Client { - return p.cli + if p == nil || p.cli == nil { + return nil + } + if c, ok := p.cli.(*client.Client); ok { + return c + } + return nil } // Close cleans up the Docker client. diff --git a/workspace-server/internal/provisioner/provisioner_migrate_test.go b/workspace-server/internal/provisioner/provisioner_migrate_test.go new file mode 100644 index 000000000..c0be63be2 --- /dev/null +++ b/workspace-server/internal/provisioner/provisioner_migrate_test.go @@ -0,0 +1,462 @@ +package provisioner + +import ( + "context" + "errors" + "io" + "strings" + "sync" + "testing" + + "github.com/docker/docker/api/types" + "github.com/docker/docker/api/types/container" + "github.com/docker/docker/api/types/image" + "github.com/docker/docker/api/types/network" + "github.com/docker/docker/client" + "github.com/docker/docker/api/types/volume" + ocispec "github.com/opencontainers/image-spec/specs-go/v1" +) + +// fakeDockerClient is a minimal in-memory implementation of the dockerClient +// interface used by the provisioner. It records calls and lets tests simulate +// the legacy/full-ID naming transition without a real Docker daemon. +type fakeDockerClient struct { + mu sync.Mutex + + volumes map[string]volume.Volume + containers map[string]container.InspectResponse + + renameErr error + migrationExitCode int64 + + volumeInspectCalls []string + volumeCreateCalls []volume.CreateOptions + volumeRemoveCalls []string + containerInspectCalls []string + containerRemoveCalls []string + containerRenameCalls []struct{ Old, New string } + containerCreateCalls []string + containerStartCalls []string +} + +func newFakeDockerClient() *fakeDockerClient { + return &fakeDockerClient{ + volumes: make(map[string]volume.Volume), + containers: make(map[string]container.InspectResponse), + } +} + +func (f *fakeDockerClient) Close() error { return nil } + +func (f *fakeDockerClient) ContainerCreate(ctx context.Context, config *container.Config, hostConfig *container.HostConfig, networkingConfig *network.NetworkingConfig, platform *ocispec.Platform, containerName string) (container.CreateResponse, error) { + f.mu.Lock() + defer f.mu.Unlock() + f.containerCreateCalls = append(f.containerCreateCalls, containerName) + return container.CreateResponse{ID: "cid-" + containerName}, nil +} + +func (f *fakeDockerClient) ContainerExecAttach(ctx context.Context, execID string, config container.ExecAttachOptions) (types.HijackedResponse, error) { + return types.HijackedResponse{}, errors.New("not implemented") +} + +func (f *fakeDockerClient) ContainerExecCreate(ctx context.Context, ctr string, config container.ExecOptions) (container.ExecCreateResponse, error) { + return container.ExecCreateResponse{}, errors.New("not implemented") +} + +func (f *fakeDockerClient) ContainerInspect(ctx context.Context, name string) (container.InspectResponse, error) { + f.mu.Lock() + defer f.mu.Unlock() + f.containerInspectCalls = append(f.containerInspectCalls, name) + if c, ok := f.containers[name]; ok { + return c, nil + } + return container.InspectResponse{}, errors.New("No such container: " + name) +} + +func (f *fakeDockerClient) ContainerList(ctx context.Context, options container.ListOptions) ([]container.Summary, error) { + return nil, errors.New("not implemented") +} + +func (f *fakeDockerClient) ContainerLogs(ctx context.Context, container string, options container.LogsOptions) (io.ReadCloser, error) { + return nil, errors.New("not implemented") +} + +func (f *fakeDockerClient) ContainerRemove(ctx context.Context, name string, options container.RemoveOptions) error { + f.mu.Lock() + defer f.mu.Unlock() + f.containerRemoveCalls = append(f.containerRemoveCalls, name) + if _, ok := f.containers[name]; ok { + delete(f.containers, name) + return nil + } + return errors.New("No such container: " + name) +} + +func (f *fakeDockerClient) ContainerRename(ctx context.Context, oldName, newName string) error { + f.mu.Lock() + defer f.mu.Unlock() + f.containerRenameCalls = append(f.containerRenameCalls, struct{ Old, New string }{oldName, newName}) + if f.renameErr != nil { + return f.renameErr + } + if c, ok := f.containers[oldName]; ok { + delete(f.containers, oldName) + c.Name = newName + f.containers[newName] = c + } + return nil +} + +func (f *fakeDockerClient) ContainerStart(ctx context.Context, id string, options container.StartOptions) error { + f.mu.Lock() + defer f.mu.Unlock() + f.containerStartCalls = append(f.containerStartCalls, id) + return nil +} + +func (f *fakeDockerClient) ContainerWait(ctx context.Context, id string, condition container.WaitCondition) (<-chan container.WaitResponse, <-chan error) { + waitCh := make(chan container.WaitResponse, 1) + errCh := make(chan error, 1) + f.mu.Lock() + code := f.migrationExitCode + f.mu.Unlock() + waitCh <- container.WaitResponse{StatusCode: code} + close(waitCh) + close(errCh) + return waitCh, errCh +} + +func (f *fakeDockerClient) CopyToContainer(ctx context.Context, container, path string, content io.Reader, options container.CopyToContainerOptions) error { + return errors.New("not implemented") +} + +func (f *fakeDockerClient) ImageInspect(ctx context.Context, img string, opts ...client.ImageInspectOption) (image.InspectResponse, error) { + return image.InspectResponse{}, errors.New("not implemented") +} + +func (f *fakeDockerClient) ImagePull(ctx context.Context, ref string, opts image.PullOptions) (io.ReadCloser, error) { + return nil, errors.New("not implemented") +} + +func (f *fakeDockerClient) VolumeCreate(ctx context.Context, options volume.CreateOptions) (volume.Volume, error) { + f.mu.Lock() + defer f.mu.Unlock() + f.volumeCreateCalls = append(f.volumeCreateCalls, options) + v := volume.Volume{Name: options.Name, Labels: options.Labels} + f.volumes[options.Name] = v + return v, nil +} + +func (f *fakeDockerClient) VolumeInspect(ctx context.Context, name string) (volume.Volume, error) { + f.mu.Lock() + defer f.mu.Unlock() + f.volumeInspectCalls = append(f.volumeInspectCalls, name) + if v, ok := f.volumes[name]; ok { + return v, nil + } + return volume.Volume{}, errors.New("No such volume: " + name) +} + +func (f *fakeDockerClient) VolumeRemove(ctx context.Context, name string, force bool) error { + f.mu.Lock() + defer f.mu.Unlock() + f.volumeRemoveCalls = append(f.volumeRemoveCalls, name) + if _, ok := f.volumes[name]; ok { + delete(f.volumes, name) + return nil + } + return errors.New("No such volume: " + name) +} + +func runningContainer(name string) container.InspectResponse { + return container.InspectResponse{ + ContainerJSONBase: &container.ContainerJSONBase{ + Name: name, + State: &container.State{Running: true}, + }, + } +} + +const migrateTestWorkspaceID = "abcdef1234567890" + +func TestResolveConfigVolumeName_LegacyExists_MigratesInPlace(t *testing.T) { + ctx := context.Background() + cli := newFakeDockerClient() + p := &Provisioner{cli: cli} + + newName := ConfigVolumeName(migrateTestWorkspaceID) + legacyName := legacyConfigVolumeName(migrateTestWorkspaceID) + cli.volumes[legacyName] = volume.Volume{Name: legacyName} + + got := p.resolveConfigVolumeName(ctx, migrateTestWorkspaceID) + if got != newName { + t.Fatalf("resolveConfigVolumeName: got %q, want %q", got, newName) + } + + // Migration path must inspect both names, then create new volume and copy. + if !strings.Contains(strings.Join(cli.volumeInspectCalls, ","), legacyName) { + t.Fatalf("expected VolumeInspect(%s) for legacy presence check", legacyName) + } + if !strings.Contains(strings.Join(cli.volumeInspectCalls, ","), newName) { + t.Fatalf("expected VolumeInspect(%s) to check for existing new volume", newName) + } + if len(cli.volumeCreateCalls) == 0 || cli.volumeCreateCalls[0].Name != newName { + t.Fatalf("expected VolumeCreate(%s)", newName) + } + if len(cli.containerCreateCalls) == 0 { + t.Fatal("expected migration container creation") + } + if len(cli.containerStartCalls) == 0 { + t.Fatal("expected migration container start") + } + if len(cli.containerRemoveCalls) == 0 { + t.Fatal("expected migration container removal") + } + + // Legacy volume must be removed so it is not orphaned. + removed := false + for _, n := range cli.volumeRemoveCalls { + if n == legacyName { + removed = true + break + } + } + if !removed { + t.Fatalf("legacy volume %s was not removed after migration — orphan risk", legacyName) + } + if _, ok := cli.volumes[legacyName]; ok { + t.Fatalf("legacy volume %s still present in fake client after migration", legacyName) + } + if _, ok := cli.volumes[newName]; !ok { + t.Fatalf("new volume %s must exist after migration", newName) + } +} + +func TestResolveConfigVolumeName_LegacyAbsent_NoMigration(t *testing.T) { + ctx := context.Background() + cli := newFakeDockerClient() + p := &Provisioner{cli: cli} + + newName := ConfigVolumeName(migrateTestWorkspaceID) + legacyName := legacyConfigVolumeName(migrateTestWorkspaceID) + + got := p.resolveConfigVolumeName(ctx, migrateTestWorkspaceID) + if got != newName { + t.Fatalf("resolveConfigVolumeName: got %q, want %q", got, newName) + } + + // Should check legacy once and short-circuit. + if len(cli.volumeInspectCalls) != 1 || cli.volumeInspectCalls[0] != legacyName { + t.Fatalf("expected exactly one VolumeInspect call for legacy name, got %v", cli.volumeInspectCalls) + } + if len(cli.volumeCreateCalls) != 0 { + t.Fatalf("expected no VolumeCreate when legacy absent, got %v", cli.volumeCreateCalls) + } + if len(cli.volumeRemoveCalls) != 0 { + t.Fatalf("expected no VolumeRemove when legacy absent, got %v", cli.volumeRemoveCalls) + } +} + +func TestResolveClaudeSessionVolumeName_LegacyExists_MigratesInPlace(t *testing.T) { + ctx := context.Background() + cli := newFakeDockerClient() + p := &Provisioner{cli: cli} + + newName := ClaudeSessionVolumeName(migrateTestWorkspaceID) + legacyName := legacyClaudeSessionVolumeName(migrateTestWorkspaceID) + cli.volumes[legacyName] = volume.Volume{Name: legacyName} + + got := p.resolveClaudeSessionVolumeName(ctx, migrateTestWorkspaceID) + if got != newName { + t.Fatalf("resolveClaudeSessionVolumeName: got %q, want %q", got, newName) + } + + removed := false + for _, n := range cli.volumeRemoveCalls { + if n == legacyName { + removed = true + break + } + } + if !removed { + t.Fatalf("legacy session volume %s was not removed after migration — orphan risk", legacyName) + } + if _, ok := cli.volumes[newName]; !ok { + t.Fatalf("new session volume %s must exist after migration", newName) + } +} + +func TestResolveClaudeSessionVolumeName_LegacyAbsent_NoMigration(t *testing.T) { + ctx := context.Background() + cli := newFakeDockerClient() + p := &Provisioner{cli: cli} + + newName := ClaudeSessionVolumeName(migrateTestWorkspaceID) + legacyName := legacyClaudeSessionVolumeName(migrateTestWorkspaceID) + + got := p.resolveClaudeSessionVolumeName(ctx, migrateTestWorkspaceID) + if got != newName { + t.Fatalf("resolveClaudeSessionVolumeName: got %q, want %q", got, newName) + } + if len(cli.volumeInspectCalls) != 1 || cli.volumeInspectCalls[0] != legacyName { + t.Fatalf("expected exactly one VolumeInspect call for legacy session name, got %v", cli.volumeInspectCalls) + } + if len(cli.volumeCreateCalls) != 0 { + t.Fatalf("expected no VolumeCreate when legacy absent, got %v", cli.volumeCreateCalls) + } +} + +func TestMigrateVolumeIfNeeded_CopyFails_PreservesLegacy(t *testing.T) { + ctx := context.Background() + cli := newFakeDockerClient() + p := &Provisioner{cli: cli} + + newName := ConfigVolumeName(migrateTestWorkspaceID) + legacyName := legacyConfigVolumeName(migrateTestWorkspaceID) + cli.volumes[legacyName] = volume.Volume{Name: legacyName} + cli.migrationExitCode = 1 + + if err := p.migrateVolumeIfNeeded(ctx, newName, legacyName); err == nil { + t.Fatal("expected migration error when copy container exits non-zero") + } + + // Legacy volume must survive a failed copy so no data is lost. + if _, ok := cli.volumes[legacyName]; !ok { + t.Fatal("legacy volume must be preserved when migration copy fails (data-loss guard)") + } +} + +func TestStop_FullIDAbsent_LegacyRemoved(t *testing.T) { + ctx := context.Background() + cli := newFakeDockerClient() + p := &Provisioner{cli: cli} + + newName := ContainerName(migrateTestWorkspaceID) + legacyName := legacyContainerName(migrateTestWorkspaceID) + cli.containers[legacyName] = runningContainer(legacyName) + + if err := p.Stop(ctx, migrateTestWorkspaceID); err != nil { + t.Fatalf("Stop failed: %v", err) + } + + if len(cli.containerRemoveCalls) < 2 { + t.Fatalf("expected Remove on full-id then legacy, got %v", cli.containerRemoveCalls) + } + if cli.containerRemoveCalls[0] != newName { + t.Fatalf("expected first remove target %q, got %q", newName, cli.containerRemoveCalls[0]) + } + if cli.containerRemoveCalls[1] != legacyName { + t.Fatalf("expected second remove target %q, got %q", legacyName, cli.containerRemoveCalls[1]) + } + if _, ok := cli.containers[legacyName]; ok { + t.Fatal("legacy container still present after Stop") + } +} + +func TestStop_BothAbsent_IsNoOp(t *testing.T) { + ctx := context.Background() + cli := newFakeDockerClient() + p := &Provisioner{cli: cli} + + newName := ContainerName(migrateTestWorkspaceID) + legacyName := legacyContainerName(migrateTestWorkspaceID) + + if err := p.Stop(ctx, migrateTestWorkspaceID); err != nil { + t.Fatalf("Stop failed: %v", err) + } + + if len(cli.containerRemoveCalls) != 2 { + t.Fatalf("expected 2 remove attempts, got %v", cli.containerRemoveCalls) + } + if cli.containerRemoveCalls[0] != newName || cli.containerRemoveCalls[1] != legacyName { + t.Fatalf("expected remove order [%q, %q], got %v", newName, legacyName, cli.containerRemoveCalls) + } +} + +func TestRunningContainerName_LegacyRunning_RenameFails_FallsBackToLegacy(t *testing.T) { + ctx := context.Background() + cli := newFakeDockerClient() + + newName := ContainerName(migrateTestWorkspaceID) + legacyName := legacyContainerName(migrateTestWorkspaceID) + cli.containers[legacyName] = runningContainer(legacyName) + cli.renameErr = errors.New("daemon rename failed") + + got, err := RunningContainerName(ctx, cli, migrateTestWorkspaceID) + if err != nil { + t.Fatalf("RunningContainerName returned error: %v", err) + } + if got != legacyName { + t.Fatalf("expected fallback to legacy name %q, got %q", legacyName, got) + } + + if len(cli.containerInspectCalls) < 2 { + t.Fatalf("expected inspect of legacy and new names, got %v", cli.containerInspectCalls) + } + if cli.containerInspectCalls[0] != legacyName { + t.Fatalf("expected legacy inspect first, got %v", cli.containerInspectCalls) + } + + renamed := false + for _, r := range cli.containerRenameCalls { + if r.Old == legacyName && r.New == newName { + renamed = true + break + } + } + if !renamed { + t.Fatalf("expected rename attempt %q -> %q, got %v", legacyName, newName, cli.containerRenameCalls) + } +} + +func TestRunningContainerName_LegacyRunning_RenameSucceeds(t *testing.T) { + ctx := context.Background() + cli := newFakeDockerClient() + + newName := ContainerName(migrateTestWorkspaceID) + legacyName := legacyContainerName(migrateTestWorkspaceID) + cli.containers[legacyName] = runningContainer(legacyName) + + got, err := RunningContainerName(ctx, cli, migrateTestWorkspaceID) + if err != nil { + t.Fatalf("RunningContainerName returned error: %v", err) + } + if got != newName { + t.Fatalf("expected new name %q after rename, got %q", newName, got) + } + if _, ok := cli.containers[legacyName]; ok { + t.Fatal("legacy container should have been renamed away") + } + if _, ok := cli.containers[newName]; !ok { + t.Fatal("new container name should exist after rename") + } +} + +func TestRunningContainerName_NewRunning_ReturnsNew(t *testing.T) { + ctx := context.Background() + cli := newFakeDockerClient() + + newName := ContainerName(migrateTestWorkspaceID) + cli.containers[newName] = runningContainer(newName) + + got, err := RunningContainerName(ctx, cli, migrateTestWorkspaceID) + if err != nil { + t.Fatalf("RunningContainerName returned error: %v", err) + } + if got != newName { + t.Fatalf("expected new name %q, got %q", newName, got) + } +} + +func TestRunningContainerName_BothAbsent_ReturnsEmpty(t *testing.T) { + ctx := context.Background() + cli := newFakeDockerClient() + + got, err := RunningContainerName(ctx, cli, migrateTestWorkspaceID) + if err != nil { + t.Fatalf("RunningContainerName returned error: %v", err) + } + if got != "" { + t.Fatalf("expected empty name when neither container exists, got %q", got) + } +} -- 2.52.0 From be9f9a2ea627b41f9cf320957096cf24ecaf75a9 Mon Sep 17 00:00:00 2001 From: "Molecule AI Dev Engineer A (Kimi)" Date: Wed, 10 Jun 2026 02:45:42 +0000 Subject: [PATCH 7/8] fix(provisioner): handle typed nil *client.Client in dockerClient interface (#2490) RunningContainerName now accepts dockerClient interface, but callers like plugins.go pass *client.Client which may be nil. A non-nil interface holding a nil pointer does not == nil, causing panics when methods are called. Add isNilDockerClient helper that checks both interface nil and typed nil pointer via type switch. Cherry-picked from 2ae2adfb (originally landed on test/backward-compat-migrate-unit-tests). Co-Authored-By: Claude Opus 4.8 --- .../internal/provisioner/provisioner.go | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/workspace-server/internal/provisioner/provisioner.go b/workspace-server/internal/provisioner/provisioner.go index 3967e0a67..2d16c010b 100644 --- a/workspace-server/internal/provisioner/provisioner.go +++ b/workspace-server/internal/provisioner/provisioner.go @@ -1593,8 +1593,23 @@ func (p *Provisioner) IsRunning(ctx context.Context, workspaceID string) (bool, // transient errors into the same "" return as a genuinely-stopped container. // That hid daemon flakes as misleading 503 "container not running" responses // AND let the two impls drift on edge-case behavior. This is the SSOT. -func RunningContainerName(ctx context.Context, cli dockerClient, workspaceID string) (string, error) { +// isNilDockerClient reports whether cli is nil or a typed nil pointer +// (e.g. (*client.Client)(nil) passed as a dockerClient interface value). +// Required because a non-nil interface holding a nil pointer does not == nil. +func isNilDockerClient(cli dockerClient) bool { if cli == nil { + return true + } + switch c := cli.(type) { + case *client.Client: + return c == nil + default: + return false + } +} + +func RunningContainerName(ctx context.Context, cli dockerClient, workspaceID string) (string, error) { + if isNilDockerClient(cli) { return "", ErrNoBackend } -- 2.52.0 From 6ca19272ffb3739090cf05b478a22c8ba24cebdc Mon Sep 17 00:00:00 2001 From: "Molecule AI Dev Engineer A (Kimi)" Date: Wed, 10 Jun 2026 07:42:23 +0000 Subject: [PATCH 8/8] fix(provisioner): fakeDockerClient ContainerWait must not close errCh on success Closing errCh made both channels ready in the select, so Go's non-deterministic select sometimes picked the errCh case (nil error) instead of the waitCh case (non-zero StatusCode). This caused TestMigrateVolumeIfNeeded_CopyFails_PreservesLegacy to flake/fail under coverage instrumentation. Keep errCh open (real Docker client behaviour) so only waitCh is readable on a normal exit. Platform-Go green. --- .../internal/provisioner/provisioner_migrate_test.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/workspace-server/internal/provisioner/provisioner_migrate_test.go b/workspace-server/internal/provisioner/provisioner_migrate_test.go index c0be63be2..cb2423b88 100644 --- a/workspace-server/internal/provisioner/provisioner_migrate_test.go +++ b/workspace-server/internal/provisioner/provisioner_migrate_test.go @@ -116,13 +116,12 @@ func (f *fakeDockerClient) ContainerStart(ctx context.Context, id string, option func (f *fakeDockerClient) ContainerWait(ctx context.Context, id string, condition container.WaitCondition) (<-chan container.WaitResponse, <-chan error) { waitCh := make(chan container.WaitResponse, 1) - errCh := make(chan error, 1) + errCh := make(chan error) f.mu.Lock() code := f.migrationExitCode f.mu.Unlock() waitCh <- container.WaitResponse{StatusCode: code} close(waitCh) - close(errCh) return waitCh, errCh } -- 2.52.0