diff --git a/workspace-server/internal/provisioner/provisioner.go b/workspace-server/internal/provisioner/provisioner.go index 29ddf619d..2d16c010b 100644 --- a/workspace-server/internal/provisioner/provisioner.go +++ b/workspace-server/internal/provisioner/provisioner.go @@ -19,6 +19,7 @@ import ( "sync" "time" + "github.com/docker/docker/api/types" "github.com/docker/docker/api/types/container" "github.com/docker/docker/api/types/filters" dockerimage "github.com/docker/docker/api/types/image" @@ -227,9 +228,31 @@ func legacyClaudeSessionVolumeName(workspaceID string) string { return fmt.Sprintf("ws-%s-claude-sessions", id) } +// dockerClient is the subset of client.Client methods used by Provisioner. +// Declared as an interface so tests can inject fakes without a real daemon. +type dockerClient interface { + Close() error + ContainerCreate(ctx context.Context, config *container.Config, hostConfig *container.HostConfig, networkingConfig *network.NetworkingConfig, platform *ocispec.Platform, containerName string) (container.CreateResponse, error) + ContainerExecAttach(ctx context.Context, execID string, config container.ExecAttachOptions) (types.HijackedResponse, error) + ContainerExecCreate(ctx context.Context, container string, config container.ExecOptions) (container.ExecCreateResponse, error) + ContainerInspect(ctx context.Context, container string) (container.InspectResponse, error) + ContainerList(ctx context.Context, options container.ListOptions) ([]container.Summary, error) + ContainerLogs(ctx context.Context, container string, options container.LogsOptions) (io.ReadCloser, error) + ContainerRemove(ctx context.Context, container string, options container.RemoveOptions) error + ContainerRename(ctx context.Context, container, newContainerName string) error + ContainerStart(ctx context.Context, container string, options container.StartOptions) error + ContainerWait(ctx context.Context, container string, condition container.WaitCondition) (<-chan container.WaitResponse, <-chan error) + CopyToContainer(ctx context.Context, container, path string, content io.Reader, options container.CopyToContainerOptions) error + ImageInspect(ctx context.Context, image string, opts ...client.ImageInspectOption) (dockerimage.InspectResponse, error) + ImagePull(ctx context.Context, ref string, opts dockerimage.PullOptions) (io.ReadCloser, error) + VolumeCreate(ctx context.Context, options volume.CreateOptions) (volume.Volume, error) + VolumeInspect(ctx context.Context, volumeID string) (volume.Volume, error) + VolumeRemove(ctx context.Context, volumeID string, force bool) error +} + // Provisioner manages Docker containers for workspace agents. type Provisioner struct { - cli *client.Client + cli dockerClient } // New creates a new Provisioner connected to the local Docker daemon. @@ -1336,35 +1359,115 @@ func (p *Provisioner) WriteAuthTokenToVolume(ctx context.Context, workspaceID, t } // resolveConfigVolumeName returns the effective config volume name for a -// workspace, preferring the legacy truncated name if that volume already -// exists (KI-013 deploy safety: pre-deploy volumes must not be orphaned). +// workspace. KI-013 deploy safety: if a legacy truncated-name volume exists, +// it is migrated in-place to the new full-ID name so existing workspace data +// is preserved AND all workspaces eventually use collision-safe names. func (p *Provisioner) resolveConfigVolumeName(ctx context.Context, workspaceID string) string { if p == nil || p.cli == nil { return ConfigVolumeName(workspaceID) } + newName := ConfigVolumeName(workspaceID) legacy := legacyConfigVolumeName(workspaceID) - if _, err := p.cli.VolumeInspect(ctx, legacy); err == nil { - return legacy + if err := p.migrateVolumeIfNeeded(ctx, newName, legacy); err != nil { + log.Printf("Provisioner: volume migration warning for %s: %v", workspaceID, err) } - return ConfigVolumeName(workspaceID) + return newName } // resolveClaudeSessionVolumeName returns the effective claude-sessions volume -// name, preferring the legacy truncated name if that volume already exists. +// name. KI-013 deploy safety: legacy truncated-name volumes are migrated +// in-place to the new full-ID name. func (p *Provisioner) resolveClaudeSessionVolumeName(ctx context.Context, workspaceID string) string { if p == nil || p.cli == nil { return ClaudeSessionVolumeName(workspaceID) } + newName := ClaudeSessionVolumeName(workspaceID) legacy := legacyClaudeSessionVolumeName(workspaceID) - if _, err := p.cli.VolumeInspect(ctx, legacy); err == nil { - return legacy + if err := p.migrateVolumeIfNeeded(ctx, newName, legacy); err != nil { + log.Printf("Provisioner: session volume migration warning for %s: %v", workspaceID, err) } - return ClaudeSessionVolumeName(workspaceID) + return newName +} + +// migrateVolumeIfNeeded renames a legacy truncated-name Docker volume to its +// new full-ID name by copying data via a temporary alpine container. If the +// legacy volume does not exist, or the new volume already exists, this is a +// no-op. The operation is idempotent — calling it multiple times is safe. +func (p *Provisioner) migrateVolumeIfNeeded(ctx context.Context, newName, legacyName string) error { + if p == nil || p.cli == nil { + return nil + } + + // Legacy volume missing — nothing to migrate. + if _, err := p.cli.VolumeInspect(ctx, legacyName); err != nil { + return nil + } + + // New volume already exists — migration already done (or new workspace). + if _, err := p.cli.VolumeInspect(ctx, newName); err == nil { + return nil + } + + // Create the new volume. + if _, err := p.cli.VolumeCreate(ctx, volume.CreateOptions{ + Name: newName, + Labels: managedLabels(), + }); err != nil { + return fmt.Errorf("create new volume %s: %w", newName, err) + } + + // Copy data from legacy to new via a short-lived alpine container. + resp, err := p.cli.ContainerCreate(ctx, &container.Config{ + Image: "alpine", + Cmd: []string{"sh", "-c", "cp -a /legacy/. /new/"}, + }, &container.HostConfig{ + Binds: []string{ + legacyName + ":/legacy", + newName + ":/new", + }, + }, nil, nil, "") + if err != nil { + return fmt.Errorf("create migration container: %w", err) + } + defer p.cli.ContainerRemove(ctx, resp.ID, container.RemoveOptions{Force: true}) + + if err := p.cli.ContainerStart(ctx, resp.ID, container.StartOptions{}); err != nil { + return fmt.Errorf("start migration container: %w", err) + } + + waitCh, errCh := p.cli.ContainerWait(ctx, resp.ID, container.WaitConditionNotRunning) + select { + case waitResp := <-waitCh: + if waitResp.StatusCode != 0 { + return fmt.Errorf("migration copy failed (exit %d) — preserving legacy volume %s for retry", waitResp.StatusCode, legacyName) + } + case err := <-errCh: + if err != nil { + return fmt.Errorf("migration container exited with error: %w", err) + } + } + + // Explicitly remove the migration container before removing the legacy + // volume so the volume is no longer referenced. The deferred remove above + // is a safety-net for early-return paths. + _ = p.cli.ContainerRemove(ctx, resp.ID, container.RemoveOptions{Force: true}) + + // Best-effort cleanup of the legacy volume. If removal fails (e.g. still + // referenced by a running container), the new volume is already populated + // and the next restart will retry. + if err := p.cli.VolumeRemove(ctx, legacyName, true); err != nil { + log.Printf("Provisioner: warning: failed to remove legacy volume %s after migration: %v", legacyName, err) + } else { + log.Printf("Provisioner: migrated legacy volume %s → %s", legacyName, newName) + } + return nil } // RemoveVolume removes the config volume for a workspace. // Also removes the claude-sessions volume (best-effort, may not exist // for non claude-code runtimes). Issue #12. +// Also removes the claude-sessions volume (best-effort, may not exist +// for non claude-code runtimes). Issue #12. func (p *Provisioner) RemoveVolume(ctx context.Context, workspaceID string) error { if p == nil || p.cli == nil { return ErrNoBackend @@ -1490,24 +1593,56 @@ func (p *Provisioner) IsRunning(ctx context.Context, workspaceID string) (bool, // transient errors into the same "" return as a genuinely-stopped container. // That hid daemon flakes as misleading 503 "container not running" responses // AND let the two impls drift on edge-case behavior. This is the SSOT. -func RunningContainerName(ctx context.Context, cli *client.Client, workspaceID string) (string, error) { +// isNilDockerClient reports whether cli is nil or a typed nil pointer +// (e.g. (*client.Client)(nil) passed as a dockerClient interface value). +// Required because a non-nil interface holding a nil pointer does not == nil. +func isNilDockerClient(cli dockerClient) bool { if cli == nil { + return true + } + switch c := cli.(type) { + case *client.Client: + return c == nil + default: + return false + } +} + +func RunningContainerName(ctx context.Context, cli dockerClient, workspaceID string) (string, error) { + if isNilDockerClient(cli) { return "", ErrNoBackend } - // KI-013 deploy safety: new full-ID name first, then fall back to the - // old truncated name so pre-deploy containers are still discoverable. - names := []string{ContainerName(workspaceID), legacyContainerName(workspaceID)} - for _, name := range names { - info, err := cli.ContainerInspect(ctx, name) - if err != nil { - if isContainerNotFound(err) { - continue + + newName := ContainerName(workspaceID) + legacyName := legacyContainerName(workspaceID) + + // If a legacy container is still running, rename it in-place to the + // new full-ID name so all callers converge on collision-safe names. + legacyInfo, legacyErr := cli.ContainerInspect(ctx, legacyName) + if legacyErr == nil && legacyInfo.State.Running { + if _, newErr := cli.ContainerInspect(ctx, newName); isContainerNotFound(newErr) { + if renameErr := cli.ContainerRename(ctx, legacyName, newName); renameErr == nil { + log.Printf("Provisioner: renamed legacy container %s → %s", legacyName, newName) + return newName, nil + } else { + log.Printf("Provisioner: warning: failed to rename legacy container %s → %s: %v", legacyName, newName, renameErr) } - return "", err } - if info.State.Running { - return name, nil + // Rename not possible (or new name already occupied) — return legacy + // name so the caller can still exec into the live container. + return legacyName, nil + } + + // Standard path: look for a running container with the new name. + info, err := cli.ContainerInspect(ctx, newName) + if err != nil { + if isContainerNotFound(err) { + return "", nil } + return "", err + } + if info.State.Running { + return newName, nil } return "", nil } @@ -1558,8 +1693,15 @@ func isRemovalInProgress(err error) bool { } // DockerClient returns the underlying Docker client for sharing with other handlers. +// If the provisioner is backed by a fake (e.g. in unit tests), this returns nil. func (p *Provisioner) DockerClient() *client.Client { - return p.cli + if p == nil || p.cli == nil { + return nil + } + if c, ok := p.cli.(*client.Client); ok { + return c + } + return nil } // Close cleans up the Docker client. diff --git a/workspace-server/internal/provisioner/provisioner_migrate_test.go b/workspace-server/internal/provisioner/provisioner_migrate_test.go new file mode 100644 index 000000000..cb2423b88 --- /dev/null +++ b/workspace-server/internal/provisioner/provisioner_migrate_test.go @@ -0,0 +1,461 @@ +package provisioner + +import ( + "context" + "errors" + "io" + "strings" + "sync" + "testing" + + "github.com/docker/docker/api/types" + "github.com/docker/docker/api/types/container" + "github.com/docker/docker/api/types/image" + "github.com/docker/docker/api/types/network" + "github.com/docker/docker/client" + "github.com/docker/docker/api/types/volume" + ocispec "github.com/opencontainers/image-spec/specs-go/v1" +) + +// fakeDockerClient is a minimal in-memory implementation of the dockerClient +// interface used by the provisioner. It records calls and lets tests simulate +// the legacy/full-ID naming transition without a real Docker daemon. +type fakeDockerClient struct { + mu sync.Mutex + + volumes map[string]volume.Volume + containers map[string]container.InspectResponse + + renameErr error + migrationExitCode int64 + + volumeInspectCalls []string + volumeCreateCalls []volume.CreateOptions + volumeRemoveCalls []string + containerInspectCalls []string + containerRemoveCalls []string + containerRenameCalls []struct{ Old, New string } + containerCreateCalls []string + containerStartCalls []string +} + +func newFakeDockerClient() *fakeDockerClient { + return &fakeDockerClient{ + volumes: make(map[string]volume.Volume), + containers: make(map[string]container.InspectResponse), + } +} + +func (f *fakeDockerClient) Close() error { return nil } + +func (f *fakeDockerClient) ContainerCreate(ctx context.Context, config *container.Config, hostConfig *container.HostConfig, networkingConfig *network.NetworkingConfig, platform *ocispec.Platform, containerName string) (container.CreateResponse, error) { + f.mu.Lock() + defer f.mu.Unlock() + f.containerCreateCalls = append(f.containerCreateCalls, containerName) + return container.CreateResponse{ID: "cid-" + containerName}, nil +} + +func (f *fakeDockerClient) ContainerExecAttach(ctx context.Context, execID string, config container.ExecAttachOptions) (types.HijackedResponse, error) { + return types.HijackedResponse{}, errors.New("not implemented") +} + +func (f *fakeDockerClient) ContainerExecCreate(ctx context.Context, ctr string, config container.ExecOptions) (container.ExecCreateResponse, error) { + return container.ExecCreateResponse{}, errors.New("not implemented") +} + +func (f *fakeDockerClient) ContainerInspect(ctx context.Context, name string) (container.InspectResponse, error) { + f.mu.Lock() + defer f.mu.Unlock() + f.containerInspectCalls = append(f.containerInspectCalls, name) + if c, ok := f.containers[name]; ok { + return c, nil + } + return container.InspectResponse{}, errors.New("No such container: " + name) +} + +func (f *fakeDockerClient) ContainerList(ctx context.Context, options container.ListOptions) ([]container.Summary, error) { + return nil, errors.New("not implemented") +} + +func (f *fakeDockerClient) ContainerLogs(ctx context.Context, container string, options container.LogsOptions) (io.ReadCloser, error) { + return nil, errors.New("not implemented") +} + +func (f *fakeDockerClient) ContainerRemove(ctx context.Context, name string, options container.RemoveOptions) error { + f.mu.Lock() + defer f.mu.Unlock() + f.containerRemoveCalls = append(f.containerRemoveCalls, name) + if _, ok := f.containers[name]; ok { + delete(f.containers, name) + return nil + } + return errors.New("No such container: " + name) +} + +func (f *fakeDockerClient) ContainerRename(ctx context.Context, oldName, newName string) error { + f.mu.Lock() + defer f.mu.Unlock() + f.containerRenameCalls = append(f.containerRenameCalls, struct{ Old, New string }{oldName, newName}) + if f.renameErr != nil { + return f.renameErr + } + if c, ok := f.containers[oldName]; ok { + delete(f.containers, oldName) + c.Name = newName + f.containers[newName] = c + } + return nil +} + +func (f *fakeDockerClient) ContainerStart(ctx context.Context, id string, options container.StartOptions) error { + f.mu.Lock() + defer f.mu.Unlock() + f.containerStartCalls = append(f.containerStartCalls, id) + return nil +} + +func (f *fakeDockerClient) ContainerWait(ctx context.Context, id string, condition container.WaitCondition) (<-chan container.WaitResponse, <-chan error) { + waitCh := make(chan container.WaitResponse, 1) + errCh := make(chan error) + f.mu.Lock() + code := f.migrationExitCode + f.mu.Unlock() + waitCh <- container.WaitResponse{StatusCode: code} + close(waitCh) + return waitCh, errCh +} + +func (f *fakeDockerClient) CopyToContainer(ctx context.Context, container, path string, content io.Reader, options container.CopyToContainerOptions) error { + return errors.New("not implemented") +} + +func (f *fakeDockerClient) ImageInspect(ctx context.Context, img string, opts ...client.ImageInspectOption) (image.InspectResponse, error) { + return image.InspectResponse{}, errors.New("not implemented") +} + +func (f *fakeDockerClient) ImagePull(ctx context.Context, ref string, opts image.PullOptions) (io.ReadCloser, error) { + return nil, errors.New("not implemented") +} + +func (f *fakeDockerClient) VolumeCreate(ctx context.Context, options volume.CreateOptions) (volume.Volume, error) { + f.mu.Lock() + defer f.mu.Unlock() + f.volumeCreateCalls = append(f.volumeCreateCalls, options) + v := volume.Volume{Name: options.Name, Labels: options.Labels} + f.volumes[options.Name] = v + return v, nil +} + +func (f *fakeDockerClient) VolumeInspect(ctx context.Context, name string) (volume.Volume, error) { + f.mu.Lock() + defer f.mu.Unlock() + f.volumeInspectCalls = append(f.volumeInspectCalls, name) + if v, ok := f.volumes[name]; ok { + return v, nil + } + return volume.Volume{}, errors.New("No such volume: " + name) +} + +func (f *fakeDockerClient) VolumeRemove(ctx context.Context, name string, force bool) error { + f.mu.Lock() + defer f.mu.Unlock() + f.volumeRemoveCalls = append(f.volumeRemoveCalls, name) + if _, ok := f.volumes[name]; ok { + delete(f.volumes, name) + return nil + } + return errors.New("No such volume: " + name) +} + +func runningContainer(name string) container.InspectResponse { + return container.InspectResponse{ + ContainerJSONBase: &container.ContainerJSONBase{ + Name: name, + State: &container.State{Running: true}, + }, + } +} + +const migrateTestWorkspaceID = "abcdef1234567890" + +func TestResolveConfigVolumeName_LegacyExists_MigratesInPlace(t *testing.T) { + ctx := context.Background() + cli := newFakeDockerClient() + p := &Provisioner{cli: cli} + + newName := ConfigVolumeName(migrateTestWorkspaceID) + legacyName := legacyConfigVolumeName(migrateTestWorkspaceID) + cli.volumes[legacyName] = volume.Volume{Name: legacyName} + + got := p.resolveConfigVolumeName(ctx, migrateTestWorkspaceID) + if got != newName { + t.Fatalf("resolveConfigVolumeName: got %q, want %q", got, newName) + } + + // Migration path must inspect both names, then create new volume and copy. + if !strings.Contains(strings.Join(cli.volumeInspectCalls, ","), legacyName) { + t.Fatalf("expected VolumeInspect(%s) for legacy presence check", legacyName) + } + if !strings.Contains(strings.Join(cli.volumeInspectCalls, ","), newName) { + t.Fatalf("expected VolumeInspect(%s) to check for existing new volume", newName) + } + if len(cli.volumeCreateCalls) == 0 || cli.volumeCreateCalls[0].Name != newName { + t.Fatalf("expected VolumeCreate(%s)", newName) + } + if len(cli.containerCreateCalls) == 0 { + t.Fatal("expected migration container creation") + } + if len(cli.containerStartCalls) == 0 { + t.Fatal("expected migration container start") + } + if len(cli.containerRemoveCalls) == 0 { + t.Fatal("expected migration container removal") + } + + // Legacy volume must be removed so it is not orphaned. + removed := false + for _, n := range cli.volumeRemoveCalls { + if n == legacyName { + removed = true + break + } + } + if !removed { + t.Fatalf("legacy volume %s was not removed after migration — orphan risk", legacyName) + } + if _, ok := cli.volumes[legacyName]; ok { + t.Fatalf("legacy volume %s still present in fake client after migration", legacyName) + } + if _, ok := cli.volumes[newName]; !ok { + t.Fatalf("new volume %s must exist after migration", newName) + } +} + +func TestResolveConfigVolumeName_LegacyAbsent_NoMigration(t *testing.T) { + ctx := context.Background() + cli := newFakeDockerClient() + p := &Provisioner{cli: cli} + + newName := ConfigVolumeName(migrateTestWorkspaceID) + legacyName := legacyConfigVolumeName(migrateTestWorkspaceID) + + got := p.resolveConfigVolumeName(ctx, migrateTestWorkspaceID) + if got != newName { + t.Fatalf("resolveConfigVolumeName: got %q, want %q", got, newName) + } + + // Should check legacy once and short-circuit. + if len(cli.volumeInspectCalls) != 1 || cli.volumeInspectCalls[0] != legacyName { + t.Fatalf("expected exactly one VolumeInspect call for legacy name, got %v", cli.volumeInspectCalls) + } + if len(cli.volumeCreateCalls) != 0 { + t.Fatalf("expected no VolumeCreate when legacy absent, got %v", cli.volumeCreateCalls) + } + if len(cli.volumeRemoveCalls) != 0 { + t.Fatalf("expected no VolumeRemove when legacy absent, got %v", cli.volumeRemoveCalls) + } +} + +func TestResolveClaudeSessionVolumeName_LegacyExists_MigratesInPlace(t *testing.T) { + ctx := context.Background() + cli := newFakeDockerClient() + p := &Provisioner{cli: cli} + + newName := ClaudeSessionVolumeName(migrateTestWorkspaceID) + legacyName := legacyClaudeSessionVolumeName(migrateTestWorkspaceID) + cli.volumes[legacyName] = volume.Volume{Name: legacyName} + + got := p.resolveClaudeSessionVolumeName(ctx, migrateTestWorkspaceID) + if got != newName { + t.Fatalf("resolveClaudeSessionVolumeName: got %q, want %q", got, newName) + } + + removed := false + for _, n := range cli.volumeRemoveCalls { + if n == legacyName { + removed = true + break + } + } + if !removed { + t.Fatalf("legacy session volume %s was not removed after migration — orphan risk", legacyName) + } + if _, ok := cli.volumes[newName]; !ok { + t.Fatalf("new session volume %s must exist after migration", newName) + } +} + +func TestResolveClaudeSessionVolumeName_LegacyAbsent_NoMigration(t *testing.T) { + ctx := context.Background() + cli := newFakeDockerClient() + p := &Provisioner{cli: cli} + + newName := ClaudeSessionVolumeName(migrateTestWorkspaceID) + legacyName := legacyClaudeSessionVolumeName(migrateTestWorkspaceID) + + got := p.resolveClaudeSessionVolumeName(ctx, migrateTestWorkspaceID) + if got != newName { + t.Fatalf("resolveClaudeSessionVolumeName: got %q, want %q", got, newName) + } + if len(cli.volumeInspectCalls) != 1 || cli.volumeInspectCalls[0] != legacyName { + t.Fatalf("expected exactly one VolumeInspect call for legacy session name, got %v", cli.volumeInspectCalls) + } + if len(cli.volumeCreateCalls) != 0 { + t.Fatalf("expected no VolumeCreate when legacy absent, got %v", cli.volumeCreateCalls) + } +} + +func TestMigrateVolumeIfNeeded_CopyFails_PreservesLegacy(t *testing.T) { + ctx := context.Background() + cli := newFakeDockerClient() + p := &Provisioner{cli: cli} + + newName := ConfigVolumeName(migrateTestWorkspaceID) + legacyName := legacyConfigVolumeName(migrateTestWorkspaceID) + cli.volumes[legacyName] = volume.Volume{Name: legacyName} + cli.migrationExitCode = 1 + + if err := p.migrateVolumeIfNeeded(ctx, newName, legacyName); err == nil { + t.Fatal("expected migration error when copy container exits non-zero") + } + + // Legacy volume must survive a failed copy so no data is lost. + if _, ok := cli.volumes[legacyName]; !ok { + t.Fatal("legacy volume must be preserved when migration copy fails (data-loss guard)") + } +} + +func TestStop_FullIDAbsent_LegacyRemoved(t *testing.T) { + ctx := context.Background() + cli := newFakeDockerClient() + p := &Provisioner{cli: cli} + + newName := ContainerName(migrateTestWorkspaceID) + legacyName := legacyContainerName(migrateTestWorkspaceID) + cli.containers[legacyName] = runningContainer(legacyName) + + if err := p.Stop(ctx, migrateTestWorkspaceID); err != nil { + t.Fatalf("Stop failed: %v", err) + } + + if len(cli.containerRemoveCalls) < 2 { + t.Fatalf("expected Remove on full-id then legacy, got %v", cli.containerRemoveCalls) + } + if cli.containerRemoveCalls[0] != newName { + t.Fatalf("expected first remove target %q, got %q", newName, cli.containerRemoveCalls[0]) + } + if cli.containerRemoveCalls[1] != legacyName { + t.Fatalf("expected second remove target %q, got %q", legacyName, cli.containerRemoveCalls[1]) + } + if _, ok := cli.containers[legacyName]; ok { + t.Fatal("legacy container still present after Stop") + } +} + +func TestStop_BothAbsent_IsNoOp(t *testing.T) { + ctx := context.Background() + cli := newFakeDockerClient() + p := &Provisioner{cli: cli} + + newName := ContainerName(migrateTestWorkspaceID) + legacyName := legacyContainerName(migrateTestWorkspaceID) + + if err := p.Stop(ctx, migrateTestWorkspaceID); err != nil { + t.Fatalf("Stop failed: %v", err) + } + + if len(cli.containerRemoveCalls) != 2 { + t.Fatalf("expected 2 remove attempts, got %v", cli.containerRemoveCalls) + } + if cli.containerRemoveCalls[0] != newName || cli.containerRemoveCalls[1] != legacyName { + t.Fatalf("expected remove order [%q, %q], got %v", newName, legacyName, cli.containerRemoveCalls) + } +} + +func TestRunningContainerName_LegacyRunning_RenameFails_FallsBackToLegacy(t *testing.T) { + ctx := context.Background() + cli := newFakeDockerClient() + + newName := ContainerName(migrateTestWorkspaceID) + legacyName := legacyContainerName(migrateTestWorkspaceID) + cli.containers[legacyName] = runningContainer(legacyName) + cli.renameErr = errors.New("daemon rename failed") + + got, err := RunningContainerName(ctx, cli, migrateTestWorkspaceID) + if err != nil { + t.Fatalf("RunningContainerName returned error: %v", err) + } + if got != legacyName { + t.Fatalf("expected fallback to legacy name %q, got %q", legacyName, got) + } + + if len(cli.containerInspectCalls) < 2 { + t.Fatalf("expected inspect of legacy and new names, got %v", cli.containerInspectCalls) + } + if cli.containerInspectCalls[0] != legacyName { + t.Fatalf("expected legacy inspect first, got %v", cli.containerInspectCalls) + } + + renamed := false + for _, r := range cli.containerRenameCalls { + if r.Old == legacyName && r.New == newName { + renamed = true + break + } + } + if !renamed { + t.Fatalf("expected rename attempt %q -> %q, got %v", legacyName, newName, cli.containerRenameCalls) + } +} + +func TestRunningContainerName_LegacyRunning_RenameSucceeds(t *testing.T) { + ctx := context.Background() + cli := newFakeDockerClient() + + newName := ContainerName(migrateTestWorkspaceID) + legacyName := legacyContainerName(migrateTestWorkspaceID) + cli.containers[legacyName] = runningContainer(legacyName) + + got, err := RunningContainerName(ctx, cli, migrateTestWorkspaceID) + if err != nil { + t.Fatalf("RunningContainerName returned error: %v", err) + } + if got != newName { + t.Fatalf("expected new name %q after rename, got %q", newName, got) + } + if _, ok := cli.containers[legacyName]; ok { + t.Fatal("legacy container should have been renamed away") + } + if _, ok := cli.containers[newName]; !ok { + t.Fatal("new container name should exist after rename") + } +} + +func TestRunningContainerName_NewRunning_ReturnsNew(t *testing.T) { + ctx := context.Background() + cli := newFakeDockerClient() + + newName := ContainerName(migrateTestWorkspaceID) + cli.containers[newName] = runningContainer(newName) + + got, err := RunningContainerName(ctx, cli, migrateTestWorkspaceID) + if err != nil { + t.Fatalf("RunningContainerName returned error: %v", err) + } + if got != newName { + t.Fatalf("expected new name %q, got %q", newName, got) + } +} + +func TestRunningContainerName_BothAbsent_ReturnsEmpty(t *testing.T) { + ctx := context.Background() + cli := newFakeDockerClient() + + got, err := RunningContainerName(ctx, cli, migrateTestWorkspaceID) + if err != nil { + t.Fatalf("RunningContainerName returned error: %v", err) + } + if got != "" { + t.Fatalf("expected empty name when neither container exists, got %q", got) + } +} diff --git a/workspace-server/internal/provisioner/provisioner_test.go b/workspace-server/internal/provisioner/provisioner_test.go index da597a824..988c1d580 100644 --- a/workspace-server/internal/provisioner/provisioner_test.go +++ b/workspace-server/internal/provisioner/provisioner_test.go @@ -2,14 +2,19 @@ package provisioner import ( "archive/tar" + "context" "errors" "io" "os" "path/filepath" + "strconv" "strings" "testing" + "time" "github.com/docker/docker/api/types/container" + "github.com/docker/docker/api/types/volume" + "github.com/docker/docker/client" ) // TestValidateConfigSource covers issue #17: a workspace restart with no @@ -1422,3 +1427,119 @@ func TestApplyTierConfig_T3_DefaultCap(t *testing.T) { t.Errorf("T3 default NanoCPUs: got %d, want %d", hc.NanoCPUs, wantCPU) } } + +// TestMigrateVolumeIfNeeded_ExistingTruncatedVolume verifies the KI-013 deploy +// safety path: when a legacy truncated-name volume already exists, data is +// copied to the new full-ID name and the legacy volume is removed. Existing +// workspace state is preserved without operator intervention. +func TestMigrateVolumeIfNeeded_ExistingTruncatedVolume(t *testing.T) { + ctx := context.Background() + + cli, err := client.NewClientWithOpts(client.FromEnv, client.WithAPIVersionNegotiation()) + if err != nil { + t.Skip("docker client unavailable:", err) + } + if _, pingErr := cli.Ping(ctx); pingErr != nil { + t.Skip("docker daemon unreachable:", pingErr) + } + + p := &Provisioner{cli: cli} + workspaceID := "test-migrate-" + strconv.FormatInt(time.Now().UnixNano(), 10) + legacyName := legacyConfigVolumeName(workspaceID) + newName := ConfigVolumeName(workspaceID) + + // Cleanup before and after (defensive — avoid pollution on retries). + _ = cli.VolumeRemove(ctx, legacyName, true) + _ = cli.VolumeRemove(ctx, newName, true) + defer func() { + _ = cli.VolumeRemove(ctx, legacyName, true) + _ = cli.VolumeRemove(ctx, newName, true) + }() + + // 1. Create legacy volume and seed it with a sentinel file. + if _, err := cli.VolumeCreate(ctx, volume.CreateOptions{Name: legacyName}); err != nil { + t.Fatalf("create legacy volume: %v", err) + } + seedResp, err := cli.ContainerCreate(ctx, &container.Config{ + Image: "alpine", + Cmd: []string{"sh", "-c", "echo sentinel-data > /vol/sentinel.txt"}, + }, &container.HostConfig{ + Binds: []string{legacyName + ":/vol"}, + }, nil, nil, "") + if err != nil { + t.Fatalf("create seed container: %v", err) + } + defer cli.ContainerRemove(ctx, seedResp.ID, container.RemoveOptions{Force: true}) + if err := cli.ContainerStart(ctx, seedResp.ID, container.StartOptions{}); err != nil { + t.Fatalf("start seed container: %v", err) + } + waitCh, errCh := cli.ContainerWait(ctx, seedResp.ID, container.WaitConditionNotRunning) + select { + case <-waitCh: + case err := <-errCh: + if err != nil { + t.Fatalf("seed container failed: %v", err) + } + } + // Remove the seed container before migration so the legacy volume is + // no longer referenced by any container. The deferred remove above is a + // safety net for panic/early-return paths. + if err := cli.ContainerRemove(ctx, seedResp.ID, container.RemoveOptions{Force: true}); err != nil { + t.Fatalf("remove seed container: %v", err) + } + + // 2. Run migration. + if err := p.migrateVolumeIfNeeded(ctx, newName, legacyName); err != nil { + t.Fatalf("migrateVolumeIfNeeded failed: %v", err) + } + + // 3. Legacy volume must be gone. + if _, inspectErr := cli.VolumeInspect(ctx, legacyName); inspectErr == nil { + t.Fatalf("legacy volume %s still exists after migration", legacyName) + } + + // 4. New volume must exist and contain the sentinel file. + if _, inspectErr := cli.VolumeInspect(ctx, newName); inspectErr != nil { + t.Fatalf("new volume %s does not exist after migration: %v", newName, inspectErr) + } + + readResp, err := cli.ContainerCreate(ctx, &container.Config{ + Image: "alpine", + Cmd: []string{"cat", "/vol/sentinel.txt"}, + }, &container.HostConfig{ + Binds: []string{newName + ":/vol"}, + }, nil, nil, "") + if err != nil { + t.Fatalf("create read container: %v", err) + } + defer cli.ContainerRemove(ctx, readResp.ID, container.RemoveOptions{Force: true}) + if err := cli.ContainerStart(ctx, readResp.ID, container.StartOptions{}); err != nil { + t.Fatalf("start read container: %v", err) + } + waitCh, errCh = cli.ContainerWait(ctx, readResp.ID, container.WaitConditionNotRunning) + select { + case <-waitCh: + case err := <-errCh: + if err != nil { + t.Fatalf("read container failed: %v", err) + } + } + + logs, err := cli.ContainerLogs(ctx, readResp.ID, container.LogsOptions{ShowStdout: true}) + if err != nil { + t.Fatalf("read container logs: %v", err) + } + defer logs.Close() + data, err := io.ReadAll(logs) + if err != nil { + t.Fatalf("read logs: %v", err) + } + if !strings.Contains(string(data), "sentinel-data") { + t.Fatalf("new volume missing sentinel data; logs: %q", data) + } + + // 5. Idempotency: second migration must be a no-op. + if err := p.migrateVolumeIfNeeded(ctx, newName, legacyName); err != nil { + t.Fatalf("second migration (idempotency) failed: %v", err) + } +}