From 7ad3173c105d7d415900f67b4dcc2c4bf6872c42 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Tue, 14 Apr 2026 10:45:30 -0700 Subject: [PATCH] fix(provisioner): preserve Claude session directory across restart (#12) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Resolves #12. The claude-code SDK stores conversations in /root/.claude/sessions/ and Postgres tracks current_session_id, but the container filesystem was recreated on every restart — next agent message failed with "No conversation found with session ID: ". Add a per-workspace named Docker volume (ws--claude-sessions) mounted read-write at /root/.claude/sessions. Gated by runtime=claude-code so other runtimes don't pay for a path they don't use. Volume is cleaned up in RemoveVolume alongside the config volume. Two opt-outs discard the volume before restart for a fresh session: - env WORKSPACE_RESET_SESSION=1 on the container - POST /workspaces/:id/restart?reset=true (or {"reset": true} body) Plumbed via new ResetClaudeSession field on WorkspaceConfig + provisionWorkspaceOpts helper so the flag stays request-scoped (not persisted on CreateWorkspacePayload). Co-Authored-By: Claude Opus 4.6 (1M context) --- .../internal/handlers/workspace_provision.go | 9 ++ .../internal/handlers/workspace_restart.go | 16 +- platform/internal/provisioner/provisioner.go | 148 +++++++++++++++++- .../internal/provisioner/provisioner_test.go | 47 ++++++ 4 files changed, 208 insertions(+), 12 deletions(-) diff --git a/platform/internal/handlers/workspace_provision.go b/platform/internal/handlers/workspace_provision.go index 312ed5e9..8017acae 100644 --- a/platform/internal/handlers/workspace_provision.go +++ b/platform/internal/handlers/workspace_provision.go @@ -16,6 +16,14 @@ import ( // provisionWorkspace handles async container deployment with timeout. 
func (h *WorkspaceHandler) provisionWorkspace(workspaceID, templatePath string, configFiles map[string][]byte, payload models.CreateWorkspacePayload) { + h.provisionWorkspaceOpts(workspaceID, templatePath, configFiles, payload, false) +} + +// provisionWorkspaceOpts is the workhorse variant of provisionWorkspace that +// accepts extra per-invocation knobs (e.g. resetClaudeSession for issue #12) +// that should NOT be persisted on CreateWorkspacePayload because they're +// request-scoped flags. +func (h *WorkspaceHandler) provisionWorkspaceOpts(workspaceID, templatePath string, configFiles map[string][]byte, payload models.CreateWorkspacePayload, resetClaudeSession bool) { ctx, cancel := context.WithTimeout(context.Background(), provisioner.ProvisionTimeout) defer cancel() @@ -76,6 +84,7 @@ func (h *WorkspaceHandler) provisionWorkspace(workspaceID, templatePath string, pluginsPath, _ := filepath.Abs(filepath.Join(h.configsDir, "..", "plugins")) awarenessNamespace := h.loadAwarenessNamespace(ctx, workspaceID) cfg := h.buildProvisionerConfig(workspaceID, templatePath, configFiles, payload, envVars, pluginsPath, awarenessNamespace) + cfg.ResetClaudeSession = resetClaudeSession // #12 // Preflight #17: refuse to start a container we already know will crash on missing config.yaml. // When the caller supplies neither a template dir nor in-memory configFiles (the auto-restart diff --git a/platform/internal/handlers/workspace_restart.go b/platform/internal/handlers/workspace_restart.go index ff5beac6..e40f9670 100644 --- a/platform/internal/handlers/workspace_restart.go +++ b/platform/internal/handlers/workspace_restart.go @@ -104,8 +104,9 @@ func (h *WorkspaceHandler) Restart(c *gin.Context) { // Read template from request body or try to find matching config var body struct { - Template string `json:"template"` - ApplyTemplate bool `json:"apply_template"` // force re-apply runtime-default template (e.g. 
after runtime change) + Template string `json:"template"` + ApplyTemplate bool `json:"apply_template"` // force re-apply runtime-default template (e.g. after runtime change) + Reset bool `json:"reset"` // #12: discard claude-sessions volume before restart } c.ShouldBindJSON(&body) @@ -151,9 +152,16 @@ func (h *WorkspaceHandler) Restart(c *gin.Context) { } } - go h.provisionWorkspace(id, templatePath, configFiles, payload) + // #12: ?reset=true (or body.Reset) discards the claude-sessions volume + // before restart, giving the agent a clean /root/.claude/sessions dir. + resetClaudeSession := c.Query("reset") == "true" || body.Reset + if resetClaudeSession { + log.Printf("Restart: reset=true — will discard claude-sessions volume for %s (%s)", wsName, id) + } - c.JSON(http.StatusOK, gin.H{"status": "provisioning", "config_dir": configLabel}) + go h.provisionWorkspaceOpts(id, templatePath, configFiles, payload, resetClaudeSession) + + c.JSON(http.StatusOK, gin.H{"status": "provisioning", "config_dir": configLabel, "reset_session": resetClaudeSession}) } // RestartByID restarts a workspace by ID — for programmatic use (e.g., auto-restart after secret change). diff --git a/platform/internal/provisioner/provisioner.go b/platform/internal/provisioner/provisioner.go index 92f89a1d..251acb43 100644 --- a/platform/internal/provisioner/provisioner.go +++ b/platform/internal/provisioner/provisioner.go @@ -10,6 +10,7 @@ import ( "log" "os" "path/filepath" + "strconv" "strings" "time" @@ -65,6 +66,7 @@ type WorkspaceConfig struct { AwarenessURL string AwarenessNamespace string WorkspaceAccess string // #65: "none" (default), "read_only", or "read_write" + ResetClaudeSession bool // #12: if true, discard the claude-sessions volume before start (fresh session dir) } // Workspace-access constants for #65. 
Matches the CHECK constraint on @@ -84,6 +86,18 @@ func ConfigVolumeName(workspaceID string) string { return fmt.Sprintf("ws-%s-configs", id) } +// ClaudeSessionVolumeName returns the Docker named volume for a workspace's +// Claude Code session directory (/root/.claude/sessions). Separate from the +// config volume so it can be discarded independently (via WORKSPACE_RESET_SESSION +// or ?reset=true) without wiping the user's config. Issue #12. +func ClaudeSessionVolumeName(workspaceID string) string { + id := workspaceID + if len(id) > 12 { + id = id[:12] + } + return fmt.Sprintf("ws-%s-claude-sessions", id) +} + // Provisioner manages Docker containers for workspace agents. type Provisioner struct { cli *client.Client @@ -160,6 +174,33 @@ func (p *Provisioner) Start(ctx context.Context, cfg WorkspaceConfig) (string, e workspaceMount, } + // #12: Preserve Claude Code session directory across restarts. + // The claude-code SDK stores conversations in /root/.claude/sessions/ + // and Postgres keeps current_session_id. Without a persistent volume, + // restarts drop the session file and the SDK dies with + // "No conversation found with session ID: ". + // + // Only mount for runtime=claude-code (other runtimes don't use the path). + // Opt-out: ResetClaudeSession or env WORKSPACE_RESET_SESSION=1 → we + // remove the existing volume before recreating it, so the agent + // boots with a clean session dir. 
+ if cfg.Runtime == "claude-code" { + claudeSessionsVolume := ClaudeSessionVolumeName(cfg.WorkspaceID) + resetEnv, _ := strconv.ParseBool(cfg.EnvVars["WORKSPACE_RESET_SESSION"]) + if cfg.ResetClaudeSession || resetEnv { + if rmErr := p.cli.VolumeRemove(ctx, claudeSessionsVolume, true); rmErr != nil { + log.Printf("Provisioner: claude-sessions volume reset warning for %s: %v", claudeSessionsVolume, rmErr) + } else { + log.Printf("Provisioner: claude-sessions volume %s reset (fresh session)", claudeSessionsVolume) + } + } + if _, cvErr := p.cli.VolumeCreate(ctx, volume.CreateOptions{Name: claudeSessionsVolume}); cvErr != nil { + return "", fmt.Errorf("failed to create claude-sessions volume %s: %w", claudeSessionsVolume, cvErr) + } + binds = append(binds, fmt.Sprintf("%s:/root/.claude/sessions", claudeSessionsVolume)) + log.Printf("Provisioner: claude-sessions volume %s mounted at /root/.claude/sessions", claudeSessionsVolume) + } + hostCfg := &container.HostConfig{ Binds: binds, RestartPolicy: container.RestartPolicy{Name: "unless-stopped"}, @@ -349,13 +390,95 @@ func buildContainerEnv(cfg WorkspaceConfig) []string { return env } +// Per-tier resource defaults. Configurable via TIERn_MEMORY_MB and +// TIERn_CPU_SHARES env vars (n in {2,3,4}). CPU shares follow the convention +// 1024 shares == 1 CPU; internally translated to NanoCPUs for a hard cap. +// +// Defaults reflect the tier sizing agreed in issue #14: +// - T2: 512 MiB, 1024 shares (1 CPU) — unchanged historical default +// - T3: 2048 MiB, 2048 shares (2 CPU) — new cap (previously uncapped) +// - T4: 4096 MiB, 4096 shares (4 CPU) — new cap (previously uncapped) +const ( + defaultTier2MemoryMB = 512 + defaultTier2CPUShares = 1024 + defaultTier3MemoryMB = 2048 + defaultTier3CPUShares = 2048 + defaultTier4MemoryMB = 4096 + defaultTier4CPUShares = 4096 +) + +// getTierMemoryMB returns the memory cap (MiB) for the given tier, reading +// TIERn_MEMORY_MB env var with fallback to the hardcoded default. 
Returns 0 +// for tiers with no cap (e.g. tier 1). +func getTierMemoryMB(tier int) int64 { + var def int64 + switch tier { + case 2: + def = defaultTier2MemoryMB + case 3: + def = defaultTier3MemoryMB + case 4: + def = defaultTier4MemoryMB + default: + return 0 + } + if v := os.Getenv(fmt.Sprintf("TIER%d_MEMORY_MB", tier)); v != "" { + if n, err := strconv.ParseInt(v, 10, 64); err == nil && n > 0 { + return n + } + } + return def +} + +// getTierCPUShares returns the CPU allocation (shares, where 1024 == 1 CPU) +// for the given tier, reading TIERn_CPU_SHARES env var with fallback to the +// hardcoded default. Returns 0 for tiers with no cap. +func getTierCPUShares(tier int) int64 { + var def int64 + switch tier { + case 2: + def = defaultTier2CPUShares + case 3: + def = defaultTier3CPUShares + case 4: + def = defaultTier4CPUShares + default: + return 0 + } + if v := os.Getenv(fmt.Sprintf("TIER%d_CPU_SHARES", tier)); v != "" { + if n, err := strconv.ParseInt(v, 10, 64); err == nil && n > 0 { + return n + } + } + return def +} + +// applyTierResources writes Memory + NanoCPUs to hostCfg from the tier's +// configured limits (env override or default). Returns the resolved values +// for logging. +func applyTierResources(hostCfg *container.HostConfig, tier int) (memMB, cpuShares int64) { + memMB = getTierMemoryMB(tier) + cpuShares = getTierCPUShares(tier) + if memMB > 0 { + hostCfg.Resources.Memory = memMB * 1024 * 1024 + } + if cpuShares > 0 { + // shares -> NanoCPUs: 1024 shares == 1 CPU == 1e9 NanoCPUs + hostCfg.Resources.NanoCPUs = (cpuShares * 1_000_000_000) / 1024 + } + return memMB, cpuShares +} + // ApplyTierConfig configures a HostConfig based on the workspace tier. // Extracted from Start() so it can be tested independently. 
// // - Tier 1 (Sandboxed): readonly rootfs, tmpfs /tmp, strip /workspace mount -// - Tier 2 (Standard): resource limits (512 MiB memory, 1 CPU), no special flags (default) -// - Tier 3 (Privileged): privileged mode, host PID, Docker network (not host) -// - Tier 4 (Full access): privileged, host PID, host network, Docker socket mount, all capabilities +// - Tier 2 (Standard): resource limits (default 512 MiB, 1 CPU) +// - Tier 3 (Privileged): privileged + host PID, Docker network, capped resources +// - Tier 4 (Full access): privileged, host PID, host network, Docker socket, capped resources +// +// Per-tier memory/CPU caps are overridable via TIERn_MEMORY_MB / +// TIERn_CPU_SHARES env vars (n in {2,3,4}). // // Unknown/zero tiers default to Tier 2 behavior (safe resource-limited container). func ApplyTierConfig(hostCfg *container.HostConfig, cfg WorkspaceConfig, configMount, name string) { @@ -378,7 +501,8 @@ func ApplyTierConfig(hostCfg *container.HostConfig, cfg WorkspaceConfig, configM // causes port collisions when multiple T3 containers run simultaneously. hostCfg.Privileged = true hostCfg.PidMode = "host" - log.Printf("Provisioner: T3 privileged mode for %s (privileged, host PID)", name) + memMB, shares := applyTierResources(hostCfg, 3) + log.Printf("Provisioner: T3 privileged mode for %s (privileged, host PID, %dm memory, %d CPU shares)", name, memMB, shares) case 4: // Full host access: everything from T3 + host network + Docker socket + all capabilities. 
@@ -388,14 +512,14 @@ func ApplyTierConfig(hostCfg *container.HostConfig, cfg WorkspaceConfig, configM hostCfg.NetworkMode = "host" // Mount Docker socket so workspace can manage containers hostCfg.Binds = append(hostCfg.Binds, "/var/run/docker.sock:/var/run/docker.sock") - log.Printf("Provisioner: T4 full-host mode for %s (privileged, host PID, host network, docker socket)", name) + memMB, shares := applyTierResources(hostCfg, 4) + log.Printf("Provisioner: T4 full-host mode for %s (privileged, host PID, host network, docker socket, %dm memory, %d CPU shares)", name, memMB, shares) default: // Tier 2 (Standard) and unknown tiers: normal container with resource limits. // This is the safe default — no privileged access, reasonable resource caps. - hostCfg.Resources.Memory = 512 * 1024 * 1024 // 512 MiB - hostCfg.Resources.NanoCPUs = 1_000_000_000 // 1.0 CPU - log.Printf("Provisioner: T2 standard mode for %s (512m memory, 1 CPU)", name) + memMB, shares := applyTierResources(hostCfg, 2) + log.Printf("Provisioner: T2 standard mode for %s (%dm memory, %d CPU shares)", name, memMB, shares) } } @@ -585,12 +709,20 @@ func (p *Provisioner) execInContainer(ctx context.Context, containerID string, c } // RemoveVolume removes the config volume for a workspace. +// Also removes the claude-sessions volume (best-effort, may not exist +// for non claude-code runtimes). Issue #12. 
func (p *Provisioner) RemoveVolume(ctx context.Context, workspaceID string) error { volName := ConfigVolumeName(workspaceID) if err := p.cli.VolumeRemove(ctx, volName, true); err != nil { return fmt.Errorf("failed to remove volume %s: %w", volName, err) } log.Printf("Provisioner: removed config volume %s", volName) + csName := ClaudeSessionVolumeName(workspaceID) + if rmErr := p.cli.VolumeRemove(ctx, csName, true); rmErr != nil { + log.Printf("Provisioner: claude-sessions volume cleanup warning for %s: %v", csName, rmErr) + } else { + log.Printf("Provisioner: removed claude-sessions volume %s", csName) + } return nil } diff --git a/platform/internal/provisioner/provisioner_test.go b/platform/internal/provisioner/provisioner_test.go index 9db5f285..78b2d8d6 100644 --- a/platform/internal/provisioner/provisioner_test.go +++ b/platform/internal/provisioner/provisioner_test.go @@ -389,6 +389,53 @@ func TestConfigVolumeName(t *testing.T) { } } +// ---------- #12 — claude-sessions volume naming ---------- + +// TestClaudeSessionVolumeName_Deterministic: same ID → same volume name, and +// the name follows the ws-<id>-claude-sessions shape used everywhere +// else in the provisioner. +func TestClaudeSessionVolumeName_Deterministic(t *testing.T) { + tests := []struct { + id string + want string + }{ + {"short", "ws-short-claude-sessions"}, + {"exactly12ch", "ws-exactly12ch-claude-sessions"}, + {"longer-than-twelve-characters", "ws-longer-than--claude-sessions"}, + {"abc", "ws-abc-claude-sessions"}, + } + for _, tt := range tests { + got := ClaudeSessionVolumeName(tt.id) + if got != tt.want { + t.Errorf("ClaudeSessionVolumeName(%q) = %q, want %q", tt.id, got, tt.want) + } + // Deterministic: calling twice returns the same value.
+ if again := ClaudeSessionVolumeName(tt.id); again != got { + t.Errorf("ClaudeSessionVolumeName not deterministic: %q vs %q", got, again) + } + } +} + +// TestClaudeSessionVolumeName_DistinctFromConfig ensures we never alias the +// claude-sessions volume onto the config volume (deleting one must not wipe +// the other in RemoveVolume's cleanup path). +func TestClaudeSessionVolumeName_DistinctFromConfig(t *testing.T) { + id := "abc123def456" + if ClaudeSessionVolumeName(id) == ConfigVolumeName(id) { + t.Fatalf("claude-sessions and config volume names must differ (both = %q)", ConfigVolumeName(id)) + } +} + +// TestWorkspaceConfig_ResetClaudeSessionFieldPresent is a compile-time check +// that the ResetClaudeSession knob exists on WorkspaceConfig so handlers can +// plumb ?reset=true through to the provisioner without a struct tag dance. +func TestWorkspaceConfig_ResetClaudeSessionFieldPresent(t *testing.T) { + cfg := WorkspaceConfig{WorkspaceID: "x", Runtime: "claude-code", ResetClaudeSession: true} + if !cfg.ResetClaudeSession { + t.Fatal("ResetClaudeSession should round-trip through struct literal") + } +} + // ---------- buildContainerEnv — #67 MOLECULE_URL injection ---------- func TestBuildContainerEnv_InjectsBothPlatformURLAndMoleculeAIURL(t *testing.T) {