fix(provisioner): preserve Claude session directory across restart (#12)

Resolves #12. The claude-code SDK stores conversations in
/root/.claude/sessions/ and Postgres tracks current_session_id, but the
container filesystem was recreated on every restart — next agent message
failed with "No conversation found with session ID: <uuid>".

Add a per-workspace named Docker volume (ws-<id>-claude-sessions) mounted
read-write at /root/.claude/sessions. Gated by runtime=claude-code so
other runtimes don't pay for a path they don't use. Volume is cleaned up
in RemoveVolume alongside the config volume.

Two opt-outs discard the volume before restart for a fresh session:
  - env WORKSPACE_RESET_SESSION=1 on the container
  - POST /workspaces/:id/restart?reset=true (or {"reset": true} body)

Plumbed via new ResetClaudeSession field on WorkspaceConfig +
provisionWorkspaceOpts helper so the flag stays request-scoped (not
persisted on CreateWorkspacePayload).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Hongming Wang 2026-04-14 10:45:30 -07:00
parent a16a25b1f1
commit 4ff65b82c7
4 changed files with 208 additions and 12 deletions

View File

@ -16,6 +16,14 @@ import (
// provisionWorkspace handles async container deployment with timeout.
func (h *WorkspaceHandler) provisionWorkspace(workspaceID, templatePath string, configFiles map[string][]byte, payload models.CreateWorkspacePayload) {
h.provisionWorkspaceOpts(workspaceID, templatePath, configFiles, payload, false)
}
// provisionWorkspaceOpts is the workhorse variant of provisionWorkspace that
// accepts extra per-invocation knobs (e.g. resetClaudeSession for issue #12)
// that should NOT be persisted on CreateWorkspacePayload because they're
// request-scoped flags.
func (h *WorkspaceHandler) provisionWorkspaceOpts(workspaceID, templatePath string, configFiles map[string][]byte, payload models.CreateWorkspacePayload, resetClaudeSession bool) {
ctx, cancel := context.WithTimeout(context.Background(), provisioner.ProvisionTimeout)
defer cancel()
@ -76,6 +84,7 @@ func (h *WorkspaceHandler) provisionWorkspace(workspaceID, templatePath string,
pluginsPath, _ := filepath.Abs(filepath.Join(h.configsDir, "..", "plugins"))
awarenessNamespace := h.loadAwarenessNamespace(ctx, workspaceID)
cfg := h.buildProvisionerConfig(workspaceID, templatePath, configFiles, payload, envVars, pluginsPath, awarenessNamespace)
cfg.ResetClaudeSession = resetClaudeSession // #12
// Preflight #17: refuse to start a container we already know will crash on missing config.yaml.
// When the caller supplies neither a template dir nor in-memory configFiles (the auto-restart

View File

@ -104,8 +104,9 @@ func (h *WorkspaceHandler) Restart(c *gin.Context) {
// Read template from request body or try to find matching config
var body struct {
Template string `json:"template"`
ApplyTemplate bool `json:"apply_template"` // force re-apply runtime-default template (e.g. after runtime change)
Template string `json:"template"`
ApplyTemplate bool `json:"apply_template"` // force re-apply runtime-default template (e.g. after runtime change)
Reset bool `json:"reset"` // #12: discard claude-sessions volume before restart
}
c.ShouldBindJSON(&body)
@ -151,9 +152,16 @@ func (h *WorkspaceHandler) Restart(c *gin.Context) {
}
}
go h.provisionWorkspace(id, templatePath, configFiles, payload)
// #12: ?reset=true (or body.Reset) discards the claude-sessions volume
// before restart, giving the agent a clean /root/.claude/sessions dir.
resetClaudeSession := c.Query("reset") == "true" || body.Reset
if resetClaudeSession {
log.Printf("Restart: reset=true — will discard claude-sessions volume for %s (%s)", wsName, id)
}
c.JSON(http.StatusOK, gin.H{"status": "provisioning", "config_dir": configLabel})
go h.provisionWorkspaceOpts(id, templatePath, configFiles, payload, resetClaudeSession)
c.JSON(http.StatusOK, gin.H{"status": "provisioning", "config_dir": configLabel, "reset_session": resetClaudeSession})
}
// RestartByID restarts a workspace by ID — for programmatic use (e.g., auto-restart after secret change).

View File

@ -10,6 +10,7 @@ import (
"log"
"os"
"path/filepath"
"strconv"
"strings"
"time"
@ -65,6 +66,7 @@ type WorkspaceConfig struct {
AwarenessURL string
AwarenessNamespace string
WorkspaceAccess string // #65: "none" (default), "read_only", or "read_write"
ResetClaudeSession bool // #12: if true, discard the claude-sessions volume before start (fresh session dir)
}
// Workspace-access constants for #65. Matches the CHECK constraint on
@ -84,6 +86,18 @@ func ConfigVolumeName(workspaceID string) string {
return fmt.Sprintf("ws-%s-configs", id)
}
// ClaudeSessionVolumeName returns the Docker named volume for a workspace's
// Claude Code session directory (/root/.claude/sessions). Separate from the
// config volume so it can be discarded independently (via WORKSPACE_RESET_SESSION
// or ?reset=true) without wiping the user's config. Issue #12.
func ClaudeSessionVolumeName(workspaceID string) string {
id := workspaceID
if len(id) > 12 {
id = id[:12]
}
return fmt.Sprintf("ws-%s-claude-sessions", id)
}
// Provisioner manages Docker containers for workspace agents.
type Provisioner struct {
cli *client.Client
@ -160,6 +174,33 @@ func (p *Provisioner) Start(ctx context.Context, cfg WorkspaceConfig) (string, e
workspaceMount,
}
// #12: Preserve Claude Code session directory across restarts.
// The claude-code SDK stores conversations in /root/.claude/sessions/
// and Postgres keeps current_session_id. Without a persistent volume,
// restarts drop the session file and the SDK dies with
// "No conversation found with session ID: <uuid>".
//
// Only mount for runtime=claude-code (other runtimes don't use the path).
// Opt-out: ResetClaudeSession or env WORKSPACE_RESET_SESSION=1 → we
// remove the existing volume before recreating it, so the agent
// boots with a clean session dir.
if cfg.Runtime == "claude-code" {
claudeSessionsVolume := ClaudeSessionVolumeName(cfg.WorkspaceID)
resetEnv, _ := strconv.ParseBool(cfg.EnvVars["WORKSPACE_RESET_SESSION"])
if cfg.ResetClaudeSession || resetEnv {
if rmErr := p.cli.VolumeRemove(ctx, claudeSessionsVolume, true); rmErr != nil {
log.Printf("Provisioner: claude-sessions volume reset warning for %s: %v", claudeSessionsVolume, rmErr)
} else {
log.Printf("Provisioner: claude-sessions volume %s reset (fresh session)", claudeSessionsVolume)
}
}
if _, cvErr := p.cli.VolumeCreate(ctx, volume.CreateOptions{Name: claudeSessionsVolume}); cvErr != nil {
return "", fmt.Errorf("failed to create claude-sessions volume %s: %w", claudeSessionsVolume, cvErr)
}
binds = append(binds, fmt.Sprintf("%s:/root/.claude/sessions", claudeSessionsVolume))
log.Printf("Provisioner: claude-sessions volume %s mounted at /root/.claude/sessions", claudeSessionsVolume)
}
hostCfg := &container.HostConfig{
Binds: binds,
RestartPolicy: container.RestartPolicy{Name: "unless-stopped"},
@ -349,13 +390,95 @@ func buildContainerEnv(cfg WorkspaceConfig) []string {
return env
}
// Per-tier resource defaults. Configurable via TIERn_MEMORY_MB and
// TIERn_CPU_SHARES env vars (n in {2,3,4}). CPU shares follow the convention
// 1024 shares == 1 CPU; internally translated to NanoCPUs for a hard cap.
//
// Defaults reflect the tier sizing agreed in issue #14:
// - T2: 512 MiB, 1024 shares (1 CPU) — unchanged historical default
// - T3: 2048 MiB, 2048 shares (2 CPU) — new cap (previously uncapped)
// - T4: 4096 MiB, 4096 shares (4 CPU) — new cap (previously uncapped)
const (
defaultTier2MemoryMB = 512
defaultTier2CPUShares = 1024
defaultTier3MemoryMB = 2048
defaultTier3CPUShares = 2048
defaultTier4MemoryMB = 4096
defaultTier4CPUShares = 4096
)
// getTierMemoryMB returns the memory cap (MiB) for the given tier, reading
// TIERn_MEMORY_MB env var with fallback to the hardcoded default. Returns 0
// for tiers with no cap (e.g. tier 1).
func getTierMemoryMB(tier int) int64 {
var def int64
switch tier {
case 2:
def = defaultTier2MemoryMB
case 3:
def = defaultTier3MemoryMB
case 4:
def = defaultTier4MemoryMB
default:
return 0
}
if v := os.Getenv(fmt.Sprintf("TIER%d_MEMORY_MB", tier)); v != "" {
if n, err := strconv.ParseInt(v, 10, 64); err == nil && n > 0 {
return n
}
}
return def
}
// getTierCPUShares returns the CPU allocation (shares, where 1024 == 1 CPU)
// for the given tier, reading TIERn_CPU_SHARES env var with fallback to the
// hardcoded default. Returns 0 for tiers with no cap.
func getTierCPUShares(tier int) int64 {
var def int64
switch tier {
case 2:
def = defaultTier2CPUShares
case 3:
def = defaultTier3CPUShares
case 4:
def = defaultTier4CPUShares
default:
return 0
}
if v := os.Getenv(fmt.Sprintf("TIER%d_CPU_SHARES", tier)); v != "" {
if n, err := strconv.ParseInt(v, 10, 64); err == nil && n > 0 {
return n
}
}
return def
}
// applyTierResources writes Memory + NanoCPUs to hostCfg from the tier's
// configured limits (env override or default). Returns the resolved values
// for logging.
func applyTierResources(hostCfg *container.HostConfig, tier int) (memMB, cpuShares int64) {
memMB = getTierMemoryMB(tier)
cpuShares = getTierCPUShares(tier)
if memMB > 0 {
hostCfg.Resources.Memory = memMB * 1024 * 1024
}
if cpuShares > 0 {
// shares -> NanoCPUs: 1024 shares == 1 CPU == 1e9 NanoCPUs
hostCfg.Resources.NanoCPUs = (cpuShares * 1_000_000_000) / 1024
}
return memMB, cpuShares
}
// ApplyTierConfig configures a HostConfig based on the workspace tier.
// Extracted from Start() so it can be tested independently.
//
// - Tier 1 (Sandboxed): readonly rootfs, tmpfs /tmp, strip /workspace mount
// - Tier 2 (Standard): resource limits (512 MiB memory, 1 CPU), no special flags (default)
// - Tier 3 (Privileged): privileged mode, host PID, Docker network (not host)
// - Tier 4 (Full access): privileged, host PID, host network, Docker socket mount, all capabilities
// - Tier 2 (Standard): resource limits (default 512 MiB, 1 CPU)
// - Tier 3 (Privileged): privileged + host PID, Docker network, capped resources
// - Tier 4 (Full access): privileged, host PID, host network, Docker socket, capped resources
//
// Per-tier memory/CPU caps are overridable via TIERn_MEMORY_MB /
// TIERn_CPU_SHARES env vars (n in {2,3,4}).
//
// Unknown/zero tiers default to Tier 2 behavior (safe resource-limited container).
func ApplyTierConfig(hostCfg *container.HostConfig, cfg WorkspaceConfig, configMount, name string) {
@ -378,7 +501,8 @@ func ApplyTierConfig(hostCfg *container.HostConfig, cfg WorkspaceConfig, configM
// causes port collisions when multiple T3 containers run simultaneously.
hostCfg.Privileged = true
hostCfg.PidMode = "host"
log.Printf("Provisioner: T3 privileged mode for %s (privileged, host PID)", name)
memMB, shares := applyTierResources(hostCfg, 3)
log.Printf("Provisioner: T3 privileged mode for %s (privileged, host PID, %dm memory, %d CPU shares)", name, memMB, shares)
case 4:
// Full host access: everything from T3 + host network + Docker socket + all capabilities.
@ -388,14 +512,14 @@ func ApplyTierConfig(hostCfg *container.HostConfig, cfg WorkspaceConfig, configM
hostCfg.NetworkMode = "host"
// Mount Docker socket so workspace can manage containers
hostCfg.Binds = append(hostCfg.Binds, "/var/run/docker.sock:/var/run/docker.sock")
log.Printf("Provisioner: T4 full-host mode for %s (privileged, host PID, host network, docker socket)", name)
memMB, shares := applyTierResources(hostCfg, 4)
log.Printf("Provisioner: T4 full-host mode for %s (privileged, host PID, host network, docker socket, %dm memory, %d CPU shares)", name, memMB, shares)
default:
// Tier 2 (Standard) and unknown tiers: normal container with resource limits.
// This is the safe default — no privileged access, reasonable resource caps.
hostCfg.Resources.Memory = 512 * 1024 * 1024 // 512 MiB
hostCfg.Resources.NanoCPUs = 1_000_000_000 // 1.0 CPU
log.Printf("Provisioner: T2 standard mode for %s (512m memory, 1 CPU)", name)
memMB, shares := applyTierResources(hostCfg, 2)
log.Printf("Provisioner: T2 standard mode for %s (%dm memory, %d CPU shares)", name, memMB, shares)
}
}
@ -585,12 +709,20 @@ func (p *Provisioner) execInContainer(ctx context.Context, containerID string, c
}
// RemoveVolume removes the config volume for a workspace.
// Also removes the claude-sessions volume (best-effort, may not exist
// for non claude-code runtimes). Issue #12.
func (p *Provisioner) RemoveVolume(ctx context.Context, workspaceID string) error {
volName := ConfigVolumeName(workspaceID)
if err := p.cli.VolumeRemove(ctx, volName, true); err != nil {
return fmt.Errorf("failed to remove volume %s: %w", volName, err)
}
log.Printf("Provisioner: removed config volume %s", volName)
csName := ClaudeSessionVolumeName(workspaceID)
if rmErr := p.cli.VolumeRemove(ctx, csName, true); rmErr != nil {
log.Printf("Provisioner: claude-sessions volume cleanup warning for %s: %v", csName, rmErr)
} else {
log.Printf("Provisioner: removed claude-sessions volume %s", csName)
}
return nil
}

View File

@ -389,6 +389,53 @@ func TestConfigVolumeName(t *testing.T) {
}
}
// ---------- #12 — claude-sessions volume naming ----------
// TestClaudeSessionVolumeName_Deterministic: same ID → same volume name, and
// the name follows the ws-<id[:12]>-claude-sessions shape used everywhere
// else in the provisioner.
func TestClaudeSessionVolumeName_Deterministic(t *testing.T) {
tests := []struct {
id string
want string
}{
{"short", "ws-short-claude-sessions"},
{"exactly12ch", "ws-exactly12ch-claude-sessions"},
{"longer-than-twelve-characters", "ws-longer-than--claude-sessions"},
{"abc", "ws-abc-claude-sessions"},
}
for _, tt := range tests {
got := ClaudeSessionVolumeName(tt.id)
if got != tt.want {
t.Errorf("ClaudeSessionVolumeName(%q) = %q, want %q", tt.id, got, tt.want)
}
// Deterministic: calling twice returns the same value.
if again := ClaudeSessionVolumeName(tt.id); again != got {
t.Errorf("ClaudeSessionVolumeName not deterministic: %q vs %q", got, again)
}
}
}
// TestClaudeSessionVolumeName_DistinctFromConfig ensures we never alias the
// claude-sessions volume onto the config volume (deleting one must not wipe
// the other in RemoveVolume's cleanup path).
func TestClaudeSessionVolumeName_DistinctFromConfig(t *testing.T) {
id := "abc123def456"
if ClaudeSessionVolumeName(id) == ConfigVolumeName(id) {
t.Fatalf("claude-sessions and config volume names must differ (both = %q)", ConfigVolumeName(id))
}
}
// TestWorkspaceConfig_ResetClaudeSessionFieldPresent is a compile-time check
// that the ResetClaudeSession knob exists on WorkspaceConfig so handlers can
// plumb ?reset=true through to the provisioner without a struct tag dance.
func TestWorkspaceConfig_ResetClaudeSessionFieldPresent(t *testing.T) {
cfg := WorkspaceConfig{WorkspaceID: "x", Runtime: "claude-code", ResetClaudeSession: true}
if !cfg.ResetClaudeSession {
t.Fatal("ResetClaudeSession should round-trip through struct literal")
}
}
// ---------- buildContainerEnv — #67 MOLECULE_URL injection ----------
func TestBuildContainerEnv_InjectsBothPlatformURLAndMoleculeAIURL(t *testing.T) {