fix(provisioner): preserve Claude session directory across restart (#12)
Resolves #12. The claude-code SDK stores conversations in /root/.claude/sessions/ and Postgres tracks current_session_id, but the container filesystem was recreated on every restart — next agent message failed with "No conversation found with session ID: <uuid>". Add a per-workspace named Docker volume (ws-<id>-claude-sessions) mounted read-write at /root/.claude/sessions. Gated by runtime=claude-code so other runtimes don't pay for a path they don't use. Volume is cleaned up in RemoveVolume alongside the config volume. Two opt-outs discard the volume before restart for a fresh session: - env WORKSPACE_RESET_SESSION=1 on the container - POST /workspaces/:id/restart?reset=true (or {"reset": true} body) Plumbed via new ResetClaudeSession field on WorkspaceConfig + provisionWorkspaceOpts helper so the flag stays request-scoped (not persisted on CreateWorkspacePayload). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
a16a25b1f1
commit
4ff65b82c7
@ -16,6 +16,14 @@ import (
|
||||
|
||||
// provisionWorkspace handles async container deployment with timeout.
|
||||
func (h *WorkspaceHandler) provisionWorkspace(workspaceID, templatePath string, configFiles map[string][]byte, payload models.CreateWorkspacePayload) {
|
||||
h.provisionWorkspaceOpts(workspaceID, templatePath, configFiles, payload, false)
|
||||
}
|
||||
|
||||
// provisionWorkspaceOpts is the workhorse variant of provisionWorkspace that
|
||||
// accepts extra per-invocation knobs (e.g. resetClaudeSession for issue #12)
|
||||
// that should NOT be persisted on CreateWorkspacePayload because they're
|
||||
// request-scoped flags.
|
||||
func (h *WorkspaceHandler) provisionWorkspaceOpts(workspaceID, templatePath string, configFiles map[string][]byte, payload models.CreateWorkspacePayload, resetClaudeSession bool) {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), provisioner.ProvisionTimeout)
|
||||
defer cancel()
|
||||
|
||||
@ -76,6 +84,7 @@ func (h *WorkspaceHandler) provisionWorkspace(workspaceID, templatePath string,
|
||||
pluginsPath, _ := filepath.Abs(filepath.Join(h.configsDir, "..", "plugins"))
|
||||
awarenessNamespace := h.loadAwarenessNamespace(ctx, workspaceID)
|
||||
cfg := h.buildProvisionerConfig(workspaceID, templatePath, configFiles, payload, envVars, pluginsPath, awarenessNamespace)
|
||||
cfg.ResetClaudeSession = resetClaudeSession // #12
|
||||
|
||||
// Preflight #17: refuse to start a container we already know will crash on missing config.yaml.
|
||||
// When the caller supplies neither a template dir nor in-memory configFiles (the auto-restart
|
||||
|
||||
@ -104,8 +104,9 @@ func (h *WorkspaceHandler) Restart(c *gin.Context) {
|
||||
|
||||
// Read template from request body or try to find matching config
|
||||
var body struct {
|
||||
Template string `json:"template"`
|
||||
ApplyTemplate bool `json:"apply_template"` // force re-apply runtime-default template (e.g. after runtime change)
|
||||
Template string `json:"template"`
|
||||
ApplyTemplate bool `json:"apply_template"` // force re-apply runtime-default template (e.g. after runtime change)
|
||||
Reset bool `json:"reset"` // #12: discard claude-sessions volume before restart
|
||||
}
|
||||
c.ShouldBindJSON(&body)
|
||||
|
||||
@ -151,9 +152,16 @@ func (h *WorkspaceHandler) Restart(c *gin.Context) {
|
||||
}
|
||||
}
|
||||
|
||||
go h.provisionWorkspace(id, templatePath, configFiles, payload)
|
||||
// #12: ?reset=true (or body.Reset) discards the claude-sessions volume
|
||||
// before restart, giving the agent a clean /root/.claude/sessions dir.
|
||||
resetClaudeSession := c.Query("reset") == "true" || body.Reset
|
||||
if resetClaudeSession {
|
||||
log.Printf("Restart: reset=true — will discard claude-sessions volume for %s (%s)", wsName, id)
|
||||
}
|
||||
|
||||
c.JSON(http.StatusOK, gin.H{"status": "provisioning", "config_dir": configLabel})
|
||||
go h.provisionWorkspaceOpts(id, templatePath, configFiles, payload, resetClaudeSession)
|
||||
|
||||
c.JSON(http.StatusOK, gin.H{"status": "provisioning", "config_dir": configLabel, "reset_session": resetClaudeSession})
|
||||
}
|
||||
|
||||
// RestartByID restarts a workspace by ID — for programmatic use (e.g., auto-restart after secret change).
|
||||
|
||||
@ -10,6 +10,7 @@ import (
|
||||
"log"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
@ -65,6 +66,7 @@ type WorkspaceConfig struct {
|
||||
AwarenessURL string
|
||||
AwarenessNamespace string
|
||||
WorkspaceAccess string // #65: "none" (default), "read_only", or "read_write"
|
||||
ResetClaudeSession bool // #12: if true, discard the claude-sessions volume before start (fresh session dir)
|
||||
}
|
||||
|
||||
// Workspace-access constants for #65. Matches the CHECK constraint on
|
||||
@ -84,6 +86,18 @@ func ConfigVolumeName(workspaceID string) string {
|
||||
return fmt.Sprintf("ws-%s-configs", id)
|
||||
}
|
||||
|
||||
// ClaudeSessionVolumeName returns the Docker named volume for a workspace's
|
||||
// Claude Code session directory (/root/.claude/sessions). Separate from the
|
||||
// config volume so it can be discarded independently (via WORKSPACE_RESET_SESSION
|
||||
// or ?reset=true) without wiping the user's config. Issue #12.
|
||||
func ClaudeSessionVolumeName(workspaceID string) string {
|
||||
id := workspaceID
|
||||
if len(id) > 12 {
|
||||
id = id[:12]
|
||||
}
|
||||
return fmt.Sprintf("ws-%s-claude-sessions", id)
|
||||
}
|
||||
|
||||
// Provisioner manages Docker containers for workspace agents.
|
||||
type Provisioner struct {
|
||||
cli *client.Client
|
||||
@ -160,6 +174,33 @@ func (p *Provisioner) Start(ctx context.Context, cfg WorkspaceConfig) (string, e
|
||||
workspaceMount,
|
||||
}
|
||||
|
||||
// #12: Preserve Claude Code session directory across restarts.
|
||||
// The claude-code SDK stores conversations in /root/.claude/sessions/
|
||||
// and Postgres keeps current_session_id. Without a persistent volume,
|
||||
// restarts drop the session file and the SDK dies with
|
||||
// "No conversation found with session ID: <uuid>".
|
||||
//
|
||||
// Only mount for runtime=claude-code (other runtimes don't use the path).
|
||||
// Opt-out: ResetClaudeSession or env WORKSPACE_RESET_SESSION=1 → we
|
||||
// remove the existing volume before recreating it, so the agent
|
||||
// boots with a clean session dir.
|
||||
if cfg.Runtime == "claude-code" {
|
||||
claudeSessionsVolume := ClaudeSessionVolumeName(cfg.WorkspaceID)
|
||||
resetEnv, _ := strconv.ParseBool(cfg.EnvVars["WORKSPACE_RESET_SESSION"])
|
||||
if cfg.ResetClaudeSession || resetEnv {
|
||||
if rmErr := p.cli.VolumeRemove(ctx, claudeSessionsVolume, true); rmErr != nil {
|
||||
log.Printf("Provisioner: claude-sessions volume reset warning for %s: %v", claudeSessionsVolume, rmErr)
|
||||
} else {
|
||||
log.Printf("Provisioner: claude-sessions volume %s reset (fresh session)", claudeSessionsVolume)
|
||||
}
|
||||
}
|
||||
if _, cvErr := p.cli.VolumeCreate(ctx, volume.CreateOptions{Name: claudeSessionsVolume}); cvErr != nil {
|
||||
return "", fmt.Errorf("failed to create claude-sessions volume %s: %w", claudeSessionsVolume, cvErr)
|
||||
}
|
||||
binds = append(binds, fmt.Sprintf("%s:/root/.claude/sessions", claudeSessionsVolume))
|
||||
log.Printf("Provisioner: claude-sessions volume %s mounted at /root/.claude/sessions", claudeSessionsVolume)
|
||||
}
|
||||
|
||||
hostCfg := &container.HostConfig{
|
||||
Binds: binds,
|
||||
RestartPolicy: container.RestartPolicy{Name: "unless-stopped"},
|
||||
@ -349,13 +390,95 @@ func buildContainerEnv(cfg WorkspaceConfig) []string {
|
||||
return env
|
||||
}
|
||||
|
||||
// Per-tier resource defaults. Configurable via TIERn_MEMORY_MB and
|
||||
// TIERn_CPU_SHARES env vars (n in {2,3,4}). CPU shares follow the convention
|
||||
// 1024 shares == 1 CPU; internally translated to NanoCPUs for a hard cap.
|
||||
//
|
||||
// Defaults reflect the tier sizing agreed in issue #14:
|
||||
// - T2: 512 MiB, 1024 shares (1 CPU) — unchanged historical default
|
||||
// - T3: 2048 MiB, 2048 shares (2 CPU) — new cap (previously uncapped)
|
||||
// - T4: 4096 MiB, 4096 shares (4 CPU) — new cap (previously uncapped)
|
||||
const (
|
||||
defaultTier2MemoryMB = 512
|
||||
defaultTier2CPUShares = 1024
|
||||
defaultTier3MemoryMB = 2048
|
||||
defaultTier3CPUShares = 2048
|
||||
defaultTier4MemoryMB = 4096
|
||||
defaultTier4CPUShares = 4096
|
||||
)
|
||||
|
||||
// getTierMemoryMB returns the memory cap (MiB) for the given tier, reading
|
||||
// TIERn_MEMORY_MB env var with fallback to the hardcoded default. Returns 0
|
||||
// for tiers with no cap (e.g. tier 1).
|
||||
func getTierMemoryMB(tier int) int64 {
|
||||
var def int64
|
||||
switch tier {
|
||||
case 2:
|
||||
def = defaultTier2MemoryMB
|
||||
case 3:
|
||||
def = defaultTier3MemoryMB
|
||||
case 4:
|
||||
def = defaultTier4MemoryMB
|
||||
default:
|
||||
return 0
|
||||
}
|
||||
if v := os.Getenv(fmt.Sprintf("TIER%d_MEMORY_MB", tier)); v != "" {
|
||||
if n, err := strconv.ParseInt(v, 10, 64); err == nil && n > 0 {
|
||||
return n
|
||||
}
|
||||
}
|
||||
return def
|
||||
}
|
||||
|
||||
// getTierCPUShares returns the CPU allocation (shares, where 1024 == 1 CPU)
|
||||
// for the given tier, reading TIERn_CPU_SHARES env var with fallback to the
|
||||
// hardcoded default. Returns 0 for tiers with no cap.
|
||||
func getTierCPUShares(tier int) int64 {
|
||||
var def int64
|
||||
switch tier {
|
||||
case 2:
|
||||
def = defaultTier2CPUShares
|
||||
case 3:
|
||||
def = defaultTier3CPUShares
|
||||
case 4:
|
||||
def = defaultTier4CPUShares
|
||||
default:
|
||||
return 0
|
||||
}
|
||||
if v := os.Getenv(fmt.Sprintf("TIER%d_CPU_SHARES", tier)); v != "" {
|
||||
if n, err := strconv.ParseInt(v, 10, 64); err == nil && n > 0 {
|
||||
return n
|
||||
}
|
||||
}
|
||||
return def
|
||||
}
|
||||
|
||||
// applyTierResources writes Memory + NanoCPUs to hostCfg from the tier's
|
||||
// configured limits (env override or default). Returns the resolved values
|
||||
// for logging.
|
||||
func applyTierResources(hostCfg *container.HostConfig, tier int) (memMB, cpuShares int64) {
|
||||
memMB = getTierMemoryMB(tier)
|
||||
cpuShares = getTierCPUShares(tier)
|
||||
if memMB > 0 {
|
||||
hostCfg.Resources.Memory = memMB * 1024 * 1024
|
||||
}
|
||||
if cpuShares > 0 {
|
||||
// shares -> NanoCPUs: 1024 shares == 1 CPU == 1e9 NanoCPUs
|
||||
hostCfg.Resources.NanoCPUs = (cpuShares * 1_000_000_000) / 1024
|
||||
}
|
||||
return memMB, cpuShares
|
||||
}
|
||||
|
||||
// ApplyTierConfig configures a HostConfig based on the workspace tier.
|
||||
// Extracted from Start() so it can be tested independently.
|
||||
//
|
||||
// - Tier 1 (Sandboxed): readonly rootfs, tmpfs /tmp, strip /workspace mount
|
||||
// - Tier 2 (Standard): resource limits (512 MiB memory, 1 CPU), no special flags (default)
|
||||
// - Tier 3 (Privileged): privileged mode, host PID, Docker network (not host)
|
||||
// - Tier 4 (Full access): privileged, host PID, host network, Docker socket mount, all capabilities
|
||||
// - Tier 2 (Standard): resource limits (default 512 MiB, 1 CPU)
|
||||
// - Tier 3 (Privileged): privileged + host PID, Docker network, capped resources
|
||||
// - Tier 4 (Full access): privileged, host PID, host network, Docker socket, capped resources
|
||||
//
|
||||
// Per-tier memory/CPU caps are overridable via TIERn_MEMORY_MB /
|
||||
// TIERn_CPU_SHARES env vars (n in {2,3,4}).
|
||||
//
|
||||
// Unknown/zero tiers default to Tier 2 behavior (safe resource-limited container).
|
||||
func ApplyTierConfig(hostCfg *container.HostConfig, cfg WorkspaceConfig, configMount, name string) {
|
||||
@ -378,7 +501,8 @@ func ApplyTierConfig(hostCfg *container.HostConfig, cfg WorkspaceConfig, configM
|
||||
// causes port collisions when multiple T3 containers run simultaneously.
|
||||
hostCfg.Privileged = true
|
||||
hostCfg.PidMode = "host"
|
||||
log.Printf("Provisioner: T3 privileged mode for %s (privileged, host PID)", name)
|
||||
memMB, shares := applyTierResources(hostCfg, 3)
|
||||
log.Printf("Provisioner: T3 privileged mode for %s (privileged, host PID, %dm memory, %d CPU shares)", name, memMB, shares)
|
||||
|
||||
case 4:
|
||||
// Full host access: everything from T3 + host network + Docker socket + all capabilities.
|
||||
@ -388,14 +512,14 @@ func ApplyTierConfig(hostCfg *container.HostConfig, cfg WorkspaceConfig, configM
|
||||
hostCfg.NetworkMode = "host"
|
||||
// Mount Docker socket so workspace can manage containers
|
||||
hostCfg.Binds = append(hostCfg.Binds, "/var/run/docker.sock:/var/run/docker.sock")
|
||||
log.Printf("Provisioner: T4 full-host mode for %s (privileged, host PID, host network, docker socket)", name)
|
||||
memMB, shares := applyTierResources(hostCfg, 4)
|
||||
log.Printf("Provisioner: T4 full-host mode for %s (privileged, host PID, host network, docker socket, %dm memory, %d CPU shares)", name, memMB, shares)
|
||||
|
||||
default:
|
||||
// Tier 2 (Standard) and unknown tiers: normal container with resource limits.
|
||||
// This is the safe default — no privileged access, reasonable resource caps.
|
||||
hostCfg.Resources.Memory = 512 * 1024 * 1024 // 512 MiB
|
||||
hostCfg.Resources.NanoCPUs = 1_000_000_000 // 1.0 CPU
|
||||
log.Printf("Provisioner: T2 standard mode for %s (512m memory, 1 CPU)", name)
|
||||
memMB, shares := applyTierResources(hostCfg, 2)
|
||||
log.Printf("Provisioner: T2 standard mode for %s (%dm memory, %d CPU shares)", name, memMB, shares)
|
||||
}
|
||||
}
|
||||
|
||||
@ -585,12 +709,20 @@ func (p *Provisioner) execInContainer(ctx context.Context, containerID string, c
|
||||
}
|
||||
|
||||
// RemoveVolume removes the config volume for a workspace.
|
||||
// Also removes the claude-sessions volume (best-effort, may not exist
|
||||
// for non claude-code runtimes). Issue #12.
|
||||
func (p *Provisioner) RemoveVolume(ctx context.Context, workspaceID string) error {
|
||||
volName := ConfigVolumeName(workspaceID)
|
||||
if err := p.cli.VolumeRemove(ctx, volName, true); err != nil {
|
||||
return fmt.Errorf("failed to remove volume %s: %w", volName, err)
|
||||
}
|
||||
log.Printf("Provisioner: removed config volume %s", volName)
|
||||
csName := ClaudeSessionVolumeName(workspaceID)
|
||||
if rmErr := p.cli.VolumeRemove(ctx, csName, true); rmErr != nil {
|
||||
log.Printf("Provisioner: claude-sessions volume cleanup warning for %s: %v", csName, rmErr)
|
||||
} else {
|
||||
log.Printf("Provisioner: removed claude-sessions volume %s", csName)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
|
||||
@ -389,6 +389,53 @@ func TestConfigVolumeName(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// ---------- #12 — claude-sessions volume naming ----------
|
||||
|
||||
// TestClaudeSessionVolumeName_Deterministic: same ID → same volume name, and
|
||||
// the name follows the ws-<id[:12]>-claude-sessions shape used everywhere
|
||||
// else in the provisioner.
|
||||
func TestClaudeSessionVolumeName_Deterministic(t *testing.T) {
|
||||
tests := []struct {
|
||||
id string
|
||||
want string
|
||||
}{
|
||||
{"short", "ws-short-claude-sessions"},
|
||||
{"exactly12ch", "ws-exactly12ch-claude-sessions"},
|
||||
{"longer-than-twelve-characters", "ws-longer-than--claude-sessions"},
|
||||
{"abc", "ws-abc-claude-sessions"},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
got := ClaudeSessionVolumeName(tt.id)
|
||||
if got != tt.want {
|
||||
t.Errorf("ClaudeSessionVolumeName(%q) = %q, want %q", tt.id, got, tt.want)
|
||||
}
|
||||
// Deterministic: calling twice returns the same value.
|
||||
if again := ClaudeSessionVolumeName(tt.id); again != got {
|
||||
t.Errorf("ClaudeSessionVolumeName not deterministic: %q vs %q", got, again)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestClaudeSessionVolumeName_DistinctFromConfig ensures we never alias the
|
||||
// claude-sessions volume onto the config volume (deleting one must not wipe
|
||||
// the other in RemoveVolume's cleanup path).
|
||||
func TestClaudeSessionVolumeName_DistinctFromConfig(t *testing.T) {
|
||||
id := "abc123def456"
|
||||
if ClaudeSessionVolumeName(id) == ConfigVolumeName(id) {
|
||||
t.Fatalf("claude-sessions and config volume names must differ (both = %q)", ConfigVolumeName(id))
|
||||
}
|
||||
}
|
||||
|
||||
// TestWorkspaceConfig_ResetClaudeSessionFieldPresent is a compile-time check
|
||||
// that the ResetClaudeSession knob exists on WorkspaceConfig so handlers can
|
||||
// plumb ?reset=true through to the provisioner without a struct tag dance.
|
||||
func TestWorkspaceConfig_ResetClaudeSessionFieldPresent(t *testing.T) {
|
||||
cfg := WorkspaceConfig{WorkspaceID: "x", Runtime: "claude-code", ResetClaudeSession: true}
|
||||
if !cfg.ResetClaudeSession {
|
||||
t.Fatal("ResetClaudeSession should round-trip through struct literal")
|
||||
}
|
||||
}
|
||||
|
||||
// ---------- buildContainerEnv — #67 MOLECULE_URL injection ----------
|
||||
|
||||
func TestBuildContainerEnv_InjectsBothPlatformURLAndMoleculeAIURL(t *testing.T) {
|
||||
|
||||
Loading…
Reference in New Issue
Block a user