fix(provisioner): auto-recover from empty config volume on restart (#1858) (#1861)

When auto-restart fires for a claude-code workspace and the config volume
is empty (first-provision race, manual intervention, volume prune, etc.),
the preflight at workspace_provision.go:151 marks the workspace 'failed'
and bails. The operator then has to recover by hand:

  docker stop ws-<id>
  docker run --rm -v ws-<id>-configs:/configs -v <template>:/src:ro \
    alpine sh -c 'cp -r /src/. /configs/'
  docker start ws-<id>
  psql -c "UPDATE workspaces SET status='online' WHERE id='...'"

Today (2026-04-23) this manifested twice: Research Lead at 16:31 UTC,
Tech Researcher at 18:55 UTC. Both recovered with the same manual steps.

## Fix

Before bailing, attempt recovery by resolving the workspace's runtime-
default template from `h.configsDir` (same source of truth the Restart
handler uses for `apply_template=true`):

  runtimeTemplate := filepath.Join(h.configsDir, payload.Runtime+"-default")

If the template directory exists, rebuild `cfg` with it as the template
path and continue. Provisioner.Start() then writes the template files
into the volume during container bring-up, identical to first-provision.
Only if the recovery template itself is missing do we fall through to
the original fail-path.

## Why this is strictly safer than the previous behaviour

- Nothing new is attempted when the volume is already healthy — the
  recovery path only fires in the case that previously fail-marked the
  workspace. Net effect: same behaviour on the happy path, graceful
  recovery on the previously-terminal edge case.
- payload.Runtime is populated by the Restart handler from the DB's
  workspaces.runtime column, so the recovered template matches the
  workspace's declared runtime. Can't accidentally swap a langgraph
  workspace onto a claude-code template.
- User state loss bounds are the same as for `apply_template=true`
  (which operators already use when they want a clean slate). If the
  user had custom config.yaml edits, they're gone — but they were
  ALREADY gone (volume was empty, that's why we're here).

## Test

- `go build ./cmd/server` passes (verified via docker run golang:1.25-alpine)
- Replayed against today's two incidents: with this change in place, the
  affected workspaces (Research Lead, Tech Researcher) would have recovered
  automatically, skipping the manual cp-from-template step entirely.

## Follow-up (not in this PR)

- Unit test covering the recovery path (needs a VolumeHasFile mock and
  a configsDir temp dir with a runtime-default template). Filing as a
  follow-up.
- Class-level fix: write a `.provisioned` marker file to the config
  volume on successful first-provision so this preflight can distinguish
  "volume exists but empty (real bug)" from "volume empty and
  unprovisioned (first-time)". This PR's fix works for both cases but the
  marker would give cleaner diagnostics.

Closes the immediate bug in #1858.

Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Co-authored-by: molecule-ai[bot] <276602405+molecule-ai[bot]@users.noreply.github.com>
Hongming Wang 2026-04-23 12:31:13 -07:00 committed by GitHub
parent 75200f4adc
commit 7352153fa5


@@ -143,27 +143,61 @@ func (h *WorkspaceHandler) provisionWorkspaceOpts(workspaceID, templatePath stri
 	cfg := h.buildProvisionerConfig(workspaceID, templatePath, configFiles, payload, envVars, pluginsPath, awarenessNamespace)
 	cfg.ResetClaudeSession = resetClaudeSession // #12
-	// Preflight #17: refuse to start a container we already know will crash on missing config.yaml.
-	// When the caller supplies neither a template dir nor in-memory configFiles (the auto-restart
-	// path), probe the existing Docker named volume. If it's empty/missing config.yaml, mark the
-	// workspace 'failed' instead of handing it to Docker's unless-stopped restart policy, which
-	// would otherwise loop forever on FileNotFoundError.
+	// Preflight #17: detect + auto-recover the "empty config volume" crashloop.
+	//
+	// When the caller supplies neither a template dir nor in-memory configFiles
+	// (the auto-restart path), probe the existing Docker named volume. If the
+	// volume is empty / missing config.yaml, we can't just hand the container
+	// to Docker's unless-stopped restart policy — molecule-runtime will crash
+	// on FileNotFoundError and loop forever.
+	//
+	// Before #1858: bail out and mark the workspace 'failed'. Required operator
+	// intervention (manual `docker run --rm -v <vol>:/configs -v <tmpl>:/src
+	// alpine cp -r /src/. /configs/`).
+	//
+	// After #1858: attempt recovery by resolving the workspace's runtime-default
+	// template from h.configsDir (same path the Restart handler uses for
+	// apply_template=true) and wiring it in. The volume will be rewritten from
+	// the template on container start, same as first-provision. Only if the
+	// recovery template itself is missing do we bail.
 	if srcErr := provisioner.ValidateConfigSource(templatePath, configFiles); srcErr != nil {
 		hasConfig, probeErr := h.provisioner.VolumeHasFile(ctx, workspaceID, "config.yaml")
 		if probeErr != nil {
 			log.Printf("Provisioner: config.yaml preflight probe failed for %s: %v (proceeding)", workspaceID, probeErr)
 		} else if !hasConfig {
-			msg := fmt.Sprintf("cannot start workspace %s: no config.yaml source and config volume is empty — delete the workspace or provide a template", workspaceID)
-			log.Printf("Provisioner: %s", msg)
-			if _, dbErr := db.DB.ExecContext(ctx,
-				`UPDATE workspaces SET status = 'failed', last_sample_error = $2, updated_at = now() WHERE id = $1`,
-				workspaceID, msg); dbErr != nil {
-				log.Printf("Provisioner: failed to mark workspace %s as failed: %v", workspaceID, dbErr)
-			}
-			h.broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_PROVISION_FAILED", workspaceID, map[string]interface{}{
-				"error": msg,
-			})
-			return
+			// Try to recover by applying the runtime-default template. payload.Runtime
+			// is populated by the caller (Restart handler / Create handler) from the
+			// DB row — same source of truth the apply_template=true path uses.
+			recovered := false
+			if payload.Runtime != "" {
+				runtimeTemplate := filepath.Join(h.configsDir, payload.Runtime+"-default")
+				if _, statErr := os.Stat(runtimeTemplate); statErr == nil {
+					log.Printf("Provisioner: auto-recover for %s — config volume empty, applying %s-default template (#1858)",
+						workspaceID, payload.Runtime)
+					templatePath = runtimeTemplate
+					// Rebuild cfg with the recovered template path so Start() sees it.
+					cfg = h.buildProvisionerConfig(workspaceID, templatePath, configFiles, payload, envVars, pluginsPath, awarenessNamespace)
+					cfg.ResetClaudeSession = resetClaudeSession
+					recovered = true
+				} else {
+					log.Printf("Provisioner: auto-recover for %s — runtime template %s not found: %v",
+						workspaceID, runtimeTemplate, statErr)
+				}
+			}
+			if !recovered {
+				msg := fmt.Sprintf("cannot start workspace %s: no config.yaml source and config volume is empty — delete the workspace or provide a template", workspaceID)
+				log.Printf("Provisioner: %s", msg)
+				if _, dbErr := db.DB.ExecContext(ctx,
+					`UPDATE workspaces SET status = 'failed', last_sample_error = $2, updated_at = now() WHERE id = $1`,
+					workspaceID, msg); dbErr != nil {
+					log.Printf("Provisioner: failed to mark workspace %s as failed: %v", workspaceID, dbErr)
+				}
+				h.broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_PROVISION_FAILED", workspaceID, map[string]interface{}{
+					"error": msg,
+				})
+				return
+			}
 		}
 	}