fix(provision): SaaS auto-restart re-delivers template config.yaml + prompts #3055

Merged
devops-engineer merged 1 commits from fix/saas-restart-template-redelivery into main 2026-06-19 03:36:04 +00:00
@@ -6,6 +6,7 @@ import (
"io"
"log"
"net/http"
"os"
"runtime/debug"
"strings"
"sync"
@@ -984,15 +985,36 @@ func (h *WorkspaceHandler) runRestartCycle(workspaceID string) {
// Runtime from DB — no more config file parsing
payload := withStoredCompute(ctx, workspaceID, models.CreateWorkspacePayload{Name: wsName, Tier: tier, Runtime: dbRuntime})
// RFC#2843 #33: on SaaS (cpProv), restore the persisted template so the
// re-provision re-delivers config.yaml + prompts — TemplateIdentity is
// derived from payload.Template (workspace_provision.go). Without this the
// SaaS re-provision ran with template="" → 218-byte stub config + dropped
// skills on every restart. Docker keeps its persistent config volume, so it
// retains the "do not re-apply templates" behavior (template left empty).
// RFC#2843 #33 + SaaS restart re-stub fix: on SaaS (cpProv), restore the
// persisted template AND resolve its LOCAL template dir so the re-provision
// re-delivers config.yaml + prompts.
//
// Setting payload.Template alone is NOT enough: it only drives the fetcher's
// TemplateAssets wire field, which the deployed control-plane does NOT consume
// (CP /cp/workspaces/provision reads `config_files` only — see
// controlplane wsProvisionRequest). config.yaml/prompts reach a SaaS box
// EXCLUSIVELY via `config_files`, which collectCPConfigFiles populates by
// walking cfg.TemplatePath. The restart previously passed templatePath="" →
// config routed only into the dropped TemplateAssets → every SaaS restart
// (each a fresh EC2 with empty /configs) landed a 218-byte stub config.yaml.
// That is the template-delivery-e2e intermittent (loses the race vs the
// post-online plugin-reconcile restart) AND a prod identity-loss-on-restart
// bug. Re-applying only touches template-owned, allowlisted files
// (config.yaml, prompts/**); user/agent paths (CLAUDE.md, MEMORY.md,
// .claude/**, /workspace) are excluded by IsCPTemplateAssetPath, so a
// persisted /configs is never clobbered. Docker keeps templatePath="" to
// preserve its persistent config volume.
restartTemplatePath := ""
if h.cpProv != nil {
if storedTmpl := storedWorkspaceTemplate(ctx, workspaceID); storedTmpl != "" {
payload.Template = storedTmpl
if p, resolveErr := resolveWorkspaceTemplatePath(h.configsDir, h.cacheDir, storedTmpl); resolveErr != nil {
log.Printf("Auto-restart: %s template %q path resolution failed: %v — re-provision may land a stub config", workspaceID, storedTmpl, resolveErr)
} else if _, statErr := os.Stat(p); statErr == nil {
restartTemplatePath = p
} else {
log.Printf("Auto-restart: %s template %q dir absent at %q — re-provision may land a stub config", workspaceID, storedTmpl, p)
}
}
}
@@ -1022,7 +1044,7 @@ func (h *WorkspaceHandler) runRestartCycle(workspaceID string) {
// status='provisioning' (the UPDATE above already ran). User-
// observable result on SaaS pre-fix: dead workspace → manual canvas
// restart was the only recovery path.
h.provisionWorkspaceAutoSyncLocked(workspaceID, "", nil, payload)
h.provisionWorkspaceAutoSyncLocked(workspaceID, restartTemplatePath, nil, payload)
// sendRestartContext is a one-way notification to the new container; safe
// to fire async — the next restart cycle won't depend on it completing.
// Tracked via h.goAsync so tests can wait for it via h.asyncWG before