From 2ba0662da678e27ebe4e6736721757bdce6c9910 Mon Sep 17 00:00:00 2001 From: core-devops Date: Thu, 18 Jun 2026 19:52:33 -0700 Subject: [PATCH] fix(provision): SaaS auto-restart re-delivers template config.yaml + prompts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The SaaS auto-restart re-provisions a fresh EC2 (empty /configs), but the restart dispatched provisionWorkspaceAutoSyncLocked(id, "", nil, payload) with templatePath="". config.yaml/prompts reach a SaaS box ONLY via the config_files channel (CP /cp/workspaces/provision consumes `config_files`; the fetcher's TemplateAssets wire field is NOT consumed by the deployed control-plane), and config_files is populated by collectCPConfigFiles walking cfg.TemplatePath. With templatePath="", config routed only into the dropped TemplateAssets → every SaaS restart landed a 218-byte stub config.yaml (lost identity/prompts). The prior RFC#2843 #33 fix set payload.Template on restart, but that only drives the (CP-dropped) TemplateAssets path — it never re-delivered. This resolves the LOCAL template dir the same way first-provision does (resolveWorkspaceTemplatePath) and passes it, so the restart re-provision re-delivers config.yaml + prompts via config_files. Re-applying touches only template-owned, allowlisted files (config.yaml, prompts/**); user/agent paths (CLAUDE.md, MEMORY.md, .claude/**, /workspace) are excluded by IsCPTemplateAssetPath, so a persisted /configs is not clobbered. Docker keeps templatePath="" (preserve its persistent config volume). Root-caused live via SSM on a staging tenant: seo-agent online with real config, then post-online plugin-reconcile triggered an auto-restart → fresh EC2 → config.yaml = 218B stub (workspaces.template was correctly 'seo-agent'). This is the template-delivery-e2e intermittent (races the assertion vs the restart) and a prod identity-loss-on-restart bug. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../internal/handlers/workspace_restart.go | 36 +++++++++++++++---- 1 file changed, 29 insertions(+), 7 deletions(-) diff --git a/workspace-server/internal/handlers/workspace_restart.go b/workspace-server/internal/handlers/workspace_restart.go index 0136f08f..fdb267b1 100644 --- a/workspace-server/internal/handlers/workspace_restart.go +++ b/workspace-server/internal/handlers/workspace_restart.go @@ -6,6 +6,7 @@ import ( "io" "log" "net/http" + "os" "runtime/debug" "strings" "sync" @@ -984,15 +985,36 @@ func (h *WorkspaceHandler) runRestartCycle(workspaceID string) { // Runtime from DB — no more config file parsing payload := withStoredCompute(ctx, workspaceID, models.CreateWorkspacePayload{Name: wsName, Tier: tier, Runtime: dbRuntime}) - // RFC#2843 #33: on SaaS (cpProv), restore the persisted template so the - // re-provision re-delivers config.yaml + prompts — TemplateIdentity is - // derived from payload.Template (workspace_provision.go). Without this the - // SaaS re-provision ran with template="" → 218-byte stub config + dropped - // skills on every restart. Docker keeps its persistent config volume, so it - // retains the "do not re-apply templates" behavior (template left empty). + // RFC#2843 #33 + SaaS restart re-stub fix: on SaaS (cpProv), restore the + // persisted template AND resolve its LOCAL template dir so the re-provision + // re-delivers config.yaml + prompts. + // + // Setting payload.Template alone is NOT enough: it only drives the fetcher's + // TemplateAssets wire field, which the deployed control-plane does NOT consume + // (CP /cp/workspaces/provision reads `config_files` only — see + // controlplane wsProvisionRequest). config.yaml/prompts reach a SaaS box + // EXCLUSIVELY via `config_files`, which collectCPConfigFiles populates by + // walking cfg.TemplatePath. The restart previously passed templatePath="" → + // config routed only into the dropped TemplateAssets → every SaaS restart + // (each a fresh EC2 with empty /configs) landed a 218-byte stub config.yaml. + // That is the template-delivery-e2e intermittent (loses the race vs the + // post-online plugin-reconcile restart) AND a prod identity-loss-on-restart + // bug. Re-applying only touches template-owned, allowlisted files + // (config.yaml, prompts/**); user/agent paths (CLAUDE.md, MEMORY.md, + // .claude/**, /workspace) are excluded by IsCPTemplateAssetPath, so a + // persisted /configs is never clobbered. Docker keeps templatePath="" to + // preserve its persistent config volume. + restartTemplatePath := "" if h.cpProv != nil { if storedTmpl := storedWorkspaceTemplate(ctx, workspaceID); storedTmpl != "" { payload.Template = storedTmpl + if p, resolveErr := resolveWorkspaceTemplatePath(h.configsDir, h.cacheDir, storedTmpl); resolveErr != nil { + log.Printf("Auto-restart: %s template %q path resolution failed: %v — re-provision may land a stub config", workspaceID, storedTmpl, resolveErr) + } else if _, statErr := os.Stat(p); statErr == nil { + restartTemplatePath = p + } else { + log.Printf("Auto-restart: %s template %q dir absent at %q — re-provision may land a stub config", workspaceID, storedTmpl, p) + } } } @@ -1022,7 +1044,7 @@ func (h *WorkspaceHandler) runRestartCycle(workspaceID string) { // status='provisioning' (the UPDATE above already ran). User- // observable result on SaaS pre-fix: dead workspace → manual canvas // restart was the only recovery path. - h.provisionWorkspaceAutoSyncLocked(workspaceID, "", nil, payload) + h.provisionWorkspaceAutoSyncLocked(workspaceID, restartTemplatePath, nil, payload) // sendRestartContext is a one-way notification to the new container; safe // to fire async — the next restart cycle won't depend on it completing. // Tracked via h.goAsync so tests can wait for it via h.asyncWG before -- 2.52.0