From 3f8286ea47ad452b96e5338b37a5d6c2d823109c Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Thu, 30 Apr 2026 02:18:08 -0700 Subject: [PATCH] fix(provision): share Docker+SaaS prepare path so both mint workspace secrets (RFC #2312) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Root cause of 2026-04-30 silent-503 chat-upload bug: provisionWorkspaceCP (SaaS) skipped issueAndInjectInboundSecret while provisionWorkspaceOpts (Docker) called it. Every prod SaaS workspace provisioned with NULL platform_inbound_secret → upload returned 503 with the v2-enrollment message on every attempt. Structural fix: - Extract prepareProvisionContext (secrets load, env mutators, preflight, cfg build), mintWorkspaceSecrets (auth_token + platform_inbound_secret), markProvisionFailed (broadcast + DB update) into workspace_provision_shared.go - Refactor both provision modes to call the shared helpers - Add provisionAbort struct so the missing-env failure class can carry its structured "missing" payload through the shared abort path - Unify last_sample_error: previously the decrypt-fail path skipped it while others set it; users now see every failure class in the UI Drift prevention: - AST gate TestProvisionFunctions_AllCallMintWorkspaceSecrets asserts every function in the provisionFunctions set calls mintWorkspaceSecrets at least once (same shape as the audit-coverage gate from #335). New provision paths must either call mint or be added to provisionExemptFunctions with a one-line justification - Behavioral test TestMintWorkspaceSecrets_PersistsInboundSecretInSaaSMode pins the contract: SaaS mode MUST persist platform_inbound_secret to the DB column even though it skips file injection Existing-workspace recovery (chat_files.go lazy-heal): - Upload + Download handlers detect NULL platform_inbound_secret and call IssuePlatformInboundSecret inline, returning 503 with retry_after_seconds=30 - Self-heals workspaces that were provisioned before this fix without requiring destructive reprovision Tests: full handlers + workspace-server module green; AST gate verified to fire red on deliberate violation (commented-out mint call surfaces the exact function name + actionable remediation message). Co-Authored-By: Claude Opus 4.7 (1M context) --- .../internal/handlers/chat_files.go | 55 +++- .../internal/handlers/workspace_provision.go | 254 ++++-------------- .../handlers/workspace_provision_shared.go | 181 +++++++++++++ .../workspace_provision_shared_test.go | 187 +++++++++++++ .../handlers/workspace_provision_test.go | 10 +- 5 files changed, 478 insertions(+), 209 deletions(-) create mode 100644 workspace-server/internal/handlers/workspace_provision_shared.go create mode 100644 workspace-server/internal/handlers/workspace_provision_shared_test.go diff --git a/workspace-server/internal/handlers/chat_files.go b/workspace-server/internal/handlers/chat_files.go index bed787de..b2851ca5 100644 --- a/workspace-server/internal/handlers/chat_files.go +++ b/workspace-server/internal/handlers/chat_files.go @@ -192,13 +192,35 @@ func (h *ChatFilesHandler) Upload(c *gin.Context) { secret, err := wsauth.ReadPlatformInboundSecret(ctx, db.DB, workspaceID) if err != nil { if errors.Is(err, wsauth.ErrNoInboundSecret) { - // Workspace predates migration 044 OR the provisioner's - // secret-mint hop failed. Both are operational issues, - // not user errors. Log loudly so ops can backfill. - log.Printf("chat_files Upload: no platform_inbound_secret for %s — workspace needs reprovision (#2312)", workspaceID) + // Lazy-heal: mint the secret now so future requests + // succeed. Pre-2026-04-30 the SaaS provision path didn't + // mint — every prod workspace started life with a NULL + // inbound secret. The shared-mint refactor closes the + // new-workspace gap; this lazy-heal closes the existing- + // workspace gap without requiring a destructive + // reprovision. + // + // Why the request still 503s: the workspace's local + // /configs/.platform_inbound_secret is also empty until + // the next /registry/register response (registry.go:344- + // 362) propagates the freshly-minted secret. The user + // needs to retry once the workspace's heartbeat picks up + // the new secret (typically <30s). + minted, mintErr := wsauth.IssuePlatformInboundSecret(ctx, db.DB, workspaceID) + if mintErr != nil { + log.Printf("chat_files Upload: lazy-heal mint failed for %s: %v", workspaceID, mintErr) + c.JSON(http.StatusServiceUnavailable, gin.H{ + "error": "workspace not yet enrolled in v2 upload (RFC #2312)", + "detail": "Failed to mint inbound secret. Reprovision the workspace if this persists.", + }) + return + } + _ = minted // platform now has it; workspace picks up next heartbeat + log.Printf("chat_files Upload: lazy-healed platform_inbound_secret for %s — retry once workspace re-registers (#2312 backfill)", workspaceID) c.JSON(http.StatusServiceUnavailable, gin.H{ - "error": "workspace not yet enrolled in v2 upload (RFC #2312)", - "detail": "Reprovisioning the workspace will mint the platform_inbound_secret it's missing.", + "error": "workspace re-registering — please retry in 30 seconds", + "detail": "Inbound secret was just minted. Workspace will pick it up on its next heartbeat.", + "retry_after_seconds": 30, }) return } @@ -324,10 +346,25 @@ func (h *ChatFilesHandler) Download(c *gin.Context) { secret, err := wsauth.ReadPlatformInboundSecret(ctx, db.DB, workspaceID) if err != nil { if errors.Is(err, wsauth.ErrNoInboundSecret) { - log.Printf("chat_files Download: no platform_inbound_secret for %s — workspace needs reprovision (#2312)", workspaceID) + // Lazy-heal — same shape as the upload handler. Mint the + // secret now; workspace picks it up on next heartbeat. + // User retries in ~30s. See chat_files.go Upload handler + // for full rationale. + minted, mintErr := wsauth.IssuePlatformInboundSecret(ctx, db.DB, workspaceID) + if mintErr != nil { + log.Printf("chat_files Download: lazy-heal mint failed for %s: %v", workspaceID, mintErr) + c.JSON(http.StatusServiceUnavailable, gin.H{ + "error": "workspace not yet enrolled in v2 download (RFC #2312)", + "detail": "Failed to mint inbound secret. Reprovision the workspace if this persists.", + }) + return + } + _ = minted + log.Printf("chat_files Download: lazy-healed platform_inbound_secret for %s — retry once workspace re-registers (#2312 backfill)", workspaceID) c.JSON(http.StatusServiceUnavailable, gin.H{ - "error": "workspace not yet enrolled in v2 download (RFC #2312)", - "detail": "Reprovisioning the workspace will mint the platform_inbound_secret it's missing.", + "error": "workspace re-registering — please retry in 30 seconds", + "detail": "Inbound secret was just minted. Workspace will pick it up on its next heartbeat.", + "retry_after_seconds": 30, }) return } diff --git a/workspace-server/internal/handlers/workspace_provision.go b/workspace-server/internal/handlers/workspace_provision.go index 53e7d31d..cdf60d90 100644 --- a/workspace-server/internal/handlers/workspace_provision.go +++ b/workspace-server/internal/handlers/workspace_provision.go @@ -28,130 +28,19 @@ func (h *WorkspaceHandler) provisionWorkspaceOpts(workspaceID, templatePath stri ctx, cancel := context.WithTimeout(context.Background(), provisioner.ProvisionTimeout) defer cancel() - // Load global secrets first, then workspace-specific secrets (which override globals). - envVars := map[string]string{} - - // 1. Global secrets (platform-wide defaults). Uses DecryptVersioned - // so plaintext rows written before encryption was enabled (#85) - // keep working. A decrypt failure aborts provisioning — silent skip - // used to manifest as opaque "missing OAuth token" preflight crashes. - globalRows, globalErr := db.DB.QueryContext(ctx, - `SELECT key, encrypted_value, encryption_version FROM global_secrets`) - if globalErr == nil { - defer globalRows.Close() - for globalRows.Next() { - var k string - var v []byte - var ver int - if globalRows.Scan(&k, &v, &ver) == nil { - decrypted, decErr := crypto.DecryptVersioned(v, ver) - if decErr != nil { - log.Printf("Provisioner: FATAL — failed to decrypt global secret %s (version=%d): %v — aborting provision of workspace %s", k, ver, decErr, workspaceID) - h.broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_PROVISION_FAILED", workspaceID, map[string]interface{}{ - "error": "failed to decrypt global secret", - }) - db.DB.ExecContext(ctx, `UPDATE workspaces SET status = 'failed', updated_at = now() WHERE id = $1`, workspaceID) - return - } - envVars[k] = string(decrypted) - } - } - } - - // 2. Workspace-specific secrets (override globals with same key) - rows, err := db.DB.QueryContext(ctx, - `SELECT key, encrypted_value, encryption_version FROM workspace_secrets WHERE workspace_id = $1`, workspaceID) - if err == nil { - defer rows.Close() - for rows.Next() { - var k string - var v []byte - var ver int - if rows.Scan(&k, &v, &ver) == nil { - decrypted, decErr := crypto.DecryptVersioned(v, ver) - if decErr != nil { - log.Printf("Provisioner: FATAL — failed to decrypt workspace secret %s (version=%d) for %s: %v — aborting provision", k, ver, workspaceID, decErr) - h.broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_PROVISION_FAILED", workspaceID, map[string]interface{}{ - "error": "failed to decrypt workspace secret", - }) - db.DB.ExecContext(ctx, `UPDATE workspaces SET status = 'failed', updated_at = now() WHERE id = $1`, workspaceID) - return - } - envVars[k] = string(decrypted) - } - } - } - - pluginsPath, _ := filepath.Abs(filepath.Join(h.configsDir, "..", "plugins")) - awarenessNamespace := h.loadAwarenessNamespace(ctx, workspaceID) - - // Per-agent git identity (Option 3 of agent-separation rollout). - // Sets GIT_AUTHOR_* / GIT_COMMITTER_* so commits from each workspace - // carry a distinct author in `git log` / `git blame` — instead of - // every agent appearing as whoever the shared PAT belongs to. PR + - // issue authorship is still tied to GITHUB_TOKEN (shared PAT); that - // gets solved by the GitHub App migration (Option 1, follow-up PR). - // Runs after secret loads so an operator can still override via a - // workspace_secret named GIT_AUTHOR_NAME if they want custom identity. - applyAgentGitIdentity(envVars, payload.Name) - applyRuntimeModelEnv(envVars, payload.Runtime, payload.Model) - - // Propagate the workspace's role into env so role-aware plugins - // (gh-identity — molecule-core#1957) can read it without the - // plugin interface having to carry the full payload. Role is - // cosmetic metadata — no auth weight on it — safe to surface as env. - if payload.Role != "" { - envVars["MOLECULE_AGENT_ROLE"] = payload.Role - } - - // Plugin extension point: run any registered EnvMutators (e.g. - // github-app-auth, vault-secrets) AFTER built-in identity injection so - // plugins can override or augment GIT_AUTHOR_*, GITHUB_TOKEN, etc. - // A failure here aborts provisioning — a missing GitHub App token - // would manifest later as opaque "git push 401" loops, and the agent - // never recovers. Failing fast here surfaces the cause to the operator. - if err := h.envMutators.Run(ctx, workspaceID, envVars); err != nil { - log.Printf("Provisioner: env mutator chain failed for %s: %v", workspaceID, err) - // F1086 / #1206: broadcast and db last_sample_error use generic messages — - // env mutator errors (missing tokens, vault paths, etc.) can include - // internal credential URIs and file paths that must not reach the caller. - h.broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_PROVISION_FAILED", workspaceID, map[string]interface{}{ - "error": "plugin env mutator chain failed", - }) - if _, dbErr := db.DB.ExecContext(ctx, - `UPDATE workspaces SET status = 'failed', last_sample_error = $2, updated_at = now() WHERE id = $1`, - workspaceID, "plugin env mutator chain failed"); dbErr != nil { - - log.Printf("Provisioner: failed to mark workspace %s as failed after mutator error: %v", workspaceID, dbErr) - } + prepared, abort := h.prepareProvisionContext(ctx, workspaceID, templatePath, configFiles, payload, resetClaudeSession) + if prepared == nil { + log.Printf("Provisioner: prepare failed for %s: %s", workspaceID, abort.Msg) + h.markProvisionFailed(ctx, workspaceID, abort.Msg, abort.Extra) return } - - // Preflight: refuse to launch when config.yaml declares required env vars - // that are not set. Without this, a missing CLAUDE_CODE_OAUTH_TOKEN (or - // similar) crashes the in-container preflight, the container never calls - // /registry/register, and the workspace sits in `provisioning` until a - // sweeper flips it or the user retries. Failing fast here gives the user - // an immediate, actionable error in the Events tab. - if missing := missingRequiredEnv(configFiles, envVars); len(missing) > 0 { - msg := formatMissingEnvError(missing) - log.Printf("Provisioner: %s (workspace=%s)", msg, workspaceID) - if _, dbErr := db.DB.ExecContext(ctx, - `UPDATE workspaces SET status = 'failed', last_sample_error = $2, updated_at = now() WHERE id = $1`, - workspaceID, msg); dbErr != nil { - log.Printf("Provisioner: failed to mark workspace %s as failed: %v", workspaceID, dbErr) - } - h.broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_PROVISION_FAILED", workspaceID, map[string]interface{}{ - "error": msg, - "missing": missing, - }) - return - } - - cfg := h.buildProvisionerConfig(workspaceID, templatePath, configFiles, payload, envVars, pluginsPath, awarenessNamespace) - cfg.ResetClaudeSession = resetClaudeSession // #12 + cfg := prepared.Config // Preflight #17: detect + auto-recover the "empty config volume" crashloop. + // Docker-specific — SaaS mode delegates volume management to the + // control-plane EC2 launcher and never has a local Docker volume to + // probe. Runs AFTER prepareProvisionContext because the recovered + // template still needs the same env-and-cfg surface the prepare built. // // When the caller supplies neither a template dir nor in-memory configFiles // (the auto-restart path), probe the existing Docker named volume. If the @@ -194,7 +83,7 @@ func (h *WorkspaceHandler) provisionWorkspaceOpts(workspaceID, templatePath stri workspaceID, filepath.Base(runtimeTemplate)) templatePath = runtimeTemplate // Rebuild cfg with the recovered template path so Start() sees it. - cfg = h.buildProvisionerConfig(workspaceID, templatePath, configFiles, payload, envVars, pluginsPath, awarenessNamespace) + cfg = h.buildProvisionerConfig(workspaceID, templatePath, configFiles, payload, prepared.EnvVars, prepared.PluginsPath, prepared.AwarenessNamespace) cfg.ResetClaudeSession = resetClaudeSession recovered = true break @@ -209,46 +98,27 @@ func (h *WorkspaceHandler) provisionWorkspaceOpts(workspaceID, templatePath stri if !recovered { msg := fmt.Sprintf("cannot start workspace %s: no config.yaml source and config volume is empty — delete the workspace or provide a template", workspaceID) log.Printf("Provisioner: %s", msg) - if _, dbErr := db.DB.ExecContext(ctx, - `UPDATE workspaces SET status = 'failed', last_sample_error = $2, updated_at = now() WHERE id = $1`, - workspaceID, msg); dbErr != nil { - log.Printf("Provisioner: failed to mark workspace %s as failed: %v", workspaceID, dbErr) - } - h.broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_PROVISION_FAILED", workspaceID, map[string]interface{}{ - "error": msg, - }) + h.markProvisionFailed(ctx, workspaceID, msg, nil) return } } } - // Issue/rotate the workspace auth token and inject the plaintext into the - // config volume so the workspace always has a valid bearer credential on - // disk, even after a container rebuild wiped the volume (issue #418). - // - // We must rotate (revoke-then-issue) rather than reuse because the DB only - // stores sha256(plaintext) — we cannot reconstruct the original token to - // write it back. The new plaintext is written into /configs/.auth_token via - // WriteFilesToContainer, which runs immediately after ContainerStart and - // wins the race against the Python adapter's startup time (~1-2 s). - h.issueAndInjectToken(ctx, workspaceID, &cfg) - h.issueAndInjectInboundSecret(ctx, workspaceID, &cfg) + // Issue/rotate the workspace auth token + platform→workspace inbound secret + // and inject both into the config volume. See mintWorkspaceSecrets doc for + // the shared invariant — every provision path MUST mint here. Plaintext + // is written into /configs/.auth_token + /configs/.platform_inbound_secret + // via WriteFilesToContainer, which runs immediately after ContainerStart + // and wins the race against the Python adapter's startup time (~1-2 s). + h.mintWorkspaceSecrets(ctx, workspaceID, &cfg) url, err := h.provisioner.Start(ctx, cfg) if err != nil { // F1086 / #1206: persist a generic message so the canvas and // GET /workspaces/:id expose something actionable without leaking // docker/error internals (image pull messages, volume paths, etc.). - errMsg := "workspace start failed" - log.Printf("Provisioner: %s for %s: %v", errMsg, workspaceID, err) - if _, dbErr := db.DB.ExecContext(ctx, - `UPDATE workspaces SET status = 'failed', last_sample_error = $2, updated_at = now() WHERE id = $1`, - workspaceID, "workspace start failed"); dbErr != nil { - log.Printf("Provisioner: failed to mark workspace %s as failed: %v", workspaceID, dbErr) - } - h.broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_PROVISION_FAILED", workspaceID, map[string]interface{}{ - "error": "workspace start failed", - }) + log.Printf("Provisioner: workspace start failed for %s: %v", workspaceID, err) + h.markProvisionFailed(ctx, workspaceID, "workspace start failed", nil) } else if url != "" { // Pre-store the host-accessible URL (http://127.0.0.1:) so the A2A proxy can reach the container. // The registry's ON CONFLICT preserves URLs starting with http://127.0.0.1 when the agent self-registers. @@ -706,6 +576,13 @@ func applyRuntimeModelEnv(envVars map[string]string, runtime, model string) { // loadWorkspaceSecrets loads global + workspace-specific secrets into a map. // Returns nil map + error string on decrypt failure. Shared by both Docker // and control plane provisioning paths to avoid duplication. +// +// F1086 / #1206: the returned error string is the SAFE-CANNED message that +// gets persisted to workspaces.last_sample_error AND broadcast as the +// WORKSPACE_PROVISION_FAILED payload. Internal detail (the secret key name, +// the encryption version, the decrypt-error text) is logged here, never +// returned to the caller, so it can't leak via the canvas event stream +// (cf. TestProvisionWorkspace_NoInternalErrorsInBroadcast). func loadWorkspaceSecrets(ctx context.Context, workspaceID string) (map[string]string, string) { envVars := map[string]string{} globalRows, globalErr := db.DB.QueryContext(ctx, @@ -719,7 +596,8 @@ func loadWorkspaceSecrets(ctx context.Context, workspaceID string) (map[string]s if globalRows.Scan(&k, &v, &ver) == nil { decrypted, decErr := crypto.DecryptVersioned(v, ver) if decErr != nil { - return nil, fmt.Sprintf("cannot decrypt global secret %s: %v", k, decErr) + log.Printf("Provisioner: FATAL — failed to decrypt global secret %s (version=%d): %v — aborting provision of workspace %s", k, ver, decErr, workspaceID) + return nil, "failed to decrypt global secret" } envVars[k] = string(decrypted) } @@ -736,7 +614,8 @@ func loadWorkspaceSecrets(ctx context.Context, workspaceID string) (map[string]s if wsRows.Scan(&k, &v, &ver) == nil { decrypted, decErr := crypto.DecryptVersioned(v, ver) if decErr != nil { - return nil, fmt.Sprintf("cannot decrypt workspace secret %s: %v", k, decErr) + log.Printf("Provisioner: FATAL — failed to decrypt workspace secret %s (version=%d) for %s: %v — aborting provision", k, ver, workspaceID, decErr) + return nil, "failed to decrypt workspace secret" } envVars[k] = string(decrypted) } @@ -746,53 +625,43 @@ func loadWorkspaceSecrets(ctx context.Context, workspaceID string) (map[string]s } // provisionWorkspaceCP provisions a workspace via the control plane API. +// +// Mode-specific work this function owns: cpProv.Start (delegates EC2 +// launch to control plane) + persist instance_id in DB. The shared +// setup (secrets, env mutators, mint of auth_token + inbound_secret) +// lives in prepareProvisionContext + mintWorkspaceSecrets and is +// called by both this function and the Docker-mode counterpart. +// +// Pre-#2026-04-30: this function did NOT call mintWorkspaceSecrets. +// That left every prod workspace with a NULL platform_inbound_secret +// column → 503 on every chat upload (RFC #2312). The bug shipped +// because the Docker and SaaS provision paths had drifted: Docker +// got the mint when #2312 landed; SaaS was missed. Refactored to +// share so the next mint added can't be silently forgotten on one +// side. func (h *WorkspaceHandler) provisionWorkspaceCP(workspaceID, templatePath string, configFiles map[string][]byte, payload models.CreateWorkspacePayload) { ctx, cancel := context.WithTimeout(context.Background(), provisioner.ProvisionTimeout) defer cancel() - envVars, decryptErr := loadWorkspaceSecrets(ctx, workspaceID) - if decryptErr != "" { - log.Printf("CPProvisioner: %s for %s", decryptErr, workspaceID) - db.DB.ExecContext(ctx, `UPDATE workspaces SET status = 'failed', last_sample_error = $2, updated_at = now() WHERE id = $1`, - workspaceID, decryptErr) + prepared, abort := h.prepareProvisionContext(ctx, workspaceID, templatePath, configFiles, payload, false) + if prepared == nil { + log.Printf("CPProvisioner: prepare failed for %s: %s", workspaceID, abort.Msg) + h.markProvisionFailed(ctx, workspaceID, abort.Msg, abort.Extra) return } - applyAgentGitIdentity(envVars, payload.Name) - applyRuntimeModelEnv(envVars, payload.Runtime, payload.Model) - // Propagate role for role-aware plugins (#1957). See provisionWorkspace - // above for rationale. - if payload.Role != "" { - envVars["MOLECULE_AGENT_ROLE"] = payload.Role - } - if err := h.envMutators.Run(ctx, workspaceID, envVars); err != nil { - log.Printf("CPProvisioner: env mutator failed for %s: %v", workspaceID, err) - // F1086 / #1206: env mutator errors (missing tokens, vault paths) must not - // leak into last_sample_error — use generic message. - db.DB.ExecContext(ctx, `UPDATE workspaces SET status = 'failed', last_sample_error = $2, updated_at = now() WHERE id = $1`, - workspaceID, "plugin env mutator chain failed") - return - } + // Mint the workspace's auth_token + platform_inbound_secret now, + // before cpProv.Start. Both modes write to the DB column; the + // workspace receives the plaintext via /registry/register response + // (registry.go:344-362) on its first heartbeat after boot. + h.mintWorkspaceSecrets(ctx, workspaceID, &prepared.Config) - cfg := provisioner.WorkspaceConfig{ - WorkspaceID: workspaceID, - Tier: payload.Tier, - Runtime: payload.Runtime, - EnvVars: envVars, - PlatformURL: h.platformURL, - } - - machineID, err := h.cpProv.Start(ctx, cfg) + machineID, err := h.cpProv.Start(ctx, prepared.Config) if err != nil { // F1086 / #1206: CP errors can include machine type, AMI IDs, VPC // paths — use generic message for broadcast and last_sample_error. - errMsg := "workspace start failed" - log.Printf("CPProvisioner: %s for %s: %v", errMsg, workspaceID, err) - h.broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_PROVISION_FAILED", workspaceID, map[string]interface{}{ - "error": "provisioning failed", - }) - db.DB.ExecContext(ctx, `UPDATE workspaces SET status = 'failed', last_sample_error = $2, updated_at = now() WHERE id = $1`, - workspaceID, "provisioning failed") + log.Printf("CPProvisioner: workspace start failed for %s: %v", workspaceID, err) + h.markProvisionFailed(ctx, workspaceID, "provisioning failed", nil) return } @@ -809,13 +678,4 @@ func (h *WorkspaceHandler) provisionWorkspaceCP(workspaceID, templatePath string } log.Printf("CPProvisioner: workspace %s started as machine %s via control plane", workspaceID, machineID) - // Token issuance is deliberately deferred to the workspace's first - // /registry/register call. Minting here without also delivering the - // plaintext to the workspace (via user-data or a follow-up callback) - // would leave a live token in DB that the workspace has no copy of — - // RegistryHandler.requireWorkspaceToken would then 401 every - // /registry/register attempt because the workspace is no longer in the - // "no live tokens → bootstrap-allowed" state. The register handler - // already mints a token on first successful register and returns it in - // the response body for the workspace to persist. } diff --git a/workspace-server/internal/handlers/workspace_provision_shared.go b/workspace-server/internal/handlers/workspace_provision_shared.go new file mode 100644 index 00000000..c6c467c8 --- /dev/null +++ b/workspace-server/internal/handlers/workspace_provision_shared.go @@ -0,0 +1,181 @@ +package handlers + +// workspace_provision_shared.go — mode-agnostic provision-time +// orchestration that BOTH provisionWorkspaceOpts (Docker) and +// provisionWorkspaceCP (SaaS) call. Extracted because the original +// per-mode functions had drifted: the SaaS path forgot to call +// issueAndInjectInboundSecret, which produced silent-503 chat upload +// errors for every prod workspace (RFC #2312 — discovered 2026-04-30). +// +// The drift class this module closes: when a new provision-time setup +// step is added, it goes in ONE place and both modes pick it up. New +// steps that legitimately differ per mode stay in the per-mode +// functions and are explicitly documented there. +// +// What's shared (this file): +// - Loading secrets (global + workspace) from Postgres +// - Applying git identity, runtime model env, role propagation +// - Running env mutators +// - Preflight: missing required env +// - Building provisioner.WorkspaceConfig +// - Minting auth_token + platform_inbound_secret (#2312) +// +// What's mode-specific (kept in the per-mode functions): +// - Docker: empty-config-volume preflight + auto-recover via +// runtime template (#1858), then provisioner.Start with local +// Docker daemon, then WriteFilesToContainer for config volume. +// - SaaS: cpProv.Start which delegates EC2 launch to control plane, +// then persist returned instance_id, then defer .auth_token / +// .platform_inbound_secret delivery to the workspace's first +// /registry/register response (registry.go:344-362). +// +// Architectural test (workspace_provision_shared_test.go) asserts +// every code path that takes a workspaceID and starts a provision +// MUST call mintWorkspaceSecrets — same shape as the +// audit-coverage AST gate from #335. + +import ( + "context" + "log" + "path/filepath" + + "github.com/Molecule-AI/molecule-monorepo/platform/internal/db" + "github.com/Molecule-AI/molecule-monorepo/platform/internal/models" + "github.com/Molecule-AI/molecule-monorepo/platform/internal/provisioner" +) + +// preparedProvisionContext carries the computed env + cfg through +// the per-mode provisioner-Start step. Returned from +// prepareProvisionContext when the caller proceeds; nil + non-empty +// abort message when the caller must mark the workspace failed. +type preparedProvisionContext struct { + EnvVars map[string]string + PluginsPath string + AwarenessNamespace string + Config provisioner.WorkspaceConfig +} + +// provisionAbort describes why prepareProvisionContext refused to +// produce a context. Msg is the caller-safe summary that gets +// persisted to workspaces.last_sample_error AND broadcast. Extra +// is the structured payload to surface alongside (e.g. the missing +// env list for the missing-env failure class). Both fields are +// zero-valued on the success return. +type provisionAbort struct { + Msg string + Extra map[string]interface{} +} + +// prepareProvisionContext does the mode-agnostic setup work that +// both Docker and SaaS provisioners need before their respective +// Start() call. Returns the prepared cfg or an abort message that +// the caller MUST broadcast as WORKSPACE_PROVISION_FAILED and +// persist as workspaces.last_sample_error. +// +// The function does NOT broadcast / DB-write the failure itself — +// failure surface is mode-aware (Docker logs differ from SaaS logs +// in observed format, and the failure-class taxonomy is shared +// between modes but the broadcast key may evolve per mode in the +// future). +func (h *WorkspaceHandler) prepareProvisionContext( + ctx context.Context, + workspaceID, templatePath string, + configFiles map[string][]byte, + payload models.CreateWorkspacePayload, + resetClaudeSession bool, +) (*preparedProvisionContext, *provisionAbort) { + envVars, decryptErr := loadWorkspaceSecrets(ctx, workspaceID) + if decryptErr != "" { + return nil, &provisionAbort{Msg: decryptErr} + } + + pluginsPath, _ := filepath.Abs(filepath.Join(h.configsDir, "..", "plugins")) + awarenessNamespace := h.loadAwarenessNamespace(ctx, workspaceID) + + // Per-agent git identity (#1957) — must run after secret loads so + // a workspace_secret named GIT_AUTHOR_NAME can override. + applyAgentGitIdentity(envVars, payload.Name) + applyRuntimeModelEnv(envVars, payload.Runtime, payload.Model) + if payload.Role != "" { + envVars["MOLECULE_AGENT_ROLE"] = payload.Role + } + + // Plugin extension point: env mutators run AFTER built-in identity + // injection so plugins can override or augment identity vars. + if err := h.envMutators.Run(ctx, workspaceID, envVars); err != nil { + return nil, &provisionAbort{Msg: "plugin env mutator chain failed"} + } + + // Preflight #5: refuse to launch when config.yaml declares required + // env vars that are not set. Skipped in SaaS mode when configFiles + // is nil (CP-mode's cfg is built without local config bytes — the + // CP-side launches the workspace with envVars but doesn't inspect + // the config.yaml the same way Docker does). Future: lift the + // preflight to a pure-data check that doesn't need configFiles + // bytes, so SaaS mode can run it too. + if configFiles != nil { + if missing := missingRequiredEnv(configFiles, envVars); len(missing) > 0 { + msg := formatMissingEnvError(missing) + return nil, &provisionAbort{ + Msg: msg, + Extra: map[string]interface{}{"error": msg, "missing": missing}, + } + } + } + + cfg := h.buildProvisionerConfig(workspaceID, templatePath, configFiles, payload, envVars, pluginsPath, awarenessNamespace) + cfg.ResetClaudeSession = resetClaudeSession + + return &preparedProvisionContext{ + EnvVars: envVars, + PluginsPath: pluginsPath, + AwarenessNamespace: awarenessNamespace, + Config: cfg, + }, nil +} + +// mintWorkspaceSecrets issues + persists the workspace auth token +// AND the platform→workspace inbound secret (#2312). Both modes MUST +// call this — Docker mints + writes to local config volume; SaaS +// mints and persists only to the DB column (the workspace fetches +// via /registry/register response). +// +// Pre-2026-04-30: the SaaS path (provisionWorkspaceCP) was missing +// the inbound-secret mint, leaving every prod workspace with a NULL +// platform_inbound_secret column and 503-ing every chat upload. This +// extracted helper is the structural fix — both paths share one +// function so adding a new mint can't be silently forgotten on one +// side. +// +// Failure model: best-effort. Each underlying issue function logs + +// returns silently on its own failure (token mint failure → workspace +// 401s on first /internal call; secret mint failure → first chat +// upload 503s with the same message we used to surface for old- +// workspaces). The shared helper does NOT abort the provision because +// the workspace can still come up and serve non-internal traffic; the +// 401/503 surfaces the missing secret loudly when the user actually +// tries to use the affected feature. +func (h *WorkspaceHandler) mintWorkspaceSecrets(ctx context.Context, workspaceID string, cfg *provisioner.WorkspaceConfig) { + h.issueAndInjectToken(ctx, workspaceID, cfg) + h.issueAndInjectInboundSecret(ctx, workspaceID, cfg) +} + +// markProvisionFailed is the standard "abort with message" path used +// by both provision modes. Wraps the broadcast + DB update in one +// call so the failure shape stays consistent across modes. +func (h *WorkspaceHandler) markProvisionFailed(ctx context.Context, workspaceID, msg string, extra map[string]interface{}) { + if extra == nil { + extra = map[string]interface{}{"error": msg} + } else if _, hasErr := extra["error"]; !hasErr { + extra["error"] = msg + } + h.broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_PROVISION_FAILED", workspaceID, extra) + if _, dbErr := db.DB.ExecContext(ctx, + `UPDATE workspaces SET status = 'failed', last_sample_error = $2, updated_at = now() WHERE id = $1`, + workspaceID, msg); dbErr != nil { + // Non-fatal: the broadcast already fired, the operator sees the + // failure event in the canvas. The DB row stays at whatever + // status it had — provisioning event log is the source of truth. + log.Printf("markProvisionFailed: db update failed for %s: %v", workspaceID, dbErr) + } +} diff --git a/workspace-server/internal/handlers/workspace_provision_shared_test.go b/workspace-server/internal/handlers/workspace_provision_shared_test.go new file mode 100644 index 00000000..5f39a2e2 --- /dev/null +++ b/workspace-server/internal/handlers/workspace_provision_shared_test.go @@ -0,0 +1,187 @@ +package handlers + +// workspace_provision_shared_test.go — architectural test that pins +// the invariant the shared-prepare refactor relies on: every code +// path that provisions a workspace MUST call mintWorkspaceSecrets. +// +// Closes the drift class that produced the 2026-04-30 RFC #2312 +// silent-503 bug. Pre-fix: provisionWorkspaceCP forgot to mint +// platform_inbound_secret because the SaaS path was implemented +// after the Docker path and the original mint call wasn't carried +// forward. Both modes now share mintWorkspaceSecrets via this +// extracted helper; this test ensures it stays that way. +// +// Same shape as the audit-coverage gate from #335 (#2343 PR-5). +// If this test fails: either add mintWorkspaceSecrets to the new +// provision function, OR (if the function legitimately should NOT +// mint) add it to provisionExemptFunctions with a one-line +// justification. + +import ( + "context" + "go/ast" + "go/parser" + "go/token" + "os" + "path/filepath" + "strings" + "testing" + + "github.com/DATA-DOG/go-sqlmock" + "github.com/Molecule-AI/molecule-monorepo/platform/internal/provisioner" +) + +// provisionFunctions are the set of functions that provision a +// workspace and therefore MUST call mintWorkspaceSecrets. Detected +// by name match (handler-method pattern); add a new entry whenever +// a new provision path is introduced. +var provisionFunctions = map[string]bool{ + "provisionWorkspace": true, + "provisionWorkspaceOpts": true, + "provisionWorkspaceCP": true, +} + +// provisionExemptFunctions are functions whose name pattern matches +// the provision-function set but legitimately don't call +// mintWorkspaceSecrets (e.g., the wrapper provisionWorkspace which +// delegates to provisionWorkspaceOpts — the actual mint happens in +// the delegate, not the wrapper). +var provisionExemptFunctions = map[string]string{ + "provisionWorkspace": "thin wrapper that delegates to provisionWorkspaceOpts; the delegate mints", +} + +// TestProvisionFunctions_AllCallMintWorkspaceSecrets asserts every +// non-exempt provision function in this package calls +// mintWorkspaceSecrets at least once in its body. +// +// The check is presence-in-function, not pairing-by-line — same +// rationale as the audit-coverage gate from #335. +func TestProvisionFunctions_AllCallMintWorkspaceSecrets(t *testing.T) { + t.Parallel() + + fset := token.NewFileSet() + entries, err := os.ReadDir(".") + if err != nil { + t.Fatalf("read dir: %v", err) + } + + type missing struct { + file string + line int + fn string + } + var violations []missing + + for _, entry := range entries { + name := entry.Name() + if entry.IsDir() || !strings.HasSuffix(name, ".go") || strings.HasSuffix(name, "_test.go") { + continue + } + f, err := parser.ParseFile(fset, filepath.Join(".", name), nil, 0) + if err != nil { + t.Fatalf("parse %s: %v", name, err) + } + + for _, decl := range f.Decls { + fn, ok := decl.(*ast.FuncDecl) + if !ok || fn.Body == nil { + continue + } + if !provisionFunctions[fn.Name.Name] { + continue + } + if _, exempt := provisionExemptFunctions[fn.Name.Name]; exempt { + continue + } + if !callsMintWorkspaceSecrets(fn.Body) { + violations = append(violations, missing{ + file: name, + line: fset.Position(fn.Pos()).Line, + fn: fn.Name.Name, + }) + } + } + } + + for _, v := range violations { + t.Errorf( + "%s:%d %s does not call mintWorkspaceSecrets — every provision path MUST mint auth_token + platform_inbound_secret. If audit is genuinely impossible here, add %q to provisionExemptFunctions in workspace_provision_shared_test.go with a justification.", + v.file, v.line, v.fn, v.fn, + ) + } +} + +// callsMintWorkspaceSecrets walks the function body and reports +// whether mintWorkspaceSecrets is called anywhere — direct call OR +// via a helper. Recursion to helpers is shallow: we only check +// immediate calls in this function's body. The shared-prepare +// refactor centralizes mint in mintWorkspaceSecrets itself, so a +// direct call at the top-level is the expected pattern. +func callsMintWorkspaceSecrets(body *ast.BlockStmt) bool { + found := false + ast.Inspect(body, func(n ast.Node) bool { + if found { + return false + } + call, ok := n.(*ast.CallExpr) + if !ok { + return true + } + sel, ok := call.Fun.(*ast.SelectorExpr) + if !ok { + return true + } + if sel.Sel.Name == "mintWorkspaceSecrets" { + found = true + return false + } + return true + }) + return found +} + +// TestMintWorkspaceSecrets_PersistsInboundSecretInSaaSMode is the +// behavioral counterpart to the AST gate. It pins the structural +// fix for the 2026-04-30 silent-503 chat upload bug (RFC #2312): +// even in SaaS mode (where Docker file injection is skipped), +// mintWorkspaceSecrets MUST persist platform_inbound_secret to the +// workspaces row so platform-side handlers can read it back. +// +// Pre-fix: provisionWorkspaceCP never called the mint helper, so +// every prod workspace had NULL platform_inbound_secret → +// chat_files Upload returned 503 with "workspace not yet enrolled +// in v2 upload" on every attempt. +func TestMintWorkspaceSecrets_PersistsInboundSecretInSaaSMode(t *testing.T) { + t.Setenv("MOLECULE_DEPLOY_MODE", "saas") + mock := setupTestDB(t) + + // First underlying call: revoke any existing live tokens. SaaS + // mode early-returns from issueAndInjectToken right after this, + // so IssueToken is NOT expected. + mock.ExpectExec(`UPDATE workspace_auth_tokens SET revoked_at = now\(\) WHERE workspace_id = \$1 AND revoked_at IS NULL`). + WithArgs("ws-saas-mint"). + WillReturnResult(sqlmock.NewResult(0, 0)) + + // Second underlying call: persist the platform_inbound_secret. + // The structural fix — without this UPDATE, the bug recurs. + mock.ExpectExec(`UPDATE workspaces SET platform_inbound_secret = \$1 WHERE id = \$2`). + WithArgs(sqlmock.AnyArg(), "ws-saas-mint"). + WillReturnResult(sqlmock.NewResult(0, 1)) + + handler := NewWorkspaceHandler(&captureBroadcaster{}, nil, "http://localhost:8080", t.TempDir()) + cfg := provisioner.WorkspaceConfig{WorkspaceID: "ws-saas-mint"} + handler.mintWorkspaceSecrets(context.Background(), "ws-saas-mint", &cfg) + + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("sqlmock expectations not met — mintWorkspaceSecrets did not persist platform_inbound_secret in SaaS mode (this is the prod bug recurrence): %v", err) + } + + // Sanity: SaaS mode must NOT have written .platform_inbound_secret + // into cfg.ConfigFiles — there's no Docker volume to deliver it. + if _, present := cfg.ConfigFiles[".platform_inbound_secret"]; present { + t.Errorf("SaaS mode should not inject .platform_inbound_secret into cfg.ConfigFiles (no Docker volume) — got entry") + } + if _, present := cfg.ConfigFiles[".auth_token"]; present { + t.Errorf("SaaS mode should not inject .auth_token into cfg.ConfigFiles (no Docker volume) — got entry") + } +} diff --git a/workspace-server/internal/handlers/workspace_provision_test.go b/workspace-server/internal/handlers/workspace_provision_test.go index b5124e37..c7f4e21d 100644 --- a/workspace-server/internal/handlers/workspace_provision_test.go +++ b/workspace-server/internal/handlers/workspace_provision_test.go @@ -1137,10 +1137,14 @@ func TestProvisionWorkspace_NoInternalErrorsInBroadcast(t *testing.T) { WillReturnRows(sqlmock.NewRows([]string{"key", "encrypted_value", "encryption_version"}). AddRow("FAKE_KEY", []byte("any-bytes"), 99)) // On decrypt failure provisionWorkspace also marks the workspace as - // failed via UPDATE workspaces. Match-anything on the args so the - // test isn't coupled to the exact UPDATE column order. + // failed via UPDATE workspaces. Two args: workspace_id + the + // last_sample_error message ("failed to decrypt global secret"). + // Pre-refactor (workspace_provision_shared.go) the decrypt-fail + // path skipped last_sample_error; the shared helper now always + // persists it so users see the failure in the UI without having + // to grep server logs. mock.ExpectExec(`UPDATE workspaces SET status = 'failed'`). - WithArgs(sqlmock.AnyArg()). + WithArgs(sqlmock.AnyArg(), sqlmock.AnyArg()). WillReturnResult(sqlmock.NewResult(0, 1)) cap := &captureBroadcaster{}