Merge pull request #2366 from Molecule-AI/auto/workspace-provision-shared

fix(provision): share Docker+SaaS prepare path so both mint workspace secrets (RFC #2312)
This commit is contained in:
Hongming Wang 2026-04-30 09:21:38 +00:00 committed by GitHub
commit b9b0a46f2e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 478 additions and 209 deletions

View File

@ -192,13 +192,35 @@ func (h *ChatFilesHandler) Upload(c *gin.Context) {
secret, err := wsauth.ReadPlatformInboundSecret(ctx, db.DB, workspaceID)
if err != nil {
if errors.Is(err, wsauth.ErrNoInboundSecret) {
// Workspace predates migration 044 OR the provisioner's
// secret-mint hop failed. Both are operational issues,
// not user errors. Log loudly so ops can backfill.
log.Printf("chat_files Upload: no platform_inbound_secret for %s — workspace needs reprovision (#2312)", workspaceID)
// Lazy-heal: mint the secret now so future requests
// succeed. Pre-2026-04-30 the SaaS provision path didn't
// mint — every prod workspace started life with a NULL
// inbound secret. The shared-mint refactor closes the
// new-workspace gap; this lazy-heal closes the existing-
// workspace gap without requiring a destructive
// reprovision.
//
// Why the request still 503s: the workspace's local
// /configs/.platform_inbound_secret is also empty until
// the next /registry/register response (registry.go:344-
// 362) propagates the freshly-minted secret. The user
// needs to retry once the workspace's heartbeat picks up
// the new secret (typically <30s).
minted, mintErr := wsauth.IssuePlatformInboundSecret(ctx, db.DB, workspaceID)
if mintErr != nil {
log.Printf("chat_files Upload: lazy-heal mint failed for %s: %v", workspaceID, mintErr)
c.JSON(http.StatusServiceUnavailable, gin.H{
"error": "workspace not yet enrolled in v2 upload (RFC #2312)",
"detail": "Failed to mint inbound secret. Reprovision the workspace if this persists.",
})
return
}
_ = minted // platform now has it; workspace picks up next heartbeat
log.Printf("chat_files Upload: lazy-healed platform_inbound_secret for %s — retry once workspace re-registers (#2312 backfill)", workspaceID)
c.JSON(http.StatusServiceUnavailable, gin.H{
"error": "workspace not yet enrolled in v2 upload (RFC #2312)",
"detail": "Reprovisioning the workspace will mint the platform_inbound_secret it's missing.",
"error": "workspace re-registering — please retry in 30 seconds",
"detail": "Inbound secret was just minted. Workspace will pick it up on its next heartbeat.",
"retry_after_seconds": 30,
})
return
}
@ -324,10 +346,25 @@ func (h *ChatFilesHandler) Download(c *gin.Context) {
secret, err := wsauth.ReadPlatformInboundSecret(ctx, db.DB, workspaceID)
if err != nil {
if errors.Is(err, wsauth.ErrNoInboundSecret) {
log.Printf("chat_files Download: no platform_inbound_secret for %s — workspace needs reprovision (#2312)", workspaceID)
// Lazy-heal — same shape as the upload handler. Mint the
// secret now; workspace picks it up on next heartbeat.
// User retries in ~30s. See chat_files.go Upload handler
// for full rationale.
minted, mintErr := wsauth.IssuePlatformInboundSecret(ctx, db.DB, workspaceID)
if mintErr != nil {
log.Printf("chat_files Download: lazy-heal mint failed for %s: %v", workspaceID, mintErr)
c.JSON(http.StatusServiceUnavailable, gin.H{
"error": "workspace not yet enrolled in v2 download (RFC #2312)",
"detail": "Failed to mint inbound secret. Reprovision the workspace if this persists.",
})
return
}
_ = minted
log.Printf("chat_files Download: lazy-healed platform_inbound_secret for %s — retry once workspace re-registers (#2312 backfill)", workspaceID)
c.JSON(http.StatusServiceUnavailable, gin.H{
"error": "workspace not yet enrolled in v2 download (RFC #2312)",
"detail": "Reprovisioning the workspace will mint the platform_inbound_secret it's missing.",
"error": "workspace re-registering — please retry in 30 seconds",
"detail": "Inbound secret was just minted. Workspace will pick it up on its next heartbeat.",
"retry_after_seconds": 30,
})
return
}

View File

@ -28,130 +28,19 @@ func (h *WorkspaceHandler) provisionWorkspaceOpts(workspaceID, templatePath stri
ctx, cancel := context.WithTimeout(context.Background(), provisioner.ProvisionTimeout)
defer cancel()
// Load global secrets first, then workspace-specific secrets (which override globals).
envVars := map[string]string{}
// 1. Global secrets (platform-wide defaults). Uses DecryptVersioned
// so plaintext rows written before encryption was enabled (#85)
// keep working. A decrypt failure aborts provisioning — silent skip
// used to manifest as opaque "missing OAuth token" preflight crashes.
globalRows, globalErr := db.DB.QueryContext(ctx,
`SELECT key, encrypted_value, encryption_version FROM global_secrets`)
if globalErr == nil {
defer globalRows.Close()
for globalRows.Next() {
var k string
var v []byte
var ver int
if globalRows.Scan(&k, &v, &ver) == nil {
decrypted, decErr := crypto.DecryptVersioned(v, ver)
if decErr != nil {
log.Printf("Provisioner: FATAL — failed to decrypt global secret %s (version=%d): %v — aborting provision of workspace %s", k, ver, decErr, workspaceID)
h.broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_PROVISION_FAILED", workspaceID, map[string]interface{}{
"error": "failed to decrypt global secret",
})
db.DB.ExecContext(ctx, `UPDATE workspaces SET status = 'failed', updated_at = now() WHERE id = $1`, workspaceID)
return
}
envVars[k] = string(decrypted)
}
}
}
// 2. Workspace-specific secrets (override globals with same key)
rows, err := db.DB.QueryContext(ctx,
`SELECT key, encrypted_value, encryption_version FROM workspace_secrets WHERE workspace_id = $1`, workspaceID)
if err == nil {
defer rows.Close()
for rows.Next() {
var k string
var v []byte
var ver int
if rows.Scan(&k, &v, &ver) == nil {
decrypted, decErr := crypto.DecryptVersioned(v, ver)
if decErr != nil {
log.Printf("Provisioner: FATAL — failed to decrypt workspace secret %s (version=%d) for %s: %v — aborting provision", k, ver, workspaceID, decErr)
h.broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_PROVISION_FAILED", workspaceID, map[string]interface{}{
"error": "failed to decrypt workspace secret",
})
db.DB.ExecContext(ctx, `UPDATE workspaces SET status = 'failed', updated_at = now() WHERE id = $1`, workspaceID)
return
}
envVars[k] = string(decrypted)
}
}
}
pluginsPath, _ := filepath.Abs(filepath.Join(h.configsDir, "..", "plugins"))
awarenessNamespace := h.loadAwarenessNamespace(ctx, workspaceID)
// Per-agent git identity (Option 3 of agent-separation rollout).
// Sets GIT_AUTHOR_* / GIT_COMMITTER_* so commits from each workspace
// carry a distinct author in `git log` / `git blame` — instead of
// every agent appearing as whoever the shared PAT belongs to. PR +
// issue authorship is still tied to GITHUB_TOKEN (shared PAT); that
// gets solved by the GitHub App migration (Option 1, follow-up PR).
// Runs after secret loads so an operator can still override via a
// workspace_secret named GIT_AUTHOR_NAME if they want custom identity.
applyAgentGitIdentity(envVars, payload.Name)
applyRuntimeModelEnv(envVars, payload.Runtime, payload.Model)
// Propagate the workspace's role into env so role-aware plugins
// (gh-identity — molecule-core#1957) can read it without the
// plugin interface having to carry the full payload. Role is
// cosmetic metadata — no auth weight on it — safe to surface as env.
if payload.Role != "" {
envVars["MOLECULE_AGENT_ROLE"] = payload.Role
}
// Plugin extension point: run any registered EnvMutators (e.g.
// github-app-auth, vault-secrets) AFTER built-in identity injection so
// plugins can override or augment GIT_AUTHOR_*, GITHUB_TOKEN, etc.
// A failure here aborts provisioning — a missing GitHub App token
// would manifest later as opaque "git push 401" loops, and the agent
// never recovers. Failing fast here surfaces the cause to the operator.
if err := h.envMutators.Run(ctx, workspaceID, envVars); err != nil {
log.Printf("Provisioner: env mutator chain failed for %s: %v", workspaceID, err)
// F1086 / #1206: broadcast and db last_sample_error use generic messages —
// env mutator errors (missing tokens, vault paths, etc.) can include
// internal credential URIs and file paths that must not reach the caller.
h.broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_PROVISION_FAILED", workspaceID, map[string]interface{}{
"error": "plugin env mutator chain failed",
})
if _, dbErr := db.DB.ExecContext(ctx,
`UPDATE workspaces SET status = 'failed', last_sample_error = $2, updated_at = now() WHERE id = $1`,
workspaceID, "plugin env mutator chain failed"); dbErr != nil {
log.Printf("Provisioner: failed to mark workspace %s as failed after mutator error: %v", workspaceID, dbErr)
}
prepared, abort := h.prepareProvisionContext(ctx, workspaceID, templatePath, configFiles, payload, resetClaudeSession)
if prepared == nil {
log.Printf("Provisioner: prepare failed for %s: %s", workspaceID, abort.Msg)
h.markProvisionFailed(ctx, workspaceID, abort.Msg, abort.Extra)
return
}
// Preflight: refuse to launch when config.yaml declares required env vars
// that are not set. Without this, a missing CLAUDE_CODE_OAUTH_TOKEN (or
// similar) crashes the in-container preflight, the container never calls
// /registry/register, and the workspace sits in `provisioning` until a
// sweeper flips it or the user retries. Failing fast here gives the user
// an immediate, actionable error in the Events tab.
if missing := missingRequiredEnv(configFiles, envVars); len(missing) > 0 {
msg := formatMissingEnvError(missing)
log.Printf("Provisioner: %s (workspace=%s)", msg, workspaceID)
if _, dbErr := db.DB.ExecContext(ctx,
`UPDATE workspaces SET status = 'failed', last_sample_error = $2, updated_at = now() WHERE id = $1`,
workspaceID, msg); dbErr != nil {
log.Printf("Provisioner: failed to mark workspace %s as failed: %v", workspaceID, dbErr)
}
h.broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_PROVISION_FAILED", workspaceID, map[string]interface{}{
"error": msg,
"missing": missing,
})
return
}
cfg := h.buildProvisionerConfig(workspaceID, templatePath, configFiles, payload, envVars, pluginsPath, awarenessNamespace)
cfg.ResetClaudeSession = resetClaudeSession // #12
cfg := prepared.Config
// Preflight #17: detect + auto-recover the "empty config volume" crashloop.
// Docker-specific — SaaS mode delegates volume management to the
// control-plane EC2 launcher and never has a local Docker volume to
// probe. Runs AFTER prepareProvisionContext because the recovered
// template still needs the same env-and-cfg surface the prepare built.
//
// When the caller supplies neither a template dir nor in-memory configFiles
// (the auto-restart path), probe the existing Docker named volume. If the
@ -194,7 +83,7 @@ func (h *WorkspaceHandler) provisionWorkspaceOpts(workspaceID, templatePath stri
workspaceID, filepath.Base(runtimeTemplate))
templatePath = runtimeTemplate
// Rebuild cfg with the recovered template path so Start() sees it.
cfg = h.buildProvisionerConfig(workspaceID, templatePath, configFiles, payload, envVars, pluginsPath, awarenessNamespace)
cfg = h.buildProvisionerConfig(workspaceID, templatePath, configFiles, payload, prepared.EnvVars, prepared.PluginsPath, prepared.AwarenessNamespace)
cfg.ResetClaudeSession = resetClaudeSession
recovered = true
break
@ -209,46 +98,27 @@ func (h *WorkspaceHandler) provisionWorkspaceOpts(workspaceID, templatePath stri
if !recovered {
msg := fmt.Sprintf("cannot start workspace %s: no config.yaml source and config volume is empty — delete the workspace or provide a template", workspaceID)
log.Printf("Provisioner: %s", msg)
if _, dbErr := db.DB.ExecContext(ctx,
`UPDATE workspaces SET status = 'failed', last_sample_error = $2, updated_at = now() WHERE id = $1`,
workspaceID, msg); dbErr != nil {
log.Printf("Provisioner: failed to mark workspace %s as failed: %v", workspaceID, dbErr)
}
h.broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_PROVISION_FAILED", workspaceID, map[string]interface{}{
"error": msg,
})
h.markProvisionFailed(ctx, workspaceID, msg, nil)
return
}
}
}
// Issue/rotate the workspace auth token and inject the plaintext into the
// config volume so the workspace always has a valid bearer credential on
// disk, even after a container rebuild wiped the volume (issue #418).
//
// We must rotate (revoke-then-issue) rather than reuse because the DB only
// stores sha256(plaintext) — we cannot reconstruct the original token to
// write it back. The new plaintext is written into /configs/.auth_token via
// WriteFilesToContainer, which runs immediately after ContainerStart and
// wins the race against the Python adapter's startup time (~1-2 s).
h.issueAndInjectToken(ctx, workspaceID, &cfg)
h.issueAndInjectInboundSecret(ctx, workspaceID, &cfg)
// Issue/rotate the workspace auth token + platform→workspace inbound secret
// and inject both into the config volume. See mintWorkspaceSecrets doc for
// the shared invariant — every provision path MUST mint here. Plaintext
// is written into /configs/.auth_token + /configs/.platform_inbound_secret
// via WriteFilesToContainer, which runs immediately after ContainerStart
// and wins the race against the Python adapter's startup time (~1-2 s).
h.mintWorkspaceSecrets(ctx, workspaceID, &cfg)
url, err := h.provisioner.Start(ctx, cfg)
if err != nil {
// F1086 / #1206: persist a generic message so the canvas and
// GET /workspaces/:id expose something actionable without leaking
// docker/error internals (image pull messages, volume paths, etc.).
errMsg := "workspace start failed"
log.Printf("Provisioner: %s for %s: %v", errMsg, workspaceID, err)
if _, dbErr := db.DB.ExecContext(ctx,
`UPDATE workspaces SET status = 'failed', last_sample_error = $2, updated_at = now() WHERE id = $1`,
workspaceID, "workspace start failed"); dbErr != nil {
log.Printf("Provisioner: failed to mark workspace %s as failed: %v", workspaceID, dbErr)
}
h.broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_PROVISION_FAILED", workspaceID, map[string]interface{}{
"error": "workspace start failed",
})
log.Printf("Provisioner: workspace start failed for %s: %v", workspaceID, err)
h.markProvisionFailed(ctx, workspaceID, "workspace start failed", nil)
} else if url != "" {
// Pre-store the host-accessible URL (http://127.0.0.1:<port>) so the A2A proxy can reach the container.
// The registry's ON CONFLICT preserves URLs starting with http://127.0.0.1 when the agent self-registers.
@ -706,6 +576,13 @@ func applyRuntimeModelEnv(envVars map[string]string, runtime, model string) {
// loadWorkspaceSecrets loads global + workspace-specific secrets into a map.
// Returns nil map + error string on decrypt failure. Shared by both Docker
// and control plane provisioning paths to avoid duplication.
//
// F1086 / #1206: the returned error string is the SAFE-CANNED message that
// gets persisted to workspaces.last_sample_error AND broadcast as the
// WORKSPACE_PROVISION_FAILED payload. Internal detail (the secret key name,
// the encryption version, the decrypt-error text) is logged here, never
// returned to the caller, so it can't leak via the canvas event stream
// (cf. TestProvisionWorkspace_NoInternalErrorsInBroadcast).
func loadWorkspaceSecrets(ctx context.Context, workspaceID string) (map[string]string, string) {
envVars := map[string]string{}
globalRows, globalErr := db.DB.QueryContext(ctx,
@ -719,7 +596,8 @@ func loadWorkspaceSecrets(ctx context.Context, workspaceID string) (map[string]s
if globalRows.Scan(&k, &v, &ver) == nil {
decrypted, decErr := crypto.DecryptVersioned(v, ver)
if decErr != nil {
return nil, fmt.Sprintf("cannot decrypt global secret %s: %v", k, decErr)
log.Printf("Provisioner: FATAL — failed to decrypt global secret %s (version=%d): %v — aborting provision of workspace %s", k, ver, decErr, workspaceID)
return nil, "failed to decrypt global secret"
}
envVars[k] = string(decrypted)
}
@ -736,7 +614,8 @@ func loadWorkspaceSecrets(ctx context.Context, workspaceID string) (map[string]s
if wsRows.Scan(&k, &v, &ver) == nil {
decrypted, decErr := crypto.DecryptVersioned(v, ver)
if decErr != nil {
return nil, fmt.Sprintf("cannot decrypt workspace secret %s: %v", k, decErr)
log.Printf("Provisioner: FATAL — failed to decrypt workspace secret %s (version=%d) for %s: %v — aborting provision", k, ver, workspaceID, decErr)
return nil, "failed to decrypt workspace secret"
}
envVars[k] = string(decrypted)
}
@ -746,53 +625,43 @@ func loadWorkspaceSecrets(ctx context.Context, workspaceID string) (map[string]s
}
// provisionWorkspaceCP provisions a workspace via the control plane API.
//
// Mode-specific work this function owns: cpProv.Start (delegates EC2
// launch to control plane) + persist instance_id in DB. The shared
// setup (secrets, env mutators, mint of auth_token + inbound_secret)
// lives in prepareProvisionContext + mintWorkspaceSecrets and is
// called by both this function and the Docker-mode counterpart.
//
// Pre-#2026-04-30: this function did NOT call mintWorkspaceSecrets.
// That left every prod workspace with a NULL platform_inbound_secret
// column → 503 on every chat upload (RFC #2312). The bug shipped
// because the Docker and SaaS provision paths had drifted: Docker
// got the mint when #2312 landed; SaaS was missed. Refactored to
// share so the next mint added can't be silently forgotten on one
// side.
func (h *WorkspaceHandler) provisionWorkspaceCP(workspaceID, templatePath string, configFiles map[string][]byte, payload models.CreateWorkspacePayload) {
ctx, cancel := context.WithTimeout(context.Background(), provisioner.ProvisionTimeout)
defer cancel()
envVars, decryptErr := loadWorkspaceSecrets(ctx, workspaceID)
if decryptErr != "" {
log.Printf("CPProvisioner: %s for %s", decryptErr, workspaceID)
db.DB.ExecContext(ctx, `UPDATE workspaces SET status = 'failed', last_sample_error = $2, updated_at = now() WHERE id = $1`,
workspaceID, decryptErr)
prepared, abort := h.prepareProvisionContext(ctx, workspaceID, templatePath, configFiles, payload, false)
if prepared == nil {
log.Printf("CPProvisioner: prepare failed for %s: %s", workspaceID, abort.Msg)
h.markProvisionFailed(ctx, workspaceID, abort.Msg, abort.Extra)
return
}
applyAgentGitIdentity(envVars, payload.Name)
applyRuntimeModelEnv(envVars, payload.Runtime, payload.Model)
// Propagate role for role-aware plugins (#1957). See provisionWorkspace
// above for rationale.
if payload.Role != "" {
envVars["MOLECULE_AGENT_ROLE"] = payload.Role
}
if err := h.envMutators.Run(ctx, workspaceID, envVars); err != nil {
log.Printf("CPProvisioner: env mutator failed for %s: %v", workspaceID, err)
// F1086 / #1206: env mutator errors (missing tokens, vault paths) must not
// leak into last_sample_error — use generic message.
db.DB.ExecContext(ctx, `UPDATE workspaces SET status = 'failed', last_sample_error = $2, updated_at = now() WHERE id = $1`,
workspaceID, "plugin env mutator chain failed")
return
}
// Mint the workspace's auth_token + platform_inbound_secret now,
// before cpProv.Start. Both modes write to the DB column; the
// workspace receives the plaintext via /registry/register response
// (registry.go:344-362) on its first heartbeat after boot.
h.mintWorkspaceSecrets(ctx, workspaceID, &prepared.Config)
cfg := provisioner.WorkspaceConfig{
WorkspaceID: workspaceID,
Tier: payload.Tier,
Runtime: payload.Runtime,
EnvVars: envVars,
PlatformURL: h.platformURL,
}
machineID, err := h.cpProv.Start(ctx, cfg)
machineID, err := h.cpProv.Start(ctx, prepared.Config)
if err != nil {
// F1086 / #1206: CP errors can include machine type, AMI IDs, VPC
// paths — use generic message for broadcast and last_sample_error.
errMsg := "workspace start failed"
log.Printf("CPProvisioner: %s for %s: %v", errMsg, workspaceID, err)
h.broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_PROVISION_FAILED", workspaceID, map[string]interface{}{
"error": "provisioning failed",
})
db.DB.ExecContext(ctx, `UPDATE workspaces SET status = 'failed', last_sample_error = $2, updated_at = now() WHERE id = $1`,
workspaceID, "provisioning failed")
log.Printf("CPProvisioner: workspace start failed for %s: %v", workspaceID, err)
h.markProvisionFailed(ctx, workspaceID, "provisioning failed", nil)
return
}
@ -809,13 +678,4 @@ func (h *WorkspaceHandler) provisionWorkspaceCP(workspaceID, templatePath string
}
log.Printf("CPProvisioner: workspace %s started as machine %s via control plane", workspaceID, machineID)
// Token issuance is deliberately deferred to the workspace's first
// /registry/register call. Minting here without also delivering the
// plaintext to the workspace (via user-data or a follow-up callback)
// would leave a live token in DB that the workspace has no copy of —
// RegistryHandler.requireWorkspaceToken would then 401 every
// /registry/register attempt because the workspace is no longer in the
// "no live tokens → bootstrap-allowed" state. The register handler
// already mints a token on first successful register and returns it in
// the response body for the workspace to persist.
}

View File

@ -0,0 +1,181 @@
package handlers
// workspace_provision_shared.go — mode-agnostic provision-time
// orchestration that BOTH provisionWorkspaceOpts (Docker) and
// provisionWorkspaceCP (SaaS) call. Extracted because the original
// per-mode functions had drifted: the SaaS path forgot to call
// issueAndInjectInboundSecret, which produced silent-503 chat upload
// errors for every prod workspace (RFC #2312 — discovered 2026-04-30).
//
// The drift class this module closes: when a new provision-time setup
// step is added, it goes in ONE place and both modes pick it up. New
// steps that legitimately differ per mode stay in the per-mode
// functions and are explicitly documented there.
//
// What's shared (this file):
// - Loading secrets (global + workspace) from Postgres
// - Applying git identity, runtime model env, role propagation
// - Running env mutators
// - Preflight: missing required env
// - Building provisioner.WorkspaceConfig
// - Minting auth_token + platform_inbound_secret (#2312)
//
// What's mode-specific (kept in the per-mode functions):
// - Docker: empty-config-volume preflight + auto-recover via
// runtime template (#1858), then provisioner.Start with local
// Docker daemon, then WriteFilesToContainer for config volume.
// - SaaS: cpProv.Start which delegates EC2 launch to control plane,
// then persist returned instance_id, then defer .auth_token /
// .platform_inbound_secret delivery to the workspace's first
// /registry/register response (registry.go:344-362).
//
// Architectural test (workspace_provision_shared_test.go) asserts
// every code path that takes a workspaceID and starts a provision
// MUST call mintWorkspaceSecrets — same shape as the
// audit-coverage AST gate from #335.
import (
"context"
"log"
"path/filepath"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/models"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/provisioner"
)
// preparedProvisionContext carries the computed env + cfg through
// the per-mode provisioner-Start step. Returned from
// prepareProvisionContext when the caller proceeds; nil + non-empty
// abort message when the caller must mark the workspace failed.
type preparedProvisionContext struct {
	// EnvVars is the merged secret map (global + workspace, via
	// loadWorkspaceSecrets) after git identity, runtime/model env,
	// role propagation, and the env-mutator chain have all run.
	EnvVars map[string]string
	// PluginsPath is the absolute plugins dir derived from
	// h.configsDir; best-effort — the filepath.Abs error is ignored.
	PluginsPath string
	// AwarenessNamespace is the value loadAwarenessNamespace
	// returned for this workspace.
	AwarenessNamespace string
	// Config is the fully-built provisioner.WorkspaceConfig with
	// ResetClaudeSession already applied.
	Config provisioner.WorkspaceConfig
}
// provisionAbort describes why prepareProvisionContext refused to
// produce a context. Msg is the caller-safe summary that gets
// persisted to workspaces.last_sample_error AND broadcast. Extra
// is the structured payload to surface alongside (e.g. the missing
// env list for the missing-env failure class). Both fields are
// zero-valued on the success return.
type provisionAbort struct {
	// Msg is safe to show to callers/operators; internal detail is
	// logged at the failure site, never placed here.
	Msg string
	// Extra, when non-nil, is passed through to the broadcast payload
	// (markProvisionFailed fills in an "error" key from Msg if absent).
	Extra map[string]interface{}
}
// prepareProvisionContext does the mode-agnostic setup work that
// both Docker and SaaS provisioners need before their respective
// Start() call. Returns the prepared cfg or an abort message that
// the caller MUST broadcast as WORKSPACE_PROVISION_FAILED and
// persist as workspaces.last_sample_error.
//
// The function does NOT broadcast / DB-write the failure itself —
// failure surface is mode-aware (Docker logs differ from SaaS logs
// in observed format, and the failure-class taxonomy is shared
// between modes but the broadcast key may evolve per mode in the
// future). It DOES log internal failure detail here (same pattern
// as loadWorkspaceSecrets) so the caller-safe abort Msg can stay
// generic without losing ops diagnostics.
func (h *WorkspaceHandler) prepareProvisionContext(
	ctx context.Context,
	workspaceID, templatePath string,
	configFiles map[string][]byte,
	payload models.CreateWorkspacePayload,
	resetClaudeSession bool,
) (*preparedProvisionContext, *provisionAbort) {
	envVars, decryptErr := loadWorkspaceSecrets(ctx, workspaceID)
	if decryptErr != "" {
		// loadWorkspaceSecrets already logged the internal detail;
		// decryptErr is the safe canned message.
		return nil, &provisionAbort{Msg: decryptErr}
	}
	pluginsPath, _ := filepath.Abs(filepath.Join(h.configsDir, "..", "plugins"))
	awarenessNamespace := h.loadAwarenessNamespace(ctx, workspaceID)
	// Per-agent git identity (#1957) — must run after secret loads so
	// a workspace_secret named GIT_AUTHOR_NAME can override.
	applyAgentGitIdentity(envVars, payload.Name)
	applyRuntimeModelEnv(envVars, payload.Runtime, payload.Model)
	if payload.Role != "" {
		envVars["MOLECULE_AGENT_ROLE"] = payload.Role
	}
	// Plugin extension point: env mutators run AFTER built-in identity
	// injection so plugins can override or augment identity vars.
	if err := h.envMutators.Run(ctx, workspaceID, envVars); err != nil {
		// F1086 / #1206: mutator errors can carry credential URIs and
		// internal paths — log the detail here, return only the
		// generic message to the caller.
		log.Printf("prepareProvisionContext: env mutator chain failed for %s: %v", workspaceID, err)
		return nil, &provisionAbort{Msg: "plugin env mutator chain failed"}
	}
	// Preflight #5: refuse to launch when config.yaml declares required
	// env vars that are not set. Skipped in SaaS mode when configFiles
	// is nil (CP-mode's cfg is built without local config bytes — the
	// CP-side launches the workspace with envVars but doesn't inspect
	// the config.yaml the same way Docker does). Future: lift the
	// preflight to a pure-data check that doesn't need configFiles
	// bytes, so SaaS mode can run it too.
	if configFiles != nil {
		if missing := missingRequiredEnv(configFiles, envVars); len(missing) > 0 {
			msg := formatMissingEnvError(missing)
			return nil, &provisionAbort{
				Msg:   msg,
				Extra: map[string]interface{}{"error": msg, "missing": missing},
			}
		}
	}
	cfg := h.buildProvisionerConfig(workspaceID, templatePath, configFiles, payload, envVars, pluginsPath, awarenessNamespace)
	cfg.ResetClaudeSession = resetClaudeSession
	return &preparedProvisionContext{
		EnvVars:            envVars,
		PluginsPath:        pluginsPath,
		AwarenessNamespace: awarenessNamespace,
		Config:             cfg,
	}, nil
}
// mintWorkspaceSecrets issues + persists the workspace auth token
// AND the platform→workspace inbound secret (#2312). Both modes MUST
// call this — Docker mints + writes to local config volume; SaaS
// mints and persists only to the DB column (the workspace fetches
// via /registry/register response).
//
// Pre-2026-04-30: the SaaS path (provisionWorkspaceCP) was missing
// the inbound-secret mint, leaving every prod workspace with a NULL
// platform_inbound_secret column and 503-ing every chat upload. This
// extracted helper is the structural fix — both paths share one
// function so adding a new mint can't be silently forgotten on one
// side.
//
// Failure model: best-effort. Each underlying issue function logs +
// returns silently on its own failure (token mint failure → workspace
// 401s on first /internal call; secret mint failure → first chat
// upload 503s with the same message we used to surface for old-
// workspaces). The shared helper does NOT abort the provision because
// the workspace can still come up and serve non-internal traffic; the
// 401/503 surfaces the missing secret loudly when the user actually
// tries to use the affected feature.
func (h *WorkspaceHandler) mintWorkspaceSecrets(ctx context.Context, workspaceID string, cfg *provisioner.WorkspaceConfig) {
	// Workspace→platform bearer credential (issue #418 lineage).
	h.issueAndInjectToken(ctx, workspaceID, cfg)
	// Platform→workspace inbound secret (#2312).
	h.issueAndInjectInboundSecret(ctx, workspaceID, cfg)
}
// markProvisionFailed is the standard "abort with message" path used
// by both provision modes: one WORKSPACE_PROVISION_FAILED broadcast
// plus one status='failed' row update, in a single call so the
// failure shape stays consistent across modes.
//
// extra may be nil (a {"error": msg} payload is synthesized) or a
// structured payload; when the caller supplied a payload without an
// "error" key, msg is filled in under that key.
func (h *WorkspaceHandler) markProvisionFailed(ctx context.Context, workspaceID, msg string, extra map[string]interface{}) {
	payload := extra
	if payload == nil {
		payload = map[string]interface{}{"error": msg}
	} else if _, ok := payload["error"]; !ok {
		payload["error"] = msg
	}
	h.broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_PROVISION_FAILED", workspaceID, payload)
	_, dbErr := db.DB.ExecContext(ctx,
		`UPDATE workspaces SET status = 'failed', last_sample_error = $2, updated_at = now() WHERE id = $1`,
		workspaceID, msg)
	if dbErr != nil {
		// Non-fatal: the broadcast already fired, the operator sees the
		// failure event in the canvas. The DB row stays at whatever
		// status it had — provisioning event log is the source of truth.
		log.Printf("markProvisionFailed: db update failed for %s: %v", workspaceID, dbErr)
	}
}

View File

@ -0,0 +1,187 @@
package handlers
// workspace_provision_shared_test.go — architectural test that pins
// the invariant the shared-prepare refactor relies on: every code
// path that provisions a workspace MUST call mintWorkspaceSecrets.
//
// Closes the drift class that produced the 2026-04-30 RFC #2312
// silent-503 bug. Pre-fix: provisionWorkspaceCP forgot to mint
// platform_inbound_secret because the SaaS path was implemented
// after the Docker path and the original mint call wasn't carried
// forward. Both modes now share mintWorkspaceSecrets via this
// extracted helper; this test ensures it stays that way.
//
// Same shape as the audit-coverage gate from #335 (#2343 PR-5).
// If this test fails: either add mintWorkspaceSecrets to the new
// provision function, OR (if the function legitimately should NOT
// mint) add it to provisionExemptFunctions with a one-line
// justification.
import (
"context"
"go/ast"
"go/parser"
"go/token"
"os"
"path/filepath"
"strings"
"testing"
"github.com/DATA-DOG/go-sqlmock"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/provisioner"
)
// provisionFunctions are the set of functions that provision a
// workspace and therefore MUST call mintWorkspaceSecrets. Detected
// by name match (handler-method pattern); add a new entry whenever
// a new provision path is introduced.
var provisionFunctions = map[string]bool{
	"provisionWorkspace":     true, // wrapper — exempted below; its delegate mints
	"provisionWorkspaceOpts": true, // Docker-mode provision path
	"provisionWorkspaceCP":   true, // SaaS / control-plane provision path
}
// provisionExemptFunctions are functions whose name pattern matches
// the provision-function set but legitimately don't call
// mintWorkspaceSecrets (e.g., the wrapper provisionWorkspace which
// delegates to provisionWorkspaceOpts — the actual mint happens in
// the delegate, not the wrapper).
//
// Note: the gate consults this map only AFTER a provisionFunctions
// name match, so an entry here that is not also in provisionFunctions
// is inert. The value is the human-readable justification.
var provisionExemptFunctions = map[string]string{
	"provisionWorkspace": "thin wrapper that delegates to provisionWorkspaceOpts; the delegate mints",
}
// TestProvisionFunctions_AllCallMintWorkspaceSecrets asserts every
// non-exempt provision function in this package calls
// mintWorkspaceSecrets at least once in its body.
//
// The check is presence-in-function, not pairing-by-line — same
// rationale as the audit-coverage gate from #335.
func TestProvisionFunctions_AllCallMintWorkspaceSecrets(t *testing.T) {
	t.Parallel()
	fset := token.NewFileSet()
	entries, err := os.ReadDir(".")
	if err != nil {
		t.Fatalf("read dir: %v", err)
	}
	type missing struct {
		file string
		line int
		fn   string
	}
	var violations []missing
	for _, entry := range entries {
		name := entry.Name()
		// Only non-test Go sources in this package are gated.
		if entry.IsDir() || !strings.HasSuffix(name, ".go") || strings.HasSuffix(name, "_test.go") {
			continue
		}
		f, err := parser.ParseFile(fset, filepath.Join(".", name), nil, 0)
		if err != nil {
			t.Fatalf("parse %s: %v", name, err)
		}
		for _, decl := range f.Decls {
			fn, ok := decl.(*ast.FuncDecl)
			if !ok || fn.Body == nil {
				continue
			}
			if !provisionFunctions[fn.Name.Name] {
				continue
			}
			if _, exempt := provisionExemptFunctions[fn.Name.Name]; exempt {
				continue
			}
			if !callsMintWorkspaceSecrets(fn.Body) {
				violations = append(violations, missing{
					file: name,
					line: fset.Position(fn.Pos()).Line,
					fn:   fn.Name.Name,
				})
			}
		}
	}
	for _, v := range violations {
		// Message mirrors the remedy documented in this file's header:
		// add the mint call, or exempt the function with justification.
		// (Previously this said "if audit is genuinely impossible" —
		// copy-paste residue from the #335 audit gate this test was
		// modeled on.)
		t.Errorf(
			"%s:%d %s does not call mintWorkspaceSecrets — every provision path MUST mint auth_token + platform_inbound_secret. If this function legitimately should NOT mint, add %q to provisionExemptFunctions in workspace_provision_shared_test.go with a one-line justification.",
			v.file, v.line, v.fn, v.fn,
		)
	}
}
// callsMintWorkspaceSecrets walks the function body and reports
// whether mintWorkspaceSecrets is called anywhere — direct call OR
// via a helper. Recursion to helpers is shallow: we only check
// immediate calls in this function's body. The shared-prepare
// refactor centralizes mint in mintWorkspaceSecrets itself, so a
// direct call at the top-level is the expected pattern.
func callsMintWorkspaceSecrets(body *ast.BlockStmt) bool {
found := false
ast.Inspect(body, func(n ast.Node) bool {
if found {
return false
}
call, ok := n.(*ast.CallExpr)
if !ok {
return true
}
sel, ok := call.Fun.(*ast.SelectorExpr)
if !ok {
return true
}
if sel.Sel.Name == "mintWorkspaceSecrets" {
found = true
return false
}
return true
})
return found
}
// TestMintWorkspaceSecrets_PersistsInboundSecretInSaaSMode is the
// behavioral counterpart to the AST gate. It pins the structural
// fix for the 2026-04-30 silent-503 chat upload bug (RFC #2312):
// even in SaaS mode (where Docker file injection is skipped),
// mintWorkspaceSecrets MUST persist platform_inbound_secret to the
// workspaces row so platform-side handlers can read it back.
//
// Pre-fix: provisionWorkspaceCP never called the mint helper, so
// every prod workspace had NULL platform_inbound_secret →
// chat_files Upload returned 503 with "workspace not yet enrolled
// in v2 upload" on every attempt.
//
// NOTE(review): sqlmock expectations are ordered — the token-revoke
// UPDATE must be observed before the secret-persist UPDATE. Do not
// reorder the ExpectExec calls below.
func TestMintWorkspaceSecrets_PersistsInboundSecretInSaaSMode(t *testing.T) {
	// t.Setenv also prevents t.Parallel() here; "saas" is the mode in
	// which Docker file injection is skipped.
	t.Setenv("MOLECULE_DEPLOY_MODE", "saas")
	mock := setupTestDB(t)
	// First underlying call: revoke any existing live tokens. SaaS
	// mode early-returns from issueAndInjectToken right after this,
	// so IssueToken is NOT expected.
	mock.ExpectExec(`UPDATE workspace_auth_tokens SET revoked_at = now\(\) WHERE workspace_id = \$1 AND revoked_at IS NULL`).
		WithArgs("ws-saas-mint").
		WillReturnResult(sqlmock.NewResult(0, 0))
	// Second underlying call: persist the platform_inbound_secret.
	// The structural fix — without this UPDATE, the bug recurs. The
	// secret value itself is random, so match it with AnyArg.
	mock.ExpectExec(`UPDATE workspaces SET platform_inbound_secret = \$1 WHERE id = \$2`).
		WithArgs(sqlmock.AnyArg(), "ws-saas-mint").
		WillReturnResult(sqlmock.NewResult(0, 1))
	handler := NewWorkspaceHandler(&captureBroadcaster{}, nil, "http://localhost:8080", t.TempDir())
	cfg := provisioner.WorkspaceConfig{WorkspaceID: "ws-saas-mint"}
	// Any return value is deliberately unchecked here — the ordered
	// sqlmock expectations below are the assertion. (NOTE(review):
	// confirm the helper's signature if this ever needs an error check.)
	handler.mintWorkspaceSecrets(context.Background(), "ws-saas-mint", &cfg)
	if err := mock.ExpectationsWereMet(); err != nil {
		t.Errorf("sqlmock expectations not met — mintWorkspaceSecrets did not persist platform_inbound_secret in SaaS mode (this is the prod bug recurrence): %v", err)
	}
	// Sanity: SaaS mode must NOT have written .platform_inbound_secret
	// into cfg.ConfigFiles — there's no Docker volume to deliver it.
	if _, present := cfg.ConfigFiles[".platform_inbound_secret"]; present {
		t.Errorf("SaaS mode should not inject .platform_inbound_secret into cfg.ConfigFiles (no Docker volume) — got entry")
	}
	// Same for the workspace auth token file.
	if _, present := cfg.ConfigFiles[".auth_token"]; present {
		t.Errorf("SaaS mode should not inject .auth_token into cfg.ConfigFiles (no Docker volume) — got entry")
	}
}

View File

@ -1137,10 +1137,14 @@ func TestProvisionWorkspace_NoInternalErrorsInBroadcast(t *testing.T) {
WillReturnRows(sqlmock.NewRows([]string{"key", "encrypted_value", "encryption_version"}).
AddRow("FAKE_KEY", []byte("any-bytes"), 99))
// On decrypt failure provisionWorkspace also marks the workspace as
// failed via UPDATE workspaces. Match-anything on the args so the
// test isn't coupled to the exact UPDATE column order.
// failed via UPDATE workspaces. Two args: workspace_id + the
// last_sample_error message ("failed to decrypt global secret").
// Pre-refactor (workspace_provision_shared.go) the decrypt-fail
// path skipped last_sample_error; the shared helper now always
// persists it so users see the failure in the UI without having
// to grep server logs.
mock.ExpectExec(`UPDATE workspaces SET status = 'failed'`).
WithArgs(sqlmock.AnyArg()).
WithArgs(sqlmock.AnyArg(), sqlmock.AnyArg()).
WillReturnResult(sqlmock.NewResult(0, 1))
cap := &captureBroadcaster{}