Merge pull request #2366 from Molecule-AI/auto/workspace-provision-shared
fix(provision): share Docker+SaaS prepare path so both mint workspace secrets (RFC #2312)
This commit is contained in:
commit
b9b0a46f2e
@ -192,13 +192,35 @@ func (h *ChatFilesHandler) Upload(c *gin.Context) {
|
||||
secret, err := wsauth.ReadPlatformInboundSecret(ctx, db.DB, workspaceID)
|
||||
if err != nil {
|
||||
if errors.Is(err, wsauth.ErrNoInboundSecret) {
|
||||
// Workspace predates migration 044 OR the provisioner's
|
||||
// secret-mint hop failed. Both are operational issues,
|
||||
// not user errors. Log loudly so ops can backfill.
|
||||
log.Printf("chat_files Upload: no platform_inbound_secret for %s — workspace needs reprovision (#2312)", workspaceID)
|
||||
// Lazy-heal: mint the secret now so future requests
|
||||
// succeed. Pre-2026-04-30 the SaaS provision path didn't
|
||||
// mint — every prod workspace started life with a NULL
|
||||
// inbound secret. The shared-mint refactor closes the
|
||||
// new-workspace gap; this lazy-heal closes the existing-
|
||||
// workspace gap without requiring a destructive
|
||||
// reprovision.
|
||||
//
|
||||
// Why the request still 503s: the workspace's local
|
||||
// /configs/.platform_inbound_secret is also empty until
|
||||
// the next /registry/register response (registry.go:344-
|
||||
// 362) propagates the freshly-minted secret. The user
|
||||
// needs to retry once the workspace's heartbeat picks up
|
||||
// the new secret (typically <30s).
|
||||
minted, mintErr := wsauth.IssuePlatformInboundSecret(ctx, db.DB, workspaceID)
|
||||
if mintErr != nil {
|
||||
log.Printf("chat_files Upload: lazy-heal mint failed for %s: %v", workspaceID, mintErr)
|
||||
c.JSON(http.StatusServiceUnavailable, gin.H{
|
||||
"error": "workspace not yet enrolled in v2 upload (RFC #2312)",
|
||||
"detail": "Failed to mint inbound secret. Reprovision the workspace if this persists.",
|
||||
})
|
||||
return
|
||||
}
|
||||
_ = minted // platform now has it; workspace picks up next heartbeat
|
||||
log.Printf("chat_files Upload: lazy-healed platform_inbound_secret for %s — retry once workspace re-registers (#2312 backfill)", workspaceID)
|
||||
c.JSON(http.StatusServiceUnavailable, gin.H{
|
||||
"error": "workspace not yet enrolled in v2 upload (RFC #2312)",
|
||||
"detail": "Reprovisioning the workspace will mint the platform_inbound_secret it's missing.",
|
||||
"error": "workspace re-registering — please retry in 30 seconds",
|
||||
"detail": "Inbound secret was just minted. Workspace will pick it up on its next heartbeat.",
|
||||
"retry_after_seconds": 30,
|
||||
})
|
||||
return
|
||||
}
|
||||
@ -324,10 +346,25 @@ func (h *ChatFilesHandler) Download(c *gin.Context) {
|
||||
secret, err := wsauth.ReadPlatformInboundSecret(ctx, db.DB, workspaceID)
|
||||
if err != nil {
|
||||
if errors.Is(err, wsauth.ErrNoInboundSecret) {
|
||||
log.Printf("chat_files Download: no platform_inbound_secret for %s — workspace needs reprovision (#2312)", workspaceID)
|
||||
// Lazy-heal — same shape as the upload handler. Mint the
|
||||
// secret now; workspace picks it up on next heartbeat.
|
||||
// User retries in ~30s. See chat_files.go Upload handler
|
||||
// for full rationale.
|
||||
minted, mintErr := wsauth.IssuePlatformInboundSecret(ctx, db.DB, workspaceID)
|
||||
if mintErr != nil {
|
||||
log.Printf("chat_files Download: lazy-heal mint failed for %s: %v", workspaceID, mintErr)
|
||||
c.JSON(http.StatusServiceUnavailable, gin.H{
|
||||
"error": "workspace not yet enrolled in v2 download (RFC #2312)",
|
||||
"detail": "Failed to mint inbound secret. Reprovision the workspace if this persists.",
|
||||
})
|
||||
return
|
||||
}
|
||||
_ = minted
|
||||
log.Printf("chat_files Download: lazy-healed platform_inbound_secret for %s — retry once workspace re-registers (#2312 backfill)", workspaceID)
|
||||
c.JSON(http.StatusServiceUnavailable, gin.H{
|
||||
"error": "workspace not yet enrolled in v2 download (RFC #2312)",
|
||||
"detail": "Reprovisioning the workspace will mint the platform_inbound_secret it's missing.",
|
||||
"error": "workspace re-registering — please retry in 30 seconds",
|
||||
"detail": "Inbound secret was just minted. Workspace will pick it up on its next heartbeat.",
|
||||
"retry_after_seconds": 30,
|
||||
})
|
||||
return
|
||||
}
|
||||
|
||||
@ -28,130 +28,19 @@ func (h *WorkspaceHandler) provisionWorkspaceOpts(workspaceID, templatePath stri
|
||||
ctx, cancel := context.WithTimeout(context.Background(), provisioner.ProvisionTimeout)
|
||||
defer cancel()
|
||||
|
||||
// Load global secrets first, then workspace-specific secrets (which override globals).
|
||||
envVars := map[string]string{}
|
||||
|
||||
// 1. Global secrets (platform-wide defaults). Uses DecryptVersioned
|
||||
// so plaintext rows written before encryption was enabled (#85)
|
||||
// keep working. A decrypt failure aborts provisioning — silent skip
|
||||
// used to manifest as opaque "missing OAuth token" preflight crashes.
|
||||
globalRows, globalErr := db.DB.QueryContext(ctx,
|
||||
`SELECT key, encrypted_value, encryption_version FROM global_secrets`)
|
||||
if globalErr == nil {
|
||||
defer globalRows.Close()
|
||||
for globalRows.Next() {
|
||||
var k string
|
||||
var v []byte
|
||||
var ver int
|
||||
if globalRows.Scan(&k, &v, &ver) == nil {
|
||||
decrypted, decErr := crypto.DecryptVersioned(v, ver)
|
||||
if decErr != nil {
|
||||
log.Printf("Provisioner: FATAL — failed to decrypt global secret %s (version=%d): %v — aborting provision of workspace %s", k, ver, decErr, workspaceID)
|
||||
h.broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_PROVISION_FAILED", workspaceID, map[string]interface{}{
|
||||
"error": "failed to decrypt global secret",
|
||||
})
|
||||
db.DB.ExecContext(ctx, `UPDATE workspaces SET status = 'failed', updated_at = now() WHERE id = $1`, workspaceID)
|
||||
return
|
||||
}
|
||||
envVars[k] = string(decrypted)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 2. Workspace-specific secrets (override globals with same key)
|
||||
rows, err := db.DB.QueryContext(ctx,
|
||||
`SELECT key, encrypted_value, encryption_version FROM workspace_secrets WHERE workspace_id = $1`, workspaceID)
|
||||
if err == nil {
|
||||
defer rows.Close()
|
||||
for rows.Next() {
|
||||
var k string
|
||||
var v []byte
|
||||
var ver int
|
||||
if rows.Scan(&k, &v, &ver) == nil {
|
||||
decrypted, decErr := crypto.DecryptVersioned(v, ver)
|
||||
if decErr != nil {
|
||||
log.Printf("Provisioner: FATAL — failed to decrypt workspace secret %s (version=%d) for %s: %v — aborting provision", k, ver, workspaceID, decErr)
|
||||
h.broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_PROVISION_FAILED", workspaceID, map[string]interface{}{
|
||||
"error": "failed to decrypt workspace secret",
|
||||
})
|
||||
db.DB.ExecContext(ctx, `UPDATE workspaces SET status = 'failed', updated_at = now() WHERE id = $1`, workspaceID)
|
||||
return
|
||||
}
|
||||
envVars[k] = string(decrypted)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pluginsPath, _ := filepath.Abs(filepath.Join(h.configsDir, "..", "plugins"))
|
||||
awarenessNamespace := h.loadAwarenessNamespace(ctx, workspaceID)
|
||||
|
||||
// Per-agent git identity (Option 3 of agent-separation rollout).
|
||||
// Sets GIT_AUTHOR_* / GIT_COMMITTER_* so commits from each workspace
|
||||
// carry a distinct author in `git log` / `git blame` — instead of
|
||||
// every agent appearing as whoever the shared PAT belongs to. PR +
|
||||
// issue authorship is still tied to GITHUB_TOKEN (shared PAT); that
|
||||
// gets solved by the GitHub App migration (Option 1, follow-up PR).
|
||||
// Runs after secret loads so an operator can still override via a
|
||||
// workspace_secret named GIT_AUTHOR_NAME if they want custom identity.
|
||||
applyAgentGitIdentity(envVars, payload.Name)
|
||||
applyRuntimeModelEnv(envVars, payload.Runtime, payload.Model)
|
||||
|
||||
// Propagate the workspace's role into env so role-aware plugins
|
||||
// (gh-identity — molecule-core#1957) can read it without the
|
||||
// plugin interface having to carry the full payload. Role is
|
||||
// cosmetic metadata — no auth weight on it — safe to surface as env.
|
||||
if payload.Role != "" {
|
||||
envVars["MOLECULE_AGENT_ROLE"] = payload.Role
|
||||
}
|
||||
|
||||
// Plugin extension point: run any registered EnvMutators (e.g.
|
||||
// github-app-auth, vault-secrets) AFTER built-in identity injection so
|
||||
// plugins can override or augment GIT_AUTHOR_*, GITHUB_TOKEN, etc.
|
||||
// A failure here aborts provisioning — a missing GitHub App token
|
||||
// would manifest later as opaque "git push 401" loops, and the agent
|
||||
// never recovers. Failing fast here surfaces the cause to the operator.
|
||||
if err := h.envMutators.Run(ctx, workspaceID, envVars); err != nil {
|
||||
log.Printf("Provisioner: env mutator chain failed for %s: %v", workspaceID, err)
|
||||
// F1086 / #1206: broadcast and db last_sample_error use generic messages —
|
||||
// env mutator errors (missing tokens, vault paths, etc.) can include
|
||||
// internal credential URIs and file paths that must not reach the caller.
|
||||
h.broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_PROVISION_FAILED", workspaceID, map[string]interface{}{
|
||||
"error": "plugin env mutator chain failed",
|
||||
})
|
||||
if _, dbErr := db.DB.ExecContext(ctx,
|
||||
`UPDATE workspaces SET status = 'failed', last_sample_error = $2, updated_at = now() WHERE id = $1`,
|
||||
workspaceID, "plugin env mutator chain failed"); dbErr != nil {
|
||||
|
||||
log.Printf("Provisioner: failed to mark workspace %s as failed after mutator error: %v", workspaceID, dbErr)
|
||||
}
|
||||
prepared, abort := h.prepareProvisionContext(ctx, workspaceID, templatePath, configFiles, payload, resetClaudeSession)
|
||||
if prepared == nil {
|
||||
log.Printf("Provisioner: prepare failed for %s: %s", workspaceID, abort.Msg)
|
||||
h.markProvisionFailed(ctx, workspaceID, abort.Msg, abort.Extra)
|
||||
return
|
||||
}
|
||||
|
||||
// Preflight: refuse to launch when config.yaml declares required env vars
|
||||
// that are not set. Without this, a missing CLAUDE_CODE_OAUTH_TOKEN (or
|
||||
// similar) crashes the in-container preflight, the container never calls
|
||||
// /registry/register, and the workspace sits in `provisioning` until a
|
||||
// sweeper flips it or the user retries. Failing fast here gives the user
|
||||
// an immediate, actionable error in the Events tab.
|
||||
if missing := missingRequiredEnv(configFiles, envVars); len(missing) > 0 {
|
||||
msg := formatMissingEnvError(missing)
|
||||
log.Printf("Provisioner: %s (workspace=%s)", msg, workspaceID)
|
||||
if _, dbErr := db.DB.ExecContext(ctx,
|
||||
`UPDATE workspaces SET status = 'failed', last_sample_error = $2, updated_at = now() WHERE id = $1`,
|
||||
workspaceID, msg); dbErr != nil {
|
||||
log.Printf("Provisioner: failed to mark workspace %s as failed: %v", workspaceID, dbErr)
|
||||
}
|
||||
h.broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_PROVISION_FAILED", workspaceID, map[string]interface{}{
|
||||
"error": msg,
|
||||
"missing": missing,
|
||||
})
|
||||
return
|
||||
}
|
||||
|
||||
cfg := h.buildProvisionerConfig(workspaceID, templatePath, configFiles, payload, envVars, pluginsPath, awarenessNamespace)
|
||||
cfg.ResetClaudeSession = resetClaudeSession // #12
|
||||
cfg := prepared.Config
|
||||
|
||||
// Preflight #17: detect + auto-recover the "empty config volume" crashloop.
|
||||
// Docker-specific — SaaS mode delegates volume management to the
|
||||
// control-plane EC2 launcher and never has a local Docker volume to
|
||||
// probe. Runs AFTER prepareProvisionContext because the recovered
|
||||
// template still needs the same env-and-cfg surface the prepare built.
|
||||
//
|
||||
// When the caller supplies neither a template dir nor in-memory configFiles
|
||||
// (the auto-restart path), probe the existing Docker named volume. If the
|
||||
@ -194,7 +83,7 @@ func (h *WorkspaceHandler) provisionWorkspaceOpts(workspaceID, templatePath stri
|
||||
workspaceID, filepath.Base(runtimeTemplate))
|
||||
templatePath = runtimeTemplate
|
||||
// Rebuild cfg with the recovered template path so Start() sees it.
|
||||
cfg = h.buildProvisionerConfig(workspaceID, templatePath, configFiles, payload, envVars, pluginsPath, awarenessNamespace)
|
||||
cfg = h.buildProvisionerConfig(workspaceID, templatePath, configFiles, payload, prepared.EnvVars, prepared.PluginsPath, prepared.AwarenessNamespace)
|
||||
cfg.ResetClaudeSession = resetClaudeSession
|
||||
recovered = true
|
||||
break
|
||||
@ -209,46 +98,27 @@ func (h *WorkspaceHandler) provisionWorkspaceOpts(workspaceID, templatePath stri
|
||||
if !recovered {
|
||||
msg := fmt.Sprintf("cannot start workspace %s: no config.yaml source and config volume is empty — delete the workspace or provide a template", workspaceID)
|
||||
log.Printf("Provisioner: %s", msg)
|
||||
if _, dbErr := db.DB.ExecContext(ctx,
|
||||
`UPDATE workspaces SET status = 'failed', last_sample_error = $2, updated_at = now() WHERE id = $1`,
|
||||
workspaceID, msg); dbErr != nil {
|
||||
log.Printf("Provisioner: failed to mark workspace %s as failed: %v", workspaceID, dbErr)
|
||||
}
|
||||
h.broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_PROVISION_FAILED", workspaceID, map[string]interface{}{
|
||||
"error": msg,
|
||||
})
|
||||
h.markProvisionFailed(ctx, workspaceID, msg, nil)
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Issue/rotate the workspace auth token and inject the plaintext into the
|
||||
// config volume so the workspace always has a valid bearer credential on
|
||||
// disk, even after a container rebuild wiped the volume (issue #418).
|
||||
//
|
||||
// We must rotate (revoke-then-issue) rather than reuse because the DB only
|
||||
// stores sha256(plaintext) — we cannot reconstruct the original token to
|
||||
// write it back. The new plaintext is written into /configs/.auth_token via
|
||||
// WriteFilesToContainer, which runs immediately after ContainerStart and
|
||||
// wins the race against the Python adapter's startup time (~1-2 s).
|
||||
h.issueAndInjectToken(ctx, workspaceID, &cfg)
|
||||
h.issueAndInjectInboundSecret(ctx, workspaceID, &cfg)
|
||||
// Issue/rotate the workspace auth token + platform→workspace inbound secret
|
||||
// and inject both into the config volume. See mintWorkspaceSecrets doc for
|
||||
// the shared invariant — every provision path MUST mint here. Plaintext
|
||||
// is written into /configs/.auth_token + /configs/.platform_inbound_secret
|
||||
// via WriteFilesToContainer, which runs immediately after ContainerStart
|
||||
// and wins the race against the Python adapter's startup time (~1-2 s).
|
||||
h.mintWorkspaceSecrets(ctx, workspaceID, &cfg)
|
||||
|
||||
url, err := h.provisioner.Start(ctx, cfg)
|
||||
if err != nil {
|
||||
// F1086 / #1206: persist a generic message so the canvas and
|
||||
// GET /workspaces/:id expose something actionable without leaking
|
||||
// docker/error internals (image pull messages, volume paths, etc.).
|
||||
errMsg := "workspace start failed"
|
||||
log.Printf("Provisioner: %s for %s: %v", errMsg, workspaceID, err)
|
||||
if _, dbErr := db.DB.ExecContext(ctx,
|
||||
`UPDATE workspaces SET status = 'failed', last_sample_error = $2, updated_at = now() WHERE id = $1`,
|
||||
workspaceID, "workspace start failed"); dbErr != nil {
|
||||
log.Printf("Provisioner: failed to mark workspace %s as failed: %v", workspaceID, dbErr)
|
||||
}
|
||||
h.broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_PROVISION_FAILED", workspaceID, map[string]interface{}{
|
||||
"error": "workspace start failed",
|
||||
})
|
||||
log.Printf("Provisioner: workspace start failed for %s: %v", workspaceID, err)
|
||||
h.markProvisionFailed(ctx, workspaceID, "workspace start failed", nil)
|
||||
} else if url != "" {
|
||||
// Pre-store the host-accessible URL (http://127.0.0.1:<port>) so the A2A proxy can reach the container.
|
||||
// The registry's ON CONFLICT preserves URLs starting with http://127.0.0.1 when the agent self-registers.
|
||||
@ -706,6 +576,13 @@ func applyRuntimeModelEnv(envVars map[string]string, runtime, model string) {
|
||||
// loadWorkspaceSecrets loads global + workspace-specific secrets into a map.
|
||||
// Returns nil map + error string on decrypt failure. Shared by both Docker
|
||||
// and control plane provisioning paths to avoid duplication.
|
||||
//
|
||||
// F1086 / #1206: the returned error string is the SAFE-CANNED message that
|
||||
// gets persisted to workspaces.last_sample_error AND broadcast as the
|
||||
// WORKSPACE_PROVISION_FAILED payload. Internal detail (the secret key name,
|
||||
// the encryption version, the decrypt-error text) is logged here, never
|
||||
// returned to the caller, so it can't leak via the canvas event stream
|
||||
// (cf. TestProvisionWorkspace_NoInternalErrorsInBroadcast).
|
||||
func loadWorkspaceSecrets(ctx context.Context, workspaceID string) (map[string]string, string) {
|
||||
envVars := map[string]string{}
|
||||
globalRows, globalErr := db.DB.QueryContext(ctx,
|
||||
@ -719,7 +596,8 @@ func loadWorkspaceSecrets(ctx context.Context, workspaceID string) (map[string]s
|
||||
if globalRows.Scan(&k, &v, &ver) == nil {
|
||||
decrypted, decErr := crypto.DecryptVersioned(v, ver)
|
||||
if decErr != nil {
|
||||
return nil, fmt.Sprintf("cannot decrypt global secret %s: %v", k, decErr)
|
||||
log.Printf("Provisioner: FATAL — failed to decrypt global secret %s (version=%d): %v — aborting provision of workspace %s", k, ver, decErr, workspaceID)
|
||||
return nil, "failed to decrypt global secret"
|
||||
}
|
||||
envVars[k] = string(decrypted)
|
||||
}
|
||||
@ -736,7 +614,8 @@ func loadWorkspaceSecrets(ctx context.Context, workspaceID string) (map[string]s
|
||||
if wsRows.Scan(&k, &v, &ver) == nil {
|
||||
decrypted, decErr := crypto.DecryptVersioned(v, ver)
|
||||
if decErr != nil {
|
||||
return nil, fmt.Sprintf("cannot decrypt workspace secret %s: %v", k, decErr)
|
||||
log.Printf("Provisioner: FATAL — failed to decrypt workspace secret %s (version=%d) for %s: %v — aborting provision", k, ver, workspaceID, decErr)
|
||||
return nil, "failed to decrypt workspace secret"
|
||||
}
|
||||
envVars[k] = string(decrypted)
|
||||
}
|
||||
@ -746,53 +625,43 @@ func loadWorkspaceSecrets(ctx context.Context, workspaceID string) (map[string]s
|
||||
}
|
||||
|
||||
// provisionWorkspaceCP provisions a workspace via the control plane API.
|
||||
//
|
||||
// Mode-specific work this function owns: cpProv.Start (delegates EC2
|
||||
// launch to control plane) + persist instance_id in DB. The shared
|
||||
// setup (secrets, env mutators, mint of auth_token + inbound_secret)
|
||||
// lives in prepareProvisionContext + mintWorkspaceSecrets and is
|
||||
// called by both this function and the Docker-mode counterpart.
|
||||
//
|
||||
// Pre-#2026-04-30: this function did NOT call mintWorkspaceSecrets.
|
||||
// That left every prod workspace with a NULL platform_inbound_secret
|
||||
// column → 503 on every chat upload (RFC #2312). The bug shipped
|
||||
// because the Docker and SaaS provision paths had drifted: Docker
|
||||
// got the mint when #2312 landed; SaaS was missed. Refactored to
|
||||
// share so the next mint added can't be silently forgotten on one
|
||||
// side.
|
||||
func (h *WorkspaceHandler) provisionWorkspaceCP(workspaceID, templatePath string, configFiles map[string][]byte, payload models.CreateWorkspacePayload) {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), provisioner.ProvisionTimeout)
|
||||
defer cancel()
|
||||
|
||||
envVars, decryptErr := loadWorkspaceSecrets(ctx, workspaceID)
|
||||
if decryptErr != "" {
|
||||
log.Printf("CPProvisioner: %s for %s", decryptErr, workspaceID)
|
||||
db.DB.ExecContext(ctx, `UPDATE workspaces SET status = 'failed', last_sample_error = $2, updated_at = now() WHERE id = $1`,
|
||||
workspaceID, decryptErr)
|
||||
prepared, abort := h.prepareProvisionContext(ctx, workspaceID, templatePath, configFiles, payload, false)
|
||||
if prepared == nil {
|
||||
log.Printf("CPProvisioner: prepare failed for %s: %s", workspaceID, abort.Msg)
|
||||
h.markProvisionFailed(ctx, workspaceID, abort.Msg, abort.Extra)
|
||||
return
|
||||
}
|
||||
|
||||
applyAgentGitIdentity(envVars, payload.Name)
|
||||
applyRuntimeModelEnv(envVars, payload.Runtime, payload.Model)
|
||||
// Propagate role for role-aware plugins (#1957). See provisionWorkspace
|
||||
// above for rationale.
|
||||
if payload.Role != "" {
|
||||
envVars["MOLECULE_AGENT_ROLE"] = payload.Role
|
||||
}
|
||||
if err := h.envMutators.Run(ctx, workspaceID, envVars); err != nil {
|
||||
log.Printf("CPProvisioner: env mutator failed for %s: %v", workspaceID, err)
|
||||
// F1086 / #1206: env mutator errors (missing tokens, vault paths) must not
|
||||
// leak into last_sample_error — use generic message.
|
||||
db.DB.ExecContext(ctx, `UPDATE workspaces SET status = 'failed', last_sample_error = $2, updated_at = now() WHERE id = $1`,
|
||||
workspaceID, "plugin env mutator chain failed")
|
||||
return
|
||||
}
|
||||
// Mint the workspace's auth_token + platform_inbound_secret now,
|
||||
// before cpProv.Start. Both modes write to the DB column; the
|
||||
// workspace receives the plaintext via /registry/register response
|
||||
// (registry.go:344-362) on its first heartbeat after boot.
|
||||
h.mintWorkspaceSecrets(ctx, workspaceID, &prepared.Config)
|
||||
|
||||
cfg := provisioner.WorkspaceConfig{
|
||||
WorkspaceID: workspaceID,
|
||||
Tier: payload.Tier,
|
||||
Runtime: payload.Runtime,
|
||||
EnvVars: envVars,
|
||||
PlatformURL: h.platformURL,
|
||||
}
|
||||
|
||||
machineID, err := h.cpProv.Start(ctx, cfg)
|
||||
machineID, err := h.cpProv.Start(ctx, prepared.Config)
|
||||
if err != nil {
|
||||
// F1086 / #1206: CP errors can include machine type, AMI IDs, VPC
|
||||
// paths — use generic message for broadcast and last_sample_error.
|
||||
errMsg := "workspace start failed"
|
||||
log.Printf("CPProvisioner: %s for %s: %v", errMsg, workspaceID, err)
|
||||
h.broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_PROVISION_FAILED", workspaceID, map[string]interface{}{
|
||||
"error": "provisioning failed",
|
||||
})
|
||||
db.DB.ExecContext(ctx, `UPDATE workspaces SET status = 'failed', last_sample_error = $2, updated_at = now() WHERE id = $1`,
|
||||
workspaceID, "provisioning failed")
|
||||
log.Printf("CPProvisioner: workspace start failed for %s: %v", workspaceID, err)
|
||||
h.markProvisionFailed(ctx, workspaceID, "provisioning failed", nil)
|
||||
return
|
||||
}
|
||||
|
||||
@ -809,13 +678,4 @@ func (h *WorkspaceHandler) provisionWorkspaceCP(workspaceID, templatePath string
|
||||
}
|
||||
|
||||
log.Printf("CPProvisioner: workspace %s started as machine %s via control plane", workspaceID, machineID)
|
||||
// Token issuance is deliberately deferred to the workspace's first
|
||||
// /registry/register call. Minting here without also delivering the
|
||||
// plaintext to the workspace (via user-data or a follow-up callback)
|
||||
// would leave a live token in DB that the workspace has no copy of —
|
||||
// RegistryHandler.requireWorkspaceToken would then 401 every
|
||||
// /registry/register attempt because the workspace is no longer in the
|
||||
// "no live tokens → bootstrap-allowed" state. The register handler
|
||||
// already mints a token on first successful register and returns it in
|
||||
// the response body for the workspace to persist.
|
||||
}
|
||||
|
||||
181
workspace-server/internal/handlers/workspace_provision_shared.go
Normal file
181
workspace-server/internal/handlers/workspace_provision_shared.go
Normal file
@ -0,0 +1,181 @@
|
||||
package handlers
|
||||
|
||||
// workspace_provision_shared.go — mode-agnostic provision-time
|
||||
// orchestration that BOTH provisionWorkspaceOpts (Docker) and
|
||||
// provisionWorkspaceCP (SaaS) call. Extracted because the original
|
||||
// per-mode functions had drifted: the SaaS path forgot to call
|
||||
// issueAndInjectInboundSecret, which produced silent-503 chat upload
|
||||
// errors for every prod workspace (RFC #2312 — discovered 2026-04-30).
|
||||
//
|
||||
// The drift class this module closes: when a new provision-time setup
|
||||
// step is added, it goes in ONE place and both modes pick it up. New
|
||||
// steps that legitimately differ per mode stay in the per-mode
|
||||
// functions and are explicitly documented there.
|
||||
//
|
||||
// What's shared (this file):
|
||||
// - Loading secrets (global + workspace) from Postgres
|
||||
// - Applying git identity, runtime model env, role propagation
|
||||
// - Running env mutators
|
||||
// - Preflight: missing required env
|
||||
// - Building provisioner.WorkspaceConfig
|
||||
// - Minting auth_token + platform_inbound_secret (#2312)
|
||||
//
|
||||
// What's mode-specific (kept in the per-mode functions):
|
||||
// - Docker: empty-config-volume preflight + auto-recover via
|
||||
// runtime template (#1858), then provisioner.Start with local
|
||||
// Docker daemon, then WriteFilesToContainer for config volume.
|
||||
// - SaaS: cpProv.Start which delegates EC2 launch to control plane,
|
||||
// then persist returned instance_id, then defer .auth_token /
|
||||
// .platform_inbound_secret delivery to the workspace's first
|
||||
// /registry/register response (registry.go:344-362).
|
||||
//
|
||||
// Architectural test (workspace_provision_shared_test.go) asserts
|
||||
// every code path that takes a workspaceID and starts a provision
|
||||
// MUST call mintWorkspaceSecrets — same shape as the
|
||||
// audit-coverage AST gate from #335.
|
||||
|
||||
import (
|
||||
"context"
|
||||
"log"
|
||||
"path/filepath"
|
||||
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/models"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/provisioner"
|
||||
)
|
||||
|
||||
// preparedProvisionContext carries the computed env + cfg through
// the per-mode provisioner-Start step. Returned from
// prepareProvisionContext when the caller proceeds; nil + non-empty
// abort message when the caller must mark the workspace failed.
type preparedProvisionContext struct {
	// EnvVars is the merged env map: global secrets overlaid by
	// workspace-specific secrets, then git-identity / runtime-model /
	// role injection and the env-mutator chain.
	EnvVars map[string]string
	// PluginsPath is the absolute plugins directory derived from
	// h.configsDir in prepareProvisionContext.
	PluginsPath string
	// AwarenessNamespace is the value loaded via loadAwarenessNamespace
	// for this workspace.
	AwarenessNamespace string
	// Config is the fully built provisioner config (ResetClaudeSession
	// already applied), ready for the mode-specific Start() call.
	Config provisioner.WorkspaceConfig
}
|
||||
|
||||
// provisionAbort describes why prepareProvisionContext refused to
// produce a context.
//
// Msg is the caller-safe summary that gets persisted to
// workspaces.last_sample_error AND broadcast in the
// WORKSPACE_PROVISION_FAILED payload (F1086 / #1206: it must not carry
// internal detail such as secret key names or decrypt-error text).
// Extra is the structured payload to surface alongside Msg (e.g. the
// missing-env list for the missing-env failure class). Both fields are
// zero-valued on the success return.
type provisionAbort struct {
	Msg string
	Extra map[string]interface{}
}
|
||||
|
||||
// prepareProvisionContext does the mode-agnostic setup work that
|
||||
// both Docker and SaaS provisioners need before their respective
|
||||
// Start() call. Returns the prepared cfg or an abort message that
|
||||
// the caller MUST broadcast as WORKSPACE_PROVISION_FAILED and
|
||||
// persist as workspaces.last_sample_error.
|
||||
//
|
||||
// The function does NOT broadcast / DB-write the failure itself —
|
||||
// failure surface is mode-aware (Docker logs differ from SaaS logs
|
||||
// in observed format, and the failure-class taxonomy is shared
|
||||
// between modes but the broadcast key may evolve per mode in the
|
||||
// future).
|
||||
func (h *WorkspaceHandler) prepareProvisionContext(
|
||||
ctx context.Context,
|
||||
workspaceID, templatePath string,
|
||||
configFiles map[string][]byte,
|
||||
payload models.CreateWorkspacePayload,
|
||||
resetClaudeSession bool,
|
||||
) (*preparedProvisionContext, *provisionAbort) {
|
||||
envVars, decryptErr := loadWorkspaceSecrets(ctx, workspaceID)
|
||||
if decryptErr != "" {
|
||||
return nil, &provisionAbort{Msg: decryptErr}
|
||||
}
|
||||
|
||||
pluginsPath, _ := filepath.Abs(filepath.Join(h.configsDir, "..", "plugins"))
|
||||
awarenessNamespace := h.loadAwarenessNamespace(ctx, workspaceID)
|
||||
|
||||
// Per-agent git identity (#1957) — must run after secret loads so
|
||||
// a workspace_secret named GIT_AUTHOR_NAME can override.
|
||||
applyAgentGitIdentity(envVars, payload.Name)
|
||||
applyRuntimeModelEnv(envVars, payload.Runtime, payload.Model)
|
||||
if payload.Role != "" {
|
||||
envVars["MOLECULE_AGENT_ROLE"] = payload.Role
|
||||
}
|
||||
|
||||
// Plugin extension point: env mutators run AFTER built-in identity
|
||||
// injection so plugins can override or augment identity vars.
|
||||
if err := h.envMutators.Run(ctx, workspaceID, envVars); err != nil {
|
||||
return nil, &provisionAbort{Msg: "plugin env mutator chain failed"}
|
||||
}
|
||||
|
||||
// Preflight #5: refuse to launch when config.yaml declares required
|
||||
// env vars that are not set. Skipped in SaaS mode when configFiles
|
||||
// is nil (CP-mode's cfg is built without local config bytes — the
|
||||
// CP-side launches the workspace with envVars but doesn't inspect
|
||||
// the config.yaml the same way Docker does). Future: lift the
|
||||
// preflight to a pure-data check that doesn't need configFiles
|
||||
// bytes, so SaaS mode can run it too.
|
||||
if configFiles != nil {
|
||||
if missing := missingRequiredEnv(configFiles, envVars); len(missing) > 0 {
|
||||
msg := formatMissingEnvError(missing)
|
||||
return nil, &provisionAbort{
|
||||
Msg: msg,
|
||||
Extra: map[string]interface{}{"error": msg, "missing": missing},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
cfg := h.buildProvisionerConfig(workspaceID, templatePath, configFiles, payload, envVars, pluginsPath, awarenessNamespace)
|
||||
cfg.ResetClaudeSession = resetClaudeSession
|
||||
|
||||
return &preparedProvisionContext{
|
||||
EnvVars: envVars,
|
||||
PluginsPath: pluginsPath,
|
||||
AwarenessNamespace: awarenessNamespace,
|
||||
Config: cfg,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// mintWorkspaceSecrets issues + persists the workspace auth token
|
||||
// AND the platform→workspace inbound secret (#2312). Both modes MUST
|
||||
// call this — Docker mints + writes to local config volume; SaaS
|
||||
// mints and persists only to the DB column (the workspace fetches
|
||||
// via /registry/register response).
|
||||
//
|
||||
// Pre-2026-04-30: the SaaS path (provisionWorkspaceCP) was missing
|
||||
// the inbound-secret mint, leaving every prod workspace with a NULL
|
||||
// platform_inbound_secret column and 503-ing every chat upload. This
|
||||
// extracted helper is the structural fix — both paths share one
|
||||
// function so adding a new mint can't be silently forgotten on one
|
||||
// side.
|
||||
//
|
||||
// Failure model: best-effort. Each underlying issue function logs +
|
||||
// returns silently on its own failure (token mint failure → workspace
|
||||
// 401s on first /internal call; secret mint failure → first chat
|
||||
// upload 503s with the same message we used to surface for old-
|
||||
// workspaces). The shared helper does NOT abort the provision because
|
||||
// the workspace can still come up and serve non-internal traffic; the
|
||||
// 401/503 surfaces the missing secret loudly when the user actually
|
||||
// tries to use the affected feature.
|
||||
func (h *WorkspaceHandler) mintWorkspaceSecrets(ctx context.Context, workspaceID string, cfg *provisioner.WorkspaceConfig) {
|
||||
h.issueAndInjectToken(ctx, workspaceID, cfg)
|
||||
h.issueAndInjectInboundSecret(ctx, workspaceID, cfg)
|
||||
}
|
||||
|
||||
// markProvisionFailed is the standard "abort with message" path used
|
||||
// by both provision modes. Wraps the broadcast + DB update in one
|
||||
// call so the failure shape stays consistent across modes.
|
||||
func (h *WorkspaceHandler) markProvisionFailed(ctx context.Context, workspaceID, msg string, extra map[string]interface{}) {
|
||||
if extra == nil {
|
||||
extra = map[string]interface{}{"error": msg}
|
||||
} else if _, hasErr := extra["error"]; !hasErr {
|
||||
extra["error"] = msg
|
||||
}
|
||||
h.broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_PROVISION_FAILED", workspaceID, extra)
|
||||
if _, dbErr := db.DB.ExecContext(ctx,
|
||||
`UPDATE workspaces SET status = 'failed', last_sample_error = $2, updated_at = now() WHERE id = $1`,
|
||||
workspaceID, msg); dbErr != nil {
|
||||
// Non-fatal: the broadcast already fired, the operator sees the
|
||||
// failure event in the canvas. The DB row stays at whatever
|
||||
// status it had — provisioning event log is the source of truth.
|
||||
log.Printf("markProvisionFailed: db update failed for %s: %v", workspaceID, dbErr)
|
||||
}
|
||||
}
|
||||
@ -0,0 +1,187 @@
|
||||
package handlers
|
||||
|
||||
// workspace_provision_shared_test.go — architectural test that pins
|
||||
// the invariant the shared-prepare refactor relies on: every code
|
||||
// path that provisions a workspace MUST call mintWorkspaceSecrets.
|
||||
//
|
||||
// Closes the drift class that produced the 2026-04-30 RFC #2312
|
||||
// silent-503 bug. Pre-fix: provisionWorkspaceCP forgot to mint
|
||||
// platform_inbound_secret because the SaaS path was implemented
|
||||
// after the Docker path and the original mint call wasn't carried
|
||||
// forward. Both modes now share mintWorkspaceSecrets via this
|
||||
// extracted helper; this test ensures it stays that way.
|
||||
//
|
||||
// Same shape as the audit-coverage gate from #335 (#2343 PR-5).
|
||||
// If this test fails: either add mintWorkspaceSecrets to the new
|
||||
// provision function, OR (if the function legitimately should NOT
|
||||
// mint) add it to provisionExemptFunctions with a one-line
|
||||
// justification.
|
||||
|
||||
import (
|
||||
"context"
|
||||
"go/ast"
|
||||
"go/parser"
|
||||
"go/token"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/DATA-DOG/go-sqlmock"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/provisioner"
|
||||
)
|
||||
|
||||
// provisionFunctions are the set of functions that provision a
// workspace and therefore MUST call mintWorkspaceSecrets. Detected
// by name match (handler-method pattern); add a new entry whenever
// a new provision path is introduced. The gate in
// TestProvisionFunctions_AllCallMintWorkspaceSecrets only inspects
// functions listed here.
var provisionFunctions = map[string]bool{
	"provisionWorkspace": true,
	"provisionWorkspaceOpts": true,
	"provisionWorkspaceCP": true,
}

// provisionExemptFunctions are functions whose name pattern matches
// the provision-function set but legitimately don't call
// mintWorkspaceSecrets (e.g., the wrapper provisionWorkspace which
// delegates to provisionWorkspaceOpts — the actual mint happens in
// the delegate, not the wrapper). The value is a human-readable
// justification; it is never printed, it exists so the exemption is
// self-documenting at the declaration site.
var provisionExemptFunctions = map[string]string{
	"provisionWorkspace": "thin wrapper that delegates to provisionWorkspaceOpts; the delegate mints",
}
|
||||
|
||||
// TestProvisionFunctions_AllCallMintWorkspaceSecrets asserts every
|
||||
// non-exempt provision function in this package calls
|
||||
// mintWorkspaceSecrets at least once in its body.
|
||||
//
|
||||
// The check is presence-in-function, not pairing-by-line — same
|
||||
// rationale as the audit-coverage gate from #335.
|
||||
func TestProvisionFunctions_AllCallMintWorkspaceSecrets(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
fset := token.NewFileSet()
|
||||
entries, err := os.ReadDir(".")
|
||||
if err != nil {
|
||||
t.Fatalf("read dir: %v", err)
|
||||
}
|
||||
|
||||
type missing struct {
|
||||
file string
|
||||
line int
|
||||
fn string
|
||||
}
|
||||
var violations []missing
|
||||
|
||||
for _, entry := range entries {
|
||||
name := entry.Name()
|
||||
if entry.IsDir() || !strings.HasSuffix(name, ".go") || strings.HasSuffix(name, "_test.go") {
|
||||
continue
|
||||
}
|
||||
f, err := parser.ParseFile(fset, filepath.Join(".", name), nil, 0)
|
||||
if err != nil {
|
||||
t.Fatalf("parse %s: %v", name, err)
|
||||
}
|
||||
|
||||
for _, decl := range f.Decls {
|
||||
fn, ok := decl.(*ast.FuncDecl)
|
||||
if !ok || fn.Body == nil {
|
||||
continue
|
||||
}
|
||||
if !provisionFunctions[fn.Name.Name] {
|
||||
continue
|
||||
}
|
||||
if _, exempt := provisionExemptFunctions[fn.Name.Name]; exempt {
|
||||
continue
|
||||
}
|
||||
if !callsMintWorkspaceSecrets(fn.Body) {
|
||||
violations = append(violations, missing{
|
||||
file: name,
|
||||
line: fset.Position(fn.Pos()).Line,
|
||||
fn: fn.Name.Name,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for _, v := range violations {
|
||||
t.Errorf(
|
||||
"%s:%d %s does not call mintWorkspaceSecrets — every provision path MUST mint auth_token + platform_inbound_secret. If audit is genuinely impossible here, add %q to provisionExemptFunctions in workspace_provision_shared_test.go with a justification.",
|
||||
v.file, v.line, v.fn, v.fn,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
// callsMintWorkspaceSecrets walks the function body and reports
|
||||
// whether mintWorkspaceSecrets is called anywhere — direct call OR
|
||||
// via a helper. Recursion to helpers is shallow: we only check
|
||||
// immediate calls in this function's body. The shared-prepare
|
||||
// refactor centralizes mint in mintWorkspaceSecrets itself, so a
|
||||
// direct call at the top-level is the expected pattern.
|
||||
func callsMintWorkspaceSecrets(body *ast.BlockStmt) bool {
|
||||
found := false
|
||||
ast.Inspect(body, func(n ast.Node) bool {
|
||||
if found {
|
||||
return false
|
||||
}
|
||||
call, ok := n.(*ast.CallExpr)
|
||||
if !ok {
|
||||
return true
|
||||
}
|
||||
sel, ok := call.Fun.(*ast.SelectorExpr)
|
||||
if !ok {
|
||||
return true
|
||||
}
|
||||
if sel.Sel.Name == "mintWorkspaceSecrets" {
|
||||
found = true
|
||||
return false
|
||||
}
|
||||
return true
|
||||
})
|
||||
return found
|
||||
}
|
||||
|
||||
// TestMintWorkspaceSecrets_PersistsInboundSecretInSaaSMode is the
|
||||
// behavioral counterpart to the AST gate. It pins the structural
|
||||
// fix for the 2026-04-30 silent-503 chat upload bug (RFC #2312):
|
||||
// even in SaaS mode (where Docker file injection is skipped),
|
||||
// mintWorkspaceSecrets MUST persist platform_inbound_secret to the
|
||||
// workspaces row so platform-side handlers can read it back.
|
||||
//
|
||||
// Pre-fix: provisionWorkspaceCP never called the mint helper, so
|
||||
// every prod workspace had NULL platform_inbound_secret →
|
||||
// chat_files Upload returned 503 with "workspace not yet enrolled
|
||||
// in v2 upload" on every attempt.
|
||||
func TestMintWorkspaceSecrets_PersistsInboundSecretInSaaSMode(t *testing.T) {
|
||||
t.Setenv("MOLECULE_DEPLOY_MODE", "saas")
|
||||
mock := setupTestDB(t)
|
||||
|
||||
// First underlying call: revoke any existing live tokens. SaaS
|
||||
// mode early-returns from issueAndInjectToken right after this,
|
||||
// so IssueToken is NOT expected.
|
||||
mock.ExpectExec(`UPDATE workspace_auth_tokens SET revoked_at = now\(\) WHERE workspace_id = \$1 AND revoked_at IS NULL`).
|
||||
WithArgs("ws-saas-mint").
|
||||
WillReturnResult(sqlmock.NewResult(0, 0))
|
||||
|
||||
// Second underlying call: persist the platform_inbound_secret.
|
||||
// The structural fix — without this UPDATE, the bug recurs.
|
||||
mock.ExpectExec(`UPDATE workspaces SET platform_inbound_secret = \$1 WHERE id = \$2`).
|
||||
WithArgs(sqlmock.AnyArg(), "ws-saas-mint").
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
handler := NewWorkspaceHandler(&captureBroadcaster{}, nil, "http://localhost:8080", t.TempDir())
|
||||
cfg := provisioner.WorkspaceConfig{WorkspaceID: "ws-saas-mint"}
|
||||
handler.mintWorkspaceSecrets(context.Background(), "ws-saas-mint", &cfg)
|
||||
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Errorf("sqlmock expectations not met — mintWorkspaceSecrets did not persist platform_inbound_secret in SaaS mode (this is the prod bug recurrence): %v", err)
|
||||
}
|
||||
|
||||
// Sanity: SaaS mode must NOT have written .platform_inbound_secret
|
||||
// into cfg.ConfigFiles — there's no Docker volume to deliver it.
|
||||
if _, present := cfg.ConfigFiles[".platform_inbound_secret"]; present {
|
||||
t.Errorf("SaaS mode should not inject .platform_inbound_secret into cfg.ConfigFiles (no Docker volume) — got entry")
|
||||
}
|
||||
if _, present := cfg.ConfigFiles[".auth_token"]; present {
|
||||
t.Errorf("SaaS mode should not inject .auth_token into cfg.ConfigFiles (no Docker volume) — got entry")
|
||||
}
|
||||
}
|
||||
@ -1137,10 +1137,14 @@ func TestProvisionWorkspace_NoInternalErrorsInBroadcast(t *testing.T) {
|
||||
WillReturnRows(sqlmock.NewRows([]string{"key", "encrypted_value", "encryption_version"}).
|
||||
AddRow("FAKE_KEY", []byte("any-bytes"), 99))
|
||||
// On decrypt failure provisionWorkspace also marks the workspace as
|
||||
// failed via UPDATE workspaces. Match-anything on the args so the
|
||||
// test isn't coupled to the exact UPDATE column order.
|
||||
// failed via UPDATE workspaces. Two args: workspace_id + the
|
||||
// last_sample_error message ("failed to decrypt global secret").
|
||||
// Pre-refactor (workspace_provision_shared.go) the decrypt-fail
|
||||
// path skipped last_sample_error; the shared helper now always
|
||||
// persists it so users see the failure in the UI without having
|
||||
// to grep server logs.
|
||||
mock.ExpectExec(`UPDATE workspaces SET status = 'failed'`).
|
||||
WithArgs(sqlmock.AnyArg()).
|
||||
WithArgs(sqlmock.AnyArg(), sqlmock.AnyArg()).
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
cap := &captureBroadcaster{}
|
||||
|
||||
Loading…
Reference in New Issue
Block a user