diff --git a/workspace-server/internal/handlers/chat_files.go b/workspace-server/internal/handlers/chat_files.go index f8d348e7..e152809e 100644 --- a/workspace-server/internal/handlers/chat_files.go +++ b/workspace-server/internal/handlers/chat_files.go @@ -30,7 +30,6 @@ package handlers import ( "context" - "errors" "fmt" "io" "log" @@ -41,7 +40,6 @@ import ( "time" "github.com/Molecule-AI/molecule-monorepo/platform/internal/db" - "github.com/Molecule-AI/molecule-monorepo/platform/internal/wsauth" "github.com/gin-gonic/gin" ) @@ -120,41 +118,28 @@ func resolveWorkspaceForwardCreds(c *gin.Context, ctx context.Context, workspace // upstream gate rather than re-validating here. Tracked at #2316 // for follow-up: forward-time re-validation as defense-in-depth. - secret, err := wsauth.ReadPlatformInboundSecret(ctx, db.DB, workspaceID) + secret, healed, err := readOrLazyHealInboundSecret(ctx, workspaceID, "chat_files "+op) if err != nil { - if errors.Is(err, wsauth.ErrNoInboundSecret) { - // Lazy-heal: mint the secret now so future requests - // succeed. Pre-2026-04-30 the SaaS provision path didn't - // mint — every prod workspace started life with NULL - // platform_inbound_secret. The shared-mint refactor closes - // the new-workspace gap; this lazy-heal closes the - // existing-workspace gap without requiring a destructive - // reprovision. - // - // Why the request still 503s: the workspace's local - // /configs/.platform_inbound_secret is also empty until the - // next /registry/register response (registry.go:344-362) - // propagates the freshly-minted secret. The user retries - // once the workspace's heartbeat picks up the new secret - // (typically <30s). - if _, mintErr := wsauth.IssuePlatformInboundSecret(ctx, db.DB, workspaceID); mintErr != nil { - log.Printf("chat_files %s: lazy-heal mint failed for %s: %v", op, workspaceID, mintErr) - c.JSON(http.StatusServiceUnavailable, gin.H{ - "error": "workspace not yet enrolled in v2 " + op + " (RFC #2312)", - "detail": "Failed to mint inbound secret. Reprovision the workspace if this persists.", - }) - return "", "", false - } - log.Printf("chat_files %s: lazy-healed platform_inbound_secret for %s — retry once workspace re-registers (#2312 backfill)", op, workspaceID) - c.JSON(http.StatusServiceUnavailable, gin.H{ - "error": "workspace re-registering — please retry in 30 seconds", - "detail": "Inbound secret was just minted. Workspace will pick it up on its next heartbeat.", - "retry_after_seconds": 30, - }) - return "", "", false - } - log.Printf("chat_files %s: read platform_inbound_secret failed for %s: %v", op, workspaceID, err) - c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to read workspace secret"}) + // Either a non-NoInboundSecret read error (DB hiccup) or a mint + // failure during lazy-heal. The chat_files contract is to surface + // 503 with the RFC-#2312 reprovision hint in both cases — the user + // can't proceed and needs ops attention. + c.JSON(http.StatusServiceUnavailable, gin.H{ + "error": "workspace not yet enrolled in v2 " + op + " (RFC #2312)", + "detail": "Failed to mint inbound secret. Reprovision the workspace if this persists.", + }) + return "", "", false + } + if healed { + // The platform now has the secret but the workspace's + // /configs/.platform_inbound_secret is still empty until the next + // /registry/register response propagates it. User retries after + // the workspace's next heartbeat picks up the new secret (~30s). + c.JSON(http.StatusServiceUnavailable, gin.H{ + "error": "workspace re-registering — please retry in 30 seconds", + "detail": "Inbound secret was just minted. Workspace will pick it up on its next heartbeat.", + "retry_after_seconds": 30, + }) return "", "", false } return wsURL, secret, true diff --git a/workspace-server/internal/handlers/registry.go b/workspace-server/internal/handlers/registry.go index 6fd5ae10..cee868eb 100644 --- a/workspace-server/internal/handlers/registry.go +++ b/workspace-server/internal/handlers/registry.go @@ -451,29 +451,19 @@ func (h *RegistryHandler) Register(c *gin.Context) { // has no volume to write into. // // Lazy-heal (2026-04-30): if the column is NULL (legacy workspace - // provisioned before the shared-mint refactor), mint it inline and - // include in the response. Without this, legacy workspaces would need - // two round-trips before chat upload works — chat_files lazy-heals - // platform-side on first attempt, then the workspace must heartbeat - // to receive the freshly-minted secret. Heal-on-register collapses - // that to one round-trip. - secret, secretErr := wsauth.ReadPlatformInboundSecret(ctx, db.DB, payload.ID) - if secretErr != nil && errors.Is(secretErr, wsauth.ErrNoInboundSecret) { - minted, mintErr := wsauth.IssuePlatformInboundSecret(ctx, db.DB, payload.ID) - if mintErr != nil { - log.Printf("Registry: lazy-heal mint of platform_inbound_secret failed for %s: %v — workspace will 503 on chat upload until next register", payload.ID, mintErr) - } else { - secret = minted - secretErr = nil - log.Printf("Registry: lazy-healed platform_inbound_secret for %s (#2312 backfill)", payload.ID) - } - } - if secretErr == nil { + // provisioned before the shared-mint refactor), mint inline and + // include in the response. Without this, legacy workspaces would + // need two round-trips before chat upload works — chat_files + // lazy-heals platform-side on first attempt, then the workspace + // must heartbeat to receive the freshly-minted secret. + // Heal-on-register collapses that to one round-trip. + if secret, _, healErr := readOrLazyHealInboundSecret(ctx, payload.ID, "Registry"); healErr == nil { response["platform_inbound_secret"] = secret - } else if !errors.Is(secretErr, wsauth.ErrNoInboundSecret) { - // Non-NoInboundSecret read errors (DB hiccup, etc.) — log loud. - log.Printf("Registry: read platform_inbound_secret for %s failed: %v", payload.ID, secretErr) } + // Errors are non-fatal here — the workspace is online and can serve + // non-/internal traffic. The lazy-heal helper has already logged + // whichever sub-step failed (read or mint). If the secret never lands, + // chat upload surfaces the issue loudly with the RFC-#2312 hint. c.JSON(http.StatusOK, response) } diff --git a/workspace-server/internal/handlers/workspace_provision_shared.go b/workspace-server/internal/handlers/workspace_provision_shared.go index d0457121..72942178 100644 --- a/workspace-server/internal/handlers/workspace_provision_shared.go +++ b/workspace-server/internal/handlers/workspace_provision_shared.go @@ -36,14 +36,49 @@ package handlers import ( "context" + "errors" "log" "path/filepath" "github.com/Molecule-AI/molecule-monorepo/platform/internal/db" "github.com/Molecule-AI/molecule-monorepo/platform/internal/models" "github.com/Molecule-AI/molecule-monorepo/platform/internal/provisioner" + "github.com/Molecule-AI/molecule-monorepo/platform/internal/wsauth" ) +// readOrLazyHealInboundSecret reads the workspace's +// platform_inbound_secret. On ErrNoInboundSecret it mints inline +// (lazy-heal — RFC #2312 backfill for workspaces provisioned before +// the shared-mint refactor closed the gap). +// +// Returns: +// - (secret, false, nil) — secret was already present +// - (secret, true, nil) — secret was missing; just minted +// - ("", false, err) — read failed (non-NoInboundSecret) OR mint failed +// +// opLabel prefixes log lines so operators can tell which feature +// triggered the heal (e.g. "Registry", "chat_files Upload"). Centralized +// here so the next "what to do when the secret is missing" decision — +// rotation, audit, alerting — goes in ONE place. Same drift-prevention +// rationale as resolveWorkspaceForwardCreds and mintWorkspaceSecrets. +func readOrLazyHealInboundSecret(ctx context.Context, workspaceID, opLabel string) (secret string, healed bool, err error) { + s, readErr := wsauth.ReadPlatformInboundSecret(ctx, db.DB, workspaceID) + if readErr == nil { + return s, false, nil + } + if !errors.Is(readErr, wsauth.ErrNoInboundSecret) { + log.Printf("%s: read platform_inbound_secret failed for %s: %v", opLabel, workspaceID, readErr) + return "", false, readErr + } + minted, mintErr := wsauth.IssuePlatformInboundSecret(ctx, db.DB, workspaceID) + if mintErr != nil { + log.Printf("%s: lazy-heal mint of platform_inbound_secret failed for %s: %v", opLabel, workspaceID, mintErr) + return "", false, mintErr + } + log.Printf("%s: lazy-healed platform_inbound_secret for %s (#2312 backfill)", opLabel, workspaceID) + return minted, true, nil +} + // preparedProvisionContext carries the computed env + cfg through // the per-mode provisioner-Start step. Returned from // prepareProvisionContext when the caller proceeds; nil + non-empty