refactor: extract readOrLazyHealInboundSecret to dedup chat_files + registry

The lazy-heal-on-miss pattern landed in two places this session:
PR #2372 (chat_files.go::resolveWorkspaceForwardCreds — Upload + Download)
and PR #2375 (registry.go::Register). Both implementations did the same
thing:

  read → if ErrNoInboundSecret then mint inline → return outcome

Different response-shape requirements but the same core mechanic. Three
sites' worth of drift potential: any future heal-time condition we add
(audit log, alert, secret rotation, observability) had to be applied to
each site, with partial application silently re-opening the gap.

Fix: extract readOrLazyHealInboundSecret in workspace_provision_shared.go
returning (secret, healed, err). Each caller maps the outcome to its
response shape:

  - chat_files: healed=true → 503 with retry hint; err != nil → 503 with
    RFC-#2312 reprovision hint
  - registry:   healed=true|false + err==nil → include in response;
    err != nil → omit field (workspace can retry on next register)

Net effect:

  - Single source of truth for the read+heal mechanic
  - Response-shape decisions stay in callers (they DO differ per feature)
  - Future heal-time conditions go in one place
  - Behavior preserved: existing TestRegister_NoInboundSecret_LazyHeals,
    TestRegister_NoInboundSecret_LazyHealMintFailureOmitsField,
    TestChatUpload_NoInboundSecret_LazyHeal*,
    TestChatDownload_NoInboundSecret_LazyHeal* all pass unchanged

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Hongming Wang 2026-04-30 04:11:43 -07:00
parent eef1969e30
commit 30a569c742
3 changed files with 67 additions and 57 deletions

View File

@ -30,7 +30,6 @@ package handlers
import (
"context"
"errors"
"fmt"
"io"
"log"
@ -41,7 +40,6 @@ import (
"time"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/wsauth"
"github.com/gin-gonic/gin"
)
@ -120,41 +118,28 @@ func resolveWorkspaceForwardCreds(c *gin.Context, ctx context.Context, workspace
// upstream gate rather than re-validating here. Tracked at #2316
// for follow-up: forward-time re-validation as defense-in-depth.
secret, err := wsauth.ReadPlatformInboundSecret(ctx, db.DB, workspaceID)
secret, healed, err := readOrLazyHealInboundSecret(ctx, workspaceID, "chat_files "+op)
if err != nil {
if errors.Is(err, wsauth.ErrNoInboundSecret) {
// Lazy-heal: mint the secret now so future requests
// succeed. Pre-2026-04-30 the SaaS provision path didn't
// mint — every prod workspace started life with NULL
// platform_inbound_secret. The shared-mint refactor closes
// the new-workspace gap; this lazy-heal closes the
// existing-workspace gap without requiring a destructive
// reprovision.
//
// Why the request still 503s: the workspace's local
// /configs/.platform_inbound_secret is also empty until the
// next /registry/register response (registry.go:344-362)
// propagates the freshly-minted secret. The user retries
// once the workspace's heartbeat picks up the new secret
// (typically <30s).
if _, mintErr := wsauth.IssuePlatformInboundSecret(ctx, db.DB, workspaceID); mintErr != nil {
log.Printf("chat_files %s: lazy-heal mint failed for %s: %v", op, workspaceID, mintErr)
c.JSON(http.StatusServiceUnavailable, gin.H{
"error": "workspace not yet enrolled in v2 " + op + " (RFC #2312)",
"detail": "Failed to mint inbound secret. Reprovision the workspace if this persists.",
})
return "", "", false
}
log.Printf("chat_files %s: lazy-healed platform_inbound_secret for %s — retry once workspace re-registers (#2312 backfill)", op, workspaceID)
c.JSON(http.StatusServiceUnavailable, gin.H{
"error": "workspace re-registering — please retry in 30 seconds",
"detail": "Inbound secret was just minted. Workspace will pick it up on its next heartbeat.",
"retry_after_seconds": 30,
})
return "", "", false
}
log.Printf("chat_files %s: read platform_inbound_secret failed for %s: %v", op, workspaceID, err)
c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to read workspace secret"})
// Either a non-NoInboundSecret read error (DB hiccup) or a mint
// failure during lazy-heal. The chat_files contract is to surface
// 503 with the RFC-#2312 reprovision hint in both cases — the user
// can't proceed and needs ops attention.
c.JSON(http.StatusServiceUnavailable, gin.H{
"error": "workspace not yet enrolled in v2 " + op + " (RFC #2312)",
"detail": "Failed to mint inbound secret. Reprovision the workspace if this persists.",
})
return "", "", false
}
if healed {
// The platform now has the secret but the workspace's
// /configs/.platform_inbound_secret is still empty until the next
// /registry/register response propagates it. User retries after
// the workspace's next heartbeat picks up the new secret (~30s).
c.JSON(http.StatusServiceUnavailable, gin.H{
"error": "workspace re-registering — please retry in 30 seconds",
"detail": "Inbound secret was just minted. Workspace will pick it up on its next heartbeat.",
"retry_after_seconds": 30,
})
return "", "", false
}
return wsURL, secret, true

View File

@ -451,29 +451,19 @@ func (h *RegistryHandler) Register(c *gin.Context) {
// has no volume to write into.
//
// Lazy-heal (2026-04-30): if the column is NULL (legacy workspace
// provisioned before the shared-mint refactor), mint it inline and
// include in the response. Without this, legacy workspaces would need
// two round-trips before chat upload works — chat_files lazy-heals
// platform-side on first attempt, then the workspace must heartbeat
// to receive the freshly-minted secret. Heal-on-register collapses
// that to one round-trip.
secret, secretErr := wsauth.ReadPlatformInboundSecret(ctx, db.DB, payload.ID)
if secretErr != nil && errors.Is(secretErr, wsauth.ErrNoInboundSecret) {
minted, mintErr := wsauth.IssuePlatformInboundSecret(ctx, db.DB, payload.ID)
if mintErr != nil {
log.Printf("Registry: lazy-heal mint of platform_inbound_secret failed for %s: %v — workspace will 503 on chat upload until next register", payload.ID, mintErr)
} else {
secret = minted
secretErr = nil
log.Printf("Registry: lazy-healed platform_inbound_secret for %s (#2312 backfill)", payload.ID)
}
}
if secretErr == nil {
// provisioned before the shared-mint refactor), mint inline and
// include in the response. Without this, legacy workspaces would
// need two round-trips before chat upload works — chat_files
// lazy-heals platform-side on first attempt, then the workspace
// must heartbeat to receive the freshly-minted secret.
// Heal-on-register collapses that to one round-trip.
if secret, _, healErr := readOrLazyHealInboundSecret(ctx, payload.ID, "Registry"); healErr == nil {
response["platform_inbound_secret"] = secret
} else if !errors.Is(secretErr, wsauth.ErrNoInboundSecret) {
// Non-NoInboundSecret read errors (DB hiccup, etc.) — log loud.
log.Printf("Registry: read platform_inbound_secret for %s failed: %v", payload.ID, secretErr)
}
// Errors are non-fatal here — the workspace is online and can serve
// non-/internal traffic. The lazy-heal helper has already logged
// whichever sub-step failed (read or mint). If the secret never lands,
// chat upload surfaces the issue loudly with the RFC-#2312 hint.
c.JSON(http.StatusOK, response)
}

View File

@ -36,14 +36,49 @@ package handlers
import (
"context"
"errors"
"log"
"path/filepath"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/models"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/provisioner"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/wsauth"
)
// readOrLazyHealInboundSecret reads the workspace's
// platform_inbound_secret. On ErrNoInboundSecret it mints inline
// (lazy-heal — RFC #2312 backfill for workspaces provisioned before
// the shared-mint refactor closed the gap).
//
// Returns:
// - (secret, false, nil) — secret was already present
// - (secret, true, nil) — secret was missing; just minted
// - ("", false, err) — read failed (non-NoInboundSecret) OR mint failed
//
// opLabel prefixes log lines so operators can tell which feature
// triggered the heal (e.g. "Registry", "chat_files Upload"). Centralized
// here so the next "what to do when the secret is missing" decision —
// rotation, audit, alerting — goes in ONE place. Same drift-prevention
// rationale as resolveWorkspaceForwardCreds and mintWorkspaceSecrets.
func readOrLazyHealInboundSecret(ctx context.Context, workspaceID, opLabel string) (secret string, healed bool, err error) {
s, readErr := wsauth.ReadPlatformInboundSecret(ctx, db.DB, workspaceID)
if readErr == nil {
return s, false, nil
}
if !errors.Is(readErr, wsauth.ErrNoInboundSecret) {
log.Printf("%s: read platform_inbound_secret failed for %s: %v", opLabel, workspaceID, readErr)
return "", false, readErr
}
minted, mintErr := wsauth.IssuePlatformInboundSecret(ctx, db.DB, workspaceID)
if mintErr != nil {
log.Printf("%s: lazy-heal mint of platform_inbound_secret failed for %s: %v", opLabel, workspaceID, mintErr)
return "", false, mintErr
}
log.Printf("%s: lazy-healed platform_inbound_secret for %s (#2312 backfill)", opLabel, workspaceID)
return minted, true, nil
}
// preparedProvisionContext carries the computed env + cfg through
// the per-mode provisioner-Start step. Returned from
// prepareProvisionContext when the caller proceeds; nil + non-empty