fix(platform): address code review — saasMode fallthrough, revoke in SaaS, warn-once on typo

Three Critical issues from the independent review pass:

1. saasMode() typo fallthrough. MOLECULE_DEPLOY_MODE=prod (typo) used
   to fall through to the MOLECULE_ORG_ID legacy signal, which is set
   in every tenant. A self-hosted deployment that happened to have
   MOLECULE_ORG_ID set would silently flip into SaaS mode with the
   relaxed SSRF posture. Now: non-empty MOLECULE_DEPLOY_MODE that
   doesn't match the recognised vocabulary falls closed (strict, non-
   SaaS) and logs a one-shot warning so operators notice the typo.

2. issueAndInjectToken early-return dropped RevokeAllForWorkspace.
   On re-provision in SaaS mode, the old workspace's live token
   stayed in the DB. The new workspace's first /registry/register
   then 401'd because requireWorkspaceToken saw live tokens and
   skipped the bootstrap-allowed path — and the new workspace had
   no plaintext to present. Swap the order so revoke runs first in
   both modes; only the IssueToken + ConfigFiles write is SaaS-skipped.

3. Extended TestSaasMode to cover the typo-fallthrough regression.
   Three new cases (prod / SaaS-mode / production) pin the fall-closed
   behaviour.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Hongming Wang 2026-04-21 03:15:47 -07:00
parent 1125a029b8
commit cf107337b6
3 changed files with 49 additions and 27 deletions

View File

@ -10,6 +10,7 @@ import (
"net/url" "net/url"
"os" "os"
"strings" "strings"
"sync"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/db" "github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/events" "github.com/Molecule-AI/molecule-monorepo/platform/internal/events"
@ -35,26 +36,38 @@ type blockedRange struct {
// trusted by construction. // trusted by construction.
// //
// Resolution order: // Resolution order:
// 1. MOLECULE_DEPLOY_MODE — explicit operator-set flag ("saas" | "self-hosted"). // 1. MOLECULE_DEPLOY_MODE set — explicit operator flag is authoritative.
// Prefer this for new deployments: the signal is unambiguous and can't // Recognised values: "saas" → true. "self-hosted" / "selfhosted" /
// collide with some future non-SaaS path that legitimately needs // "standalone" → false. Any other non-empty value logs a warning and
// MOLECULE_ORG_ID. // falls closed (false) so a typo like MOLECULE_DEPLOY_MODE=prod can't
// 2. MOLECULE_ORG_ID presence — implicit legacy signal, kept so deployments // silently flip a self-hosted deployment into the relaxed SSRF posture.
// that haven't yet adopted MOLECULE_DEPLOY_MODE keep working. // 2. MOLECULE_DEPLOY_MODE unset — fall back to the MOLECULE_ORG_ID presence
// signal for deployments that predate the explicit flag.
// //
// Self-hosted / single-container deployments set neither and keep the strict // Self-hosted / single-container deployments set neither and keep the strict
// blocklist. // blocklist.
func saasMode() bool { func saasMode() bool {
mode := strings.ToLower(strings.TrimSpace(os.Getenv("MOLECULE_DEPLOY_MODE"))) raw := os.Getenv("MOLECULE_DEPLOY_MODE")
switch mode { trimmed := strings.TrimSpace(raw)
if trimmed != "" {
switch strings.ToLower(trimmed) {
case "saas": case "saas":
return true return true
case "self-hosted", "selfhosted", "standalone": case "self-hosted", "selfhosted", "standalone":
return false return false
default:
// Warn-once so operators notice the typo without spamming logs.
saasModeWarnUnknownOnce.Do(func() {
log.Printf("saasMode: MOLECULE_DEPLOY_MODE=%q not recognised; falling back to strict (non-SaaS) mode. Valid values: saas | self-hosted.", raw)
})
return false
}
} }
return strings.TrimSpace(os.Getenv("MOLECULE_ORG_ID")) != "" return strings.TrimSpace(os.Getenv("MOLECULE_ORG_ID")) != ""
} }
var saasModeWarnUnknownOnce sync.Once
type RegistryHandler struct { type RegistryHandler struct {
broadcaster *events.Broadcaster broadcaster *events.Broadcaster
} }

View File

@ -29,6 +29,13 @@ func TestSaasMode(t *testing.T) {
{"explicit standalone wins over legacy org id", "standalone", "some-org", false}, {"explicit standalone wins over legacy org id", "standalone", "some-org", false},
{"whitespace-only deploy mode falls through to legacy", " ", "some-org", true}, {"whitespace-only deploy mode falls through to legacy", " ", "some-org", true},
{"whitespace-only org id falls through to false", "", " ", false}, {"whitespace-only org id falls through to false", "", " ", false},
// Typo / unknown values: must fall closed (strict / self-hosted)
// instead of falling through to the MOLECULE_ORG_ID legacy signal.
// Any tenant deployment has MOLECULE_ORG_ID set, so a typo like
// MOLECULE_DEPLOY_MODE=prod used to silently flip into SaaS mode.
{"typo prod falls closed even with org id set", "prod", "some-org", false},
{"typo SaaS-mode falls closed even with org id set", "SaaS-mode", "some-org", false},
{"typo production falls closed", "production", "", false},
} }
for _, tc := range cases { for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) { t.Run(tc.name, func(t *testing.T) {

View File

@ -330,24 +330,26 @@ func (h *WorkspaceHandler) buildProvisionerConfig(
// provisioning continues — the workspace will get 401 on its first heartbeat // provisioning continues — the workspace will get 401 on its first heartbeat
// and can recover on the next restart. // and can recover on the next restart.
func (h *WorkspaceHandler) issueAndInjectToken(ctx context.Context, workspaceID string, cfg *provisioner.WorkspaceConfig) { func (h *WorkspaceHandler) issueAndInjectToken(ctx context.Context, workspaceID string, cfg *provisioner.WorkspaceConfig) {
// SaaS mode skip: the CP provisioner ships workspaces to a remote EC2 // Revoke any existing live tokens FIRST — this must run in both modes.
// via user-data and does not carry cfg.ConfigFiles across the wire, so // In SaaS mode the revoke is load-bearing on re-provision: without it,
// any token we mint here never reaches the workspace. Minting it anyway // the previous workspace instance's live token sits in the DB, and
// would leave a live token in the DB with no plaintext on disk — // RegistryHandler.requireWorkspaceToken on the fresh instance's first
// RegistryHandler.requireWorkspaceToken then 401s every /registry/register // /registry/register would reject it (live token exists → no
// attempt because the workspace has a live token on file (which trips // bootstrap allowance, but the new workspace has no plaintext because
// bootstrap-allowed) but no way to present it. Defer to the register // the CP provisioner doesn't carry cfg.ConfigFiles across user-data).
// handler's own bootstrap-issuance path, which mints a token only after // Revoking clears the gate so the register handler's bootstrap path
// a successful first register and returns the plaintext in the response // can mint a fresh token and return the plaintext in the response.
// for the runtime to persist locally. if err := wsauth.RevokeAllForWorkspace(ctx, db.DB, workspaceID); err != nil {
if saasMode() { log.Printf("Provisioner: failed to revoke existing tokens for %s: %v — skipping auth-token injection", workspaceID, err)
return return
} }
// Revoke any existing live tokens. If this fails we bail out rather than // SaaS mode skips the IssueToken + ConfigFiles write because both
// issuing a second live token whose plaintext we can't also deliver. // only make sense on the Docker provisioner's volume-mount delivery
if err := wsauth.RevokeAllForWorkspace(ctx, db.DB, workspaceID); err != nil { // path. The register handler mints a fresh token on first successful
log.Printf("Provisioner: failed to revoke existing tokens for %s: %v — skipping auth-token injection", workspaceID, err) // register and returns the plaintext in the response body for the
// runtime to persist locally.
if saasMode() {
return return
} }