fix(platform): address code review — saasMode fallthrough, revoke in SaaS, warn-once on typo

Three Critical issues from the independent review pass:

1. saasMode() typo fallthrough. MOLECULE_DEPLOY_MODE=prod (typo) used
   to fall through to the MOLECULE_ORG_ID legacy signal, which is set
   in every tenant. A self-hosted deployment that happened to have
   MOLECULE_ORG_ID set would silently flip into SaaS mode with the
   relaxed SSRF posture. Now: non-empty MOLECULE_DEPLOY_MODE that
   doesn't match the recognised vocabulary falls closed (strict, non-
   SaaS) and logs a one-shot warning so operators notice the typo.

2. issueAndInjectToken early-return dropped RevokeAllForWorkspace.
   On re-provision in SaaS mode, the old workspace's live token
   stayed in the DB. The new workspace's first /registry/register
   then 401'd because requireWorkspaceToken saw live tokens and
   skipped the bootstrap-allowed path — and the new workspace had
   no plaintext to present. Swap the order so revoke runs first in
   both modes; only the IssueToken + ConfigFiles write is SaaS-skipped.

3. Extended TestSaasMode to cover the typo-fallthrough regression.
   Three new cases (prod / SaaS-mode / production) pin the fall-closed
   behaviour.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Hongming Wang 2026-04-21 03:15:47 -07:00
parent 1125a029b8
commit cf107337b6
3 changed files with 49 additions and 27 deletions

View File

@ -10,6 +10,7 @@ import (
"net/url"
"os"
"strings"
"sync"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/events"
@ -35,26 +36,38 @@ type blockedRange struct {
// trusted by construction.
//
// Resolution order:
// 1. MOLECULE_DEPLOY_MODE — explicit operator-set flag ("saas" | "self-hosted").
// Prefer this for new deployments: the signal is unambiguous and can't
// collide with some future non-SaaS path that legitimately needs
// MOLECULE_ORG_ID.
// 2. MOLECULE_ORG_ID presence — implicit legacy signal, kept so deployments
// that haven't yet adopted MOLECULE_DEPLOY_MODE keep working.
// 1. MOLECULE_DEPLOY_MODE set — explicit operator flag is authoritative.
// Recognised values: "saas" → true. "self-hosted" / "selfhosted" /
// "standalone" → false. Any other non-empty value logs a warning and
// falls closed (false) so a typo like MOLECULE_DEPLOY_MODE=prod can't
// silently flip a self-hosted deployment into the relaxed SSRF posture.
// 2. MOLECULE_DEPLOY_MODE unset — fall back to the MOLECULE_ORG_ID presence
// signal for deployments that predate the explicit flag.
//
// Self-hosted / single-container deployments set neither and keep the strict
// blocklist.
func saasMode() bool {
mode := strings.ToLower(strings.TrimSpace(os.Getenv("MOLECULE_DEPLOY_MODE")))
switch mode {
raw := os.Getenv("MOLECULE_DEPLOY_MODE")
trimmed := strings.TrimSpace(raw)
if trimmed != "" {
switch strings.ToLower(trimmed) {
case "saas":
return true
case "self-hosted", "selfhosted", "standalone":
return false
default:
// Warn-once so operators notice the typo without spamming logs.
saasModeWarnUnknownOnce.Do(func() {
log.Printf("saasMode: MOLECULE_DEPLOY_MODE=%q not recognised; falling back to strict (non-SaaS) mode. Valid values: saas | self-hosted.", raw)
})
return false
}
}
return strings.TrimSpace(os.Getenv("MOLECULE_ORG_ID")) != ""
}
var saasModeWarnUnknownOnce sync.Once
type RegistryHandler struct {
broadcaster *events.Broadcaster
}

View File

@ -29,6 +29,13 @@ func TestSaasMode(t *testing.T) {
{"explicit standalone wins over legacy org id", "standalone", "some-org", false},
{"whitespace-only deploy mode falls through to legacy", " ", "some-org", true},
{"whitespace-only org id falls through to false", "", " ", false},
// Typo / unknown values: must fall closed (strict / self-hosted)
// instead of falling through to the MOLECULE_ORG_ID legacy signal.
// Any tenant deployment has MOLECULE_ORG_ID set, so a typo like
// MOLECULE_DEPLOY_MODE=prod used to silently flip into SaaS mode.
{"typo prod falls closed even with org id set", "prod", "some-org", false},
{"typo SaaS-mode falls closed even with org id set", "SaaS-mode", "some-org", false},
{"typo production falls closed", "production", "", false},
}
for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {

View File

@ -330,24 +330,26 @@ func (h *WorkspaceHandler) buildProvisionerConfig(
// provisioning continues — the workspace will get 401 on its first heartbeat
// and can recover on the next restart.
func (h *WorkspaceHandler) issueAndInjectToken(ctx context.Context, workspaceID string, cfg *provisioner.WorkspaceConfig) {
// SaaS mode skip: the CP provisioner ships workspaces to a remote EC2
// via user-data and does not carry cfg.ConfigFiles across the wire, so
// any token we mint here never reaches the workspace. Minting it anyway
// would leave a live token in the DB with no plaintext on disk —
// RegistryHandler.requireWorkspaceToken then 401s every /registry/register
// attempt because the workspace has a live token on file (which trips
// bootstrap-allowed) but no way to present it. Defer to the register
// handler's own bootstrap-issuance path, which mints a token only after
// a successful first register and returns the plaintext in the response
// for the runtime to persist locally.
if saasMode() {
// Revoke any existing live tokens FIRST — this must run in both modes.
// In SaaS mode the revoke is load-bearing on re-provision: without it,
// the previous workspace instance's live token sits in the DB, and
// RegistryHandler.requireWorkspaceToken on the fresh instance's first
// /registry/register would reject it (live token exists → no
// bootstrap allowance, but the new workspace has no plaintext because
// the CP provisioner doesn't carry cfg.ConfigFiles across user-data).
// Revoking clears the gate so the register handler's bootstrap path
// can mint a fresh token and return the plaintext in the response.
if err := wsauth.RevokeAllForWorkspace(ctx, db.DB, workspaceID); err != nil {
log.Printf("Provisioner: failed to revoke existing tokens for %s: %v — skipping auth-token injection", workspaceID, err)
return
}
// Revoke any existing live tokens. If this fails we bail out rather than
// issuing a second live token whose plaintext we can't also deliver.
if err := wsauth.RevokeAllForWorkspace(ctx, db.DB, workspaceID); err != nil {
log.Printf("Provisioner: failed to revoke existing tokens for %s: %v — skipping auth-token injection", workspaceID, err)
// SaaS mode skips the IssueToken + ConfigFiles write because both
// only make sense on the Docker provisioner's volume-mount delivery
// path. The register handler mints a fresh token on first successful
// register and returns the plaintext in the response body for the
// runtime to persist locally.
if saasMode() {
return
}