fix(rfc523): scope forbidden-env check to global_secrets (allow user-set workspace_secrets)

Reviews APPROVED (core-qa,core-devops). CI/all-required green. Non-required E2E + security-review checks noted in PR body; not gating per BP. Co-authored-by: core-be <core-be@agents.moleculesai.app> Co-committed-by: core-be <core-be@agents.moleculesai.app>
2026-05-20 23:40:54 +00:00
parent 5455ddefe2
commit ee9dc5b9c5
4 changed files with 212 additions and 23 deletions
@@ -869,14 +869,31 @@ func applyRuntimeModelEnv(envVars map[string]string, runtime, model string) {
 // Returns nil map + error string on decrypt failure. Shared by both Docker
 // and control plane provisioning paths to avoid duplication.
 //
+// The second return value (globalKeys) records which keys originated from
+// the operator-controlled `global_secrets` table — used by RFC#523 Layer 1
+// to constrain its forbidden-key check to the operator-bleed channel,
+// instead of blanket-blocking by name across BOTH provenance channels (the
+// over-fire that breaks the legitimate user flow of pasting their own
+// GitHub PAT into the canvas Secrets tab → workspace_secrets row). See
+// `feedback_upstream_docs_first_before_hypothesizing`: RFC#523's threat
+// model (issue molecule-ai/internal#523 §"Threat model") names operator-
+// scope tokens being injected via provision-time env / operator-side
+// stores — NOT the user's own scoped PAT they explicitly authorized via
+// the per-workspace Secrets tab.
+//
+// The merged map preserves the existing precedence semantic (workspace
+// rows overwrite global rows on key collision); only the provenance side-
+// channel is new. Existing single-return callers can ignore globalKeys.
+//
 // F1086 / #1206: the returned error string is the SAFE-CANNED message that
 // gets persisted to workspaces.last_sample_error AND broadcast as the
 // WORKSPACE_PROVISION_FAILED payload. Internal detail (the secret key name,
 // the encryption version, the decrypt-error text) is logged here, never
 // returned to the caller, so it can't leak via the canvas event stream
 // (cf. TestProvisionWorkspace_NoInternalErrorsInBroadcast).
-func loadWorkspaceSecrets(ctx context.Context, workspaceID string) (map[string]string, string) {
+func loadWorkspaceSecrets(ctx context.Context, workspaceID string) (map[string]string, map[string]struct{}, string) {
 	envVars := map[string]string{}
+	globalKeys := map[string]struct{}{}
 	globalRows, globalErr := db.DB.QueryContext(ctx,
 		`SELECT key, encrypted_value, encryption_version FROM global_secrets`)
 	if globalErr == nil {
@@ -889,9 +906,10 @@ func loadWorkspaceSecrets(ctx context.Context, workspaceID string) (map[string]s
 				decrypted, decErr := crypto.DecryptVersioned(v, ver)
 				if decErr != nil {
 					log.Printf("Provisioner: FATAL — failed to decrypt global secret %s (version=%d): %v — aborting provision of workspace %s", k, ver, decErr, workspaceID)
-					return nil, "failed to decrypt global secret"
+					return nil, nil, "failed to decrypt global secret"
 				}
 				envVars[k] = string(decrypted)
+				globalKeys[k] = struct{}{}
 			}
 		}
 		if err := globalRows.Err(); err != nil {
@@ -910,16 +928,22 @@ func loadWorkspaceSecrets(ctx context.Context, workspaceID string) (map[string]s
 				decrypted, decErr := crypto.DecryptVersioned(v, ver)
 				if decErr != nil {
 					log.Printf("Provisioner: FATAL — failed to decrypt workspace secret %s (version=%d) for %s: %v — aborting provision", k, ver, workspaceID, decErr)
-					return nil, "failed to decrypt workspace secret"
+					return nil, nil, "failed to decrypt workspace secret"
 				}
 				envVars[k] = string(decrypted)
+				// User-authored workspace_secrets value supersedes any
+				// global_secrets row of the same key — including dropping
+				// the operator-bleed provenance flag. The user explicitly
+				// re-set the value via the canvas Secrets tab, so it is
+				// no longer "the operator-store version."
+				delete(globalKeys, k)
 			}
 		}
 		if err := wsRows.Err(); err != nil {
 			log.Printf("Provisioner: workspace_secrets rows.Err workspace=%s: %v", workspaceID, err)
 		}
 	}
-	return envVars, ""
+	return envVars, globalKeys, ""
 }

 // provisionWorkspaceCP provisions a workspace via the control plane API.
@@ -135,6 +135,15 @@ func isForbiddenTenantEnvKey(key string) bool {
 // message and the structured-extra payload that goes to the
 // canvas Events tab. Sorting makes the message stable across
 // Go's randomized map iteration.
+//
+// PROVENANCE NOTE: this helper checks by env-var name ONLY and is
+// unaware of where each value came from. Production provision code
+// uses findForbiddenTenantEnvKeysFromGlobals instead, restricting
+// the check to keys originating from the operator-controlled
+// global_secrets table — see the doc-comment on that function and
+// the RFC#523 Layer 1 block in prepareProvisionContext. This name-
+// only helper is kept for the workspace_secrets-write CI lint
+// (Layer 3) and for tests that pin the deny-set definition.
 func findForbiddenTenantEnvKeys(envVars map[string]string) []string {
 	if len(envVars) == 0 {
 		return []string{}
@@ -149,6 +158,48 @@ func findForbiddenTenantEnvKeys(envVars map[string]string) []string {
 	return found
 }

+// findForbiddenTenantEnvKeysFromGlobals is the provenance-aware
+// variant used by RFC#523 Layer 1 in prepareProvisionContext. It
+// restricts the forbidden-key scan to keys whose value originated
+// from the operator-controlled `global_secrets` table.
+//
+// Fixes the over-fire reported by CTO empirical 2026-05-20: a user
+// who explicitly pastes their own scoped GitHub PAT under
+// GITHUB_TOKEN into the canvas Secrets tab (a `workspace_secrets`
+// row) was being blocked alongside the genuine operator-bleed case.
+// RFC#523's threat model (issue molecule-ai/internal#523 §"Threat
+// model") names operator-scope tokens injected via operator-side
+// stores; user-authored workspace_secrets is out of scope.
+//
+// globalSecretKeys is the set returned as the second value from
+// loadWorkspaceSecrets. A key that exists in BOTH stores is treated
+// as workspace_secrets (user override wins) — loadWorkspaceSecrets
+// drops the global flag when the workspace row is read.
+//
+// Empty/nil globalSecretKeys means no operator-side source was
+// loaded (e.g. tests, or table empty); the scan returns no hits.
+// Deterministic sort order, same as findForbiddenTenantEnvKeys.
+func findForbiddenTenantEnvKeysFromGlobals(envVars map[string]string, globalSecretKeys map[string]struct{}) []string {
+	if len(envVars) == 0 || len(globalSecretKeys) == 0 {
+		return []string{}
+	}
+	found := make([]string, 0)
+	for k := range globalSecretKeys {
+		if _, present := envVars[k]; !present {
+			// Defensive: a key flagged as global-origin must also
+			// be in the resolved env-set. If not, skip — the
+			// loadWorkspaceSecrets contract guarantees this never
+			// happens, but the helper stays total.
+			continue
+		}
+		if isForbiddenTenantEnvKey(k) {
+			found = append(found, k)
+		}
+	}
+	sort.Strings(found)
+	return found
+}
+
 // formatForbiddenTenantEnvError builds the safe-canned user-facing
 // message for a provision aborted because forbidden env keys are
 // present in the resolved env-set. The message names the
@@ -150,6 +150,106 @@ func TestFindForbiddenTenantEnvKeys_SingleAndMultipleSorted(t *testing.T) {
 	}
 }

+// TestFindForbiddenTenantEnvKeysFromGlobals pins the provenance-aware
+// behaviour added 2026-05-20 to fix the RFC#523 Layer 1 over-fire: a
+// user-set workspace_secrets row with key=GITHUB_TOKEN must NOT be
+// flagged, while a global_secrets row of the same key MUST be.
+//
+// Cross-references the empirical bug: CTO 2026-05-20 hit
+// `provision aborted: env var "GITHUB_TOKEN" is operator-scope...`
+// after pasting their own scoped PAT into the canvas Secrets tab
+// (workspace_secrets) — the original blanket check fired on the
+// merged env-set regardless of provenance.
+func TestFindForbiddenTenantEnvKeysFromGlobals_UserSetAllowed(t *testing.T) {
+	// User pasted their own PAT via canvas Secrets tab —
+	// workspace_secrets row only. globalSecretKeys is empty for
+	// this key, so the check MUST not fire.
+	envVars := map[string]string{
+		"GITHUB_TOKEN":      "ghp_FAKEUSERPAT_user_set_via_canvas",
+		"ANTHROPIC_API_KEY": "sk-ant-keep",
+	}
+	globalKeys := map[string]struct{}{} // nothing from global_secrets
+	got := findForbiddenTenantEnvKeysFromGlobals(envVars, globalKeys)
+	if len(got) != 0 {
+		t.Errorf("user-set workspace_secrets with GITHUB_TOKEN: got %v; want empty (provenance-allowed)", got)
+	}
+}
+
+func TestFindForbiddenTenantEnvKeysFromGlobals_OperatorLeakBlocked(t *testing.T) {
+	// Operator-store bleed — GITHUB_TOKEN sourced from global_secrets.
+	// This is the literal RFC#523 §"Threat model" attack vector.
+	// Check MUST fire and name GITHUB_TOKEN.
+	envVars := map[string]string{
+		"GITHUB_TOKEN":      "ghp_OPERATOR_LEAK_from_global_secrets",
+		"ANTHROPIC_API_KEY": "sk-ant-keep",
+	}
+	globalKeys := map[string]struct{}{
+		"GITHUB_TOKEN":      {},
+		"ANTHROPIC_API_KEY": {},
+	}
+	got := findForbiddenTenantEnvKeysFromGlobals(envVars, globalKeys)
+	if len(got) != 1 || got[0] != "GITHUB_TOKEN" {
+		t.Errorf("operator-leak GITHUB_TOKEN in global_secrets: got %v; want [GITHUB_TOKEN]", got)
+	}
+}
+
+func TestFindForbiddenTenantEnvKeysFromGlobals_UserOverrideOfGlobalAllowed(t *testing.T) {
+	// Both stores have the key; loadWorkspaceSecrets drops the global
+	// flag when the workspace row supersedes (caller contract).
+	// Simulate that here: globalKeys does NOT contain GITHUB_TOKEN
+	// because workspace_secrets re-set it. Allowed.
+	envVars := map[string]string{
+		"GITHUB_TOKEN": "ghp_USER_RESET_after_global_was_present",
+	}
+	globalKeys := map[string]struct{}{} // workspace overrode → flag dropped
+	got := findForbiddenTenantEnvKeysFromGlobals(envVars, globalKeys)
+	if len(got) != 0 {
+		t.Errorf("user-override of global GITHUB_TOKEN: got %v; want empty", got)
+	}
+}
+
+func TestFindForbiddenTenantEnvKeysFromGlobals_MultipleOperatorLeaks(t *testing.T) {
+	// Multiple operator-leaked tokens — must return sorted slice.
+	envVars := map[string]string{
+		"GITHUB_TOKEN":           "leak1",
+		"CP_ADMIN_API_TOKEN":     "leak2",
+		"MOLECULE_OPERATOR_HOST": "leak3",
+		"RAILWAY_TOKEN":          "leak4",
+		"ANTHROPIC_API_KEY":      "user-allowed",
+	}
+	globalKeys := map[string]struct{}{
+		"GITHUB_TOKEN":           {},
+		"CP_ADMIN_API_TOKEN":     {},
+		"MOLECULE_OPERATOR_HOST": {},
+		"RAILWAY_TOKEN":          {},
+	}
+	got := findForbiddenTenantEnvKeysFromGlobals(envVars, globalKeys)
+	want := []string{"CP_ADMIN_API_TOKEN", "GITHUB_TOKEN", "MOLECULE_OPERATOR_HOST", "RAILWAY_TOKEN"}
+	if len(got) != len(want) {
+		t.Fatalf("operator-leak multi: got %v; want %v", got, want)
+	}
+	for i := range want {
+		if got[i] != want[i] {
+			t.Errorf("operator-leak multi[%d] = %q; want %q (full got=%v)", i, got[i], want[i], got)
+		}
+	}
+}
+
+func TestFindForbiddenTenantEnvKeysFromGlobals_EmptyInputs(t *testing.T) {
+	if got := findForbiddenTenantEnvKeysFromGlobals(nil, nil); len(got) != 0 {
+		t.Errorf("nil/nil: got %v; want empty", got)
+	}
+	if got := findForbiddenTenantEnvKeysFromGlobals(map[string]string{}, map[string]struct{}{}); len(got) != 0 {
+		t.Errorf("empty/empty: got %v; want empty", got)
+	}
+	// Non-empty envVars but no global provenance — nothing came from
+	// global_secrets, so nothing to block (even if a workspace_secrets
+	// row exists for GITHUB_TOKEN).
+	if got := findForbiddenTenantEnvKeysFromGlobals(map[string]string{"GITHUB_TOKEN": "ghp_user"}, map[string]struct{}{}); len(got) != 0 {
+		t.Errorf("workspace-only GITHUB_TOKEN: got %v; want empty", got)
+	}
+}
+
 func TestFormatForbiddenTenantEnvError_Phrasing(t *testing.T) {
 	// Empty input — defensive total function.
 	if msg := formatForbiddenTenantEnvError(nil); !strings.Contains(msg, "RFC#523") {
@@ -120,38 +120,52 @@ func (h *WorkspaceHandler) prepareProvisionContext(
 	payload models.CreateWorkspacePayload,
 	resetClaudeSession bool,
 ) (*preparedProvisionContext, *provisionAbort) {
-	envVars, decryptErr := loadWorkspaceSecrets(ctx, workspaceID)
+	envVars, globalSecretKeys, decryptErr := loadWorkspaceSecrets(ctx, workspaceID)
 	if decryptErr != "" {
 		return nil, &provisionAbort{Msg: decryptErr}
 	}

-	// RFC#523 Layer 1 (task #146): refuse to start a tenant workspace
-	// when any forbidden operator-scope env var is present in the
-	// resolved secret-load env-set. Runs IMMEDIATELY after
-	// loadWorkspaceSecrets and BEFORE applyAgentGitHTTPCreds — the
-	// per-agent persona injection sets a fallback GITEA_USER/GITEA_TOKEN
-	// pair that the buildContainerEnv forensic #145 guard will strip
-	// later. We want THIS layer to catch leaks from the operator-
-	// controlled stores (global_secrets, workspace_secrets) only, not
-	// the deliberate per-agent platform injection that lives downstream.
+	// RFC#523 Layer 1 (issue molecule-ai/internal#523): refuse to start a
+	// tenant workspace when any forbidden operator-scope env var is
+	// present in the operator-controlled store (global_secrets).
 	//
-	// Threat model is "an upstream secret-writer accidentally widened
-	// the propagation set" — e.g. an operator pastes GITEA_TOKEN into
-	// a workspace_secrets row. Caught here, surfaced loudly to the
-	// canvas Events tab, fail-closed. The existing forensic #145 guard
-	// in provisioner.buildContainerEnv / CPProvisioner.Start stays as
-	// defense-in-depth: it silently strips at container-env-build time.
+	// PROVENANCE-AWARE — fix for the over-fire reported by CTO empirical
+	// 2026-05-20: the original implementation ran this check on the
+	// merged env-set, which conflated two very different sources:
+	//
+	//   1. global_secrets — operator-side store. ANY operator-scope token
+	//      here is an upstream bleed (e.g. tenant_secrets_seed.go pre-
+	//      4f45d37 propagating CP-env GITHUB_TOKEN into every fresh
+	//      tenant's row). RFC#523's literal threat model.
+	//
+	//   2. workspace_secrets — user-set via the canvas Secrets tab,
+	//      authenticated as the workspace owner. If the user pastes
+	//      their own scoped GitHub PAT under GITHUB_TOKEN so the agent
+	//      can push to their personal repos, that is the system working
+	//      as designed — not the leak RFC#523 was written to catch.
+	//
+	// The provenance side-channel from loadWorkspaceSecrets tells us
+	// which keys came from global_secrets (workspace_secrets writes
+	// override and clear the flag, since the user explicitly re-set
+	// the value). We restrict the abort to that set.
+	//
+	// Defense-in-depth NOT removed: provisioner.buildContainerEnv still
+	// runs the forensic #145 silent-strip (lower-confidence late layer),
+	// and workspace/entrypoint.sh has Layer 2 inside the container. If a
+	// real operator-scope token slips into workspace_secrets some other
+	// way, the later layers (and the per-workspace SG, and the per-tenant
+	// VPC isolation) are still in force.
 	//
 	// Key names (not values) are echoed in the user-facing error so
 	// the operator can locate and remove the offending row. Per memory
 	// `feedback_passwords_in_chat_are_burned`, key names are not
 	// secret; values would be.
-	if forbidden := findForbiddenTenantEnvKeys(envVars); len(forbidden) > 0 {
+	if forbidden := findForbiddenTenantEnvKeysFromGlobals(envVars, globalSecretKeys); len(forbidden) > 0 {
 		msg := formatForbiddenTenantEnvError(forbidden)
-		log.Printf("Provisioner: ABORT workspace=%s — forbidden operator-scope env keys present: %v (RFC#523)", workspaceID, forbidden)
+		log.Printf("Provisioner: ABORT workspace=%s — forbidden operator-scope env keys present in global_secrets: %v (RFC#523)", workspaceID, forbidden)
 		return nil, &provisionAbort{
 			Msg:   msg,
-			Extra: map[string]interface{}{"error": msg, "forbidden_env_keys": forbidden, "rfc": "523"},
+			Extra: map[string]interface{}{"error": msg, "forbidden_env_keys": forbidden, "rfc": "523", "source": "global_secrets"},
 		}
 	}