From 514341dd51a52620881cd88d6bee5a5db15521f6 Mon Sep 17 00:00:00 2001 From: core-be Date: Wed, 20 May 2026 15:22:14 -0700 Subject: [PATCH] fix(rfc523): scope forbidden-env check to global_secrets (allow user-set workspace_secrets) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit RFC#523 Layer 1 was over-firing: a user pasting their own scoped GitHub PAT into the canvas Secrets tab under GITHUB_TOKEN landed in the per-workspace `workspace_secrets` table, but the L1 guardrail ran on the merged env-set from loadWorkspaceSecrets and refused the provision with the operator-leak abort message — even though RFC#523's literal threat model (issue molecule-ai/internal#523 §"Threat model") only names operator-scope tokens injected via operator-side stores. CTO empirical 2026-05-20. Fix (Option C — schema-natural provenance split): - loadWorkspaceSecrets now returns a second value: the set of keys whose value came from `global_secrets` (the operator-controlled table). A subsequent workspace_secrets row of the same key drops the global-origin flag (user override wins). - New helper findForbiddenTenantEnvKeysFromGlobals restricts the scan to that set; the old findForbiddenTenantEnvKeys (name-only) is kept for the Layer 3 CI-lint use case + test pins. - prepareProvisionContext calls the provenance-aware helper. Error surface adds source="global_secrets" to the structured Extra payload so canvas Events can disambiguate. Defense-in-depth NOT removed: - provisioner.buildContainerEnv silent-strip (forensic #145) still applies at the container-env-build step. - workspace/entrypoint.sh L2 in-container env-grep stays as-is (a separate concern — see the "Open question" in the PR body about whether L2 also needs provenance-awareness). Regression tests (workspace_provision_forbidden_env_test.go): - user-set workspace_secrets w/ GITHUB_TOKEN → allowed - operator-leak global_secrets w/ GITHUB_TOKEN → blocked - user-override of pre-existing global GITHUB_TOKEN → allowed - multiple operator-leaks → sorted slice returned - nil/empty inputs total Refs: - molecule-ai/internal#523 (original RFC, §"Threat model") - 4f45d37 (RFC#523 follow-up: removed GITHUB_TOKEN from tenant seedAllowList — the one upstream operator-bleed source has already been eliminated) - feedback_upstream_docs_first_before_hypothesizing - task #146 (this orchestrator's tracker — the CI-lint side) Co-Authored-By: Claude Opus 4.7 (1M context) --- .../internal/handlers/workspace_provision.go | 32 +++++- .../workspace_provision_forbidden_env.go | 51 +++++++++ .../workspace_provision_forbidden_env_test.go | 100 ++++++++++++++++++ .../handlers/workspace_provision_shared.go | 52 +++++---- 4 files changed, 212 insertions(+), 23 deletions(-) diff --git a/workspace-server/internal/handlers/workspace_provision.go b/workspace-server/internal/handlers/workspace_provision.go index c30b9b299..11da5f448 100644 --- a/workspace-server/internal/handlers/workspace_provision.go +++ b/workspace-server/internal/handlers/workspace_provision.go @@ -869,14 +869,31 @@ func applyRuntimeModelEnv(envVars map[string]string, runtime, model string) { // Returns nil map + error string on decrypt failure. Shared by both Docker // and control plane provisioning paths to avoid duplication. // +// The second return value (globalKeys) records which keys originated from +// the operator-controlled `global_secrets` table — used by RFC#523 Layer 1 +// to constrain its forbidden-key check to the operator-bleed channel, +// instead of blanket-blocking by name across BOTH provenance channels (the +// over-fire that breaks the legitimate user flow of pasting their own +// GitHub PAT into the canvas Secrets tab → workspace_secrets row). See +// `feedback_upstream_docs_first_before_hypothesizing`: RFC#523's threat +// model (issue molecule-ai/internal#523 §"Threat model") names operator- +// scope tokens being injected via provision-time env / operator-side +// stores — NOT the user's own scoped PAT they explicitly authorized via +// the per-workspace Secrets tab. +// +// The merged map preserves the existing precedence semantic (workspace +// rows overwrite global rows on key collision); only the provenance side- +// channel is new. Existing single-return callers can ignore globalKeys. +// // F1086 / #1206: the returned error string is the SAFE-CANNED message that // gets persisted to workspaces.last_sample_error AND broadcast as the // WORKSPACE_PROVISION_FAILED payload. Internal detail (the secret key name, // the encryption version, the decrypt-error text) is logged here, never // returned to the caller, so it can't leak via the canvas event stream // (cf. TestProvisionWorkspace_NoInternalErrorsInBroadcast). -func loadWorkspaceSecrets(ctx context.Context, workspaceID string) (map[string]string, string) { +func loadWorkspaceSecrets(ctx context.Context, workspaceID string) (map[string]string, map[string]struct{}, string) { envVars := map[string]string{} + globalKeys := map[string]struct{}{} globalRows, globalErr := db.DB.QueryContext(ctx, `SELECT key, encrypted_value, encryption_version FROM global_secrets`) if globalErr == nil { @@ -889,9 +906,10 @@ func loadWorkspaceSecrets(ctx context.Context, workspaceID string) (map[string]s decrypted, decErr := crypto.DecryptVersioned(v, ver) if decErr != nil { log.Printf("Provisioner: FATAL — failed to decrypt global secret %s (version=%d): %v — aborting provision of workspace %s", k, ver, decErr, workspaceID) - return nil, "failed to decrypt global secret" + return nil, nil, "failed to decrypt global secret" } envVars[k] = string(decrypted) + globalKeys[k] = struct{}{} } } if err := globalRows.Err(); err != nil { @@ -910,16 +928,22 @@ func loadWorkspaceSecrets(ctx context.Context, workspaceID string) (map[string]s decrypted, decErr := crypto.DecryptVersioned(v, ver) if decErr != nil { log.Printf("Provisioner: FATAL — failed to decrypt workspace secret %s (version=%d) for %s: %v — aborting provision", k, ver, workspaceID, decErr) - return nil, "failed to decrypt workspace secret" + return nil, nil, "failed to decrypt workspace secret" } envVars[k] = string(decrypted) + // User-authored workspace_secrets value supersedes any + // global_secrets row of the same key — including dropping + // the operator-bleed provenance flag. The user explicitly + // re-set the value via the canvas Secrets tab, so it is + // no longer "the operator-store version." + delete(globalKeys, k) } } if err := wsRows.Err(); err != nil { log.Printf("Provisioner: workspace_secrets rows.Err workspace=%s: %v", workspaceID, err) } } - return envVars, "" + return envVars, globalKeys, "" } // provisionWorkspaceCP provisions a workspace via the control plane API. diff --git a/workspace-server/internal/handlers/workspace_provision_forbidden_env.go b/workspace-server/internal/handlers/workspace_provision_forbidden_env.go index 30764d7f0..db6759ac9 100644 --- a/workspace-server/internal/handlers/workspace_provision_forbidden_env.go +++ b/workspace-server/internal/handlers/workspace_provision_forbidden_env.go @@ -135,6 +135,15 @@ func isForbiddenTenantEnvKey(key string) bool { // message and the structured-extra payload that goes to the // canvas Events tab. Sorting makes the message stable across // Go's randomized map iteration. +// +// PROVENANCE NOTE: this helper checks by env-var name ONLY and is +// unaware of where each value came from. Production provision code +// uses findForbiddenTenantEnvKeysFromGlobals instead, restricting +// the check to keys originating from the operator-controlled +// global_secrets table — see the doc-comment on that function and +// the RFC#523 Layer 1 block in prepareProvisionContext. This name- +// only helper is kept for the workspace_secrets-write CI lint +// (Layer 3) and for tests that pin the deny-set definition. func findForbiddenTenantEnvKeys(envVars map[string]string) []string { if len(envVars) == 0 { return []string{} @@ -149,6 +158,48 @@ func findForbiddenTenantEnvKeys(envVars map[string]string) []string { return found } +// findForbiddenTenantEnvKeysFromGlobals is the provenance-aware +// variant used by RFC#523 Layer 1 in prepareProvisionContext. It +// restricts the forbidden-key scan to keys whose value originated +// from the operator-controlled `global_secrets` table. +// +// Fixes the over-fire reported by CTO empirical 2026-05-20: a user +// who explicitly pastes their own scoped GitHub PAT under +// GITHUB_TOKEN into the canvas Secrets tab (a `workspace_secrets` +// row) was being blocked alongside the genuine operator-bleed case. +// RFC#523's threat model (issue molecule-ai/internal#523 §"Threat +// model") names operator-scope tokens injected via operator-side +// stores; user-authored workspace_secrets is out of scope. +// +// globalSecretKeys is the set returned as the second value from +// loadWorkspaceSecrets. A key that exists in BOTH stores is treated +// as workspace_secrets (user override wins) — loadWorkspaceSecrets +// drops the global flag when the workspace row is read. +// +// Empty/nil globalSecretKeys means no operator-side source was +// loaded (e.g. tests, or table empty); the scan returns no hits. +// Deterministic sort order, same as findForbiddenTenantEnvKeys. +func findForbiddenTenantEnvKeysFromGlobals(envVars map[string]string, globalSecretKeys map[string]struct{}) []string { + if len(envVars) == 0 || len(globalSecretKeys) == 0 { + return []string{} + } + found := make([]string, 0) + for k := range globalSecretKeys { + if _, present := envVars[k]; !present { + // Defensive: a key flagged as global-origin must also + // be in the resolved env-set. If not, skip — the + // loadWorkspaceSecrets contract guarantees this never + // happens, but the helper stays total. + continue + } + if isForbiddenTenantEnvKey(k) { + found = append(found, k) + } + } + sort.Strings(found) + return found +} + // formatForbiddenTenantEnvError builds the safe-canned user-facing // message for a provision aborted because forbidden env keys are // present in the resolved env-set. The message names the diff --git a/workspace-server/internal/handlers/workspace_provision_forbidden_env_test.go b/workspace-server/internal/handlers/workspace_provision_forbidden_env_test.go index ee7bf8040..d3e6998dc 100644 --- a/workspace-server/internal/handlers/workspace_provision_forbidden_env_test.go +++ b/workspace-server/internal/handlers/workspace_provision_forbidden_env_test.go @@ -150,6 +150,106 @@ func TestFindForbiddenTenantEnvKeys_SingleAndMultipleSorted(t *testing.T) { } } +// TestFindForbiddenTenantEnvKeysFromGlobals pins the provenance-aware +// behaviour added 2026-05-20 to fix the RFC#523 Layer 1 over-fire: a +// user-set workspace_secrets row with key=GITHUB_TOKEN must NOT be +// flagged, while a global_secrets row of the same key MUST be. +// +// Cross-references the empirical bug: CTO 2026-05-20 hit +// `provision aborted: env var "GITHUB_TOKEN" is operator-scope...` +// after pasting their own scoped PAT into the canvas Secrets tab +// (workspace_secrets) — the original blanket check fired on the +// merged env-set regardless of provenance. +func TestFindForbiddenTenantEnvKeysFromGlobals_UserSetAllowed(t *testing.T) { + // User pasted their own PAT via canvas Secrets tab — + // workspace_secrets row only. globalSecretKeys is empty for + // this key, so the check MUST not fire. + envVars := map[string]string{ + "GITHUB_TOKEN": "ghp_FAKEUSERPAT_user_set_via_canvas", + "ANTHROPIC_API_KEY": "sk-ant-keep", + } + globalKeys := map[string]struct{}{} // nothing from global_secrets + got := findForbiddenTenantEnvKeysFromGlobals(envVars, globalKeys) + if len(got) != 0 { + t.Errorf("user-set workspace_secrets with GITHUB_TOKEN: got %v; want empty (provenance-allowed)", got) + } +} + +func TestFindForbiddenTenantEnvKeysFromGlobals_OperatorLeakBlocked(t *testing.T) { + // Operator-store bleed — GITHUB_TOKEN sourced from global_secrets. + // This is the literal RFC#523 §"Threat model" attack vector. + // Check MUST fire and name GITHUB_TOKEN. + envVars := map[string]string{ + "GITHUB_TOKEN": "ghp_OPERATOR_LEAK_from_global_secrets", + "ANTHROPIC_API_KEY": "sk-ant-keep", + } + globalKeys := map[string]struct{}{ + "GITHUB_TOKEN": {}, + "ANTHROPIC_API_KEY": {}, + } + got := findForbiddenTenantEnvKeysFromGlobals(envVars, globalKeys) + if len(got) != 1 || got[0] != "GITHUB_TOKEN" { + t.Errorf("operator-leak GITHUB_TOKEN in global_secrets: got %v; want [GITHUB_TOKEN]", got) + } +} + +func TestFindForbiddenTenantEnvKeysFromGlobals_UserOverrideOfGlobalAllowed(t *testing.T) { + // Both stores have the key; loadWorkspaceSecrets drops the global + // flag when the workspace row supersedes (caller contract). + // Simulate that here: globalKeys does NOT contain GITHUB_TOKEN + // because workspace_secrets re-set it. Allowed. + envVars := map[string]string{ + "GITHUB_TOKEN": "ghp_USER_RESET_after_global_was_present", + } + globalKeys := map[string]struct{}{} // workspace overrode → flag dropped + got := findForbiddenTenantEnvKeysFromGlobals(envVars, globalKeys) + if len(got) != 0 { + t.Errorf("user-override of global GITHUB_TOKEN: got %v; want empty", got) + } +} + +func TestFindForbiddenTenantEnvKeysFromGlobals_MultipleOperatorLeaks(t *testing.T) { + // Multiple operator-leaked tokens — must return sorted slice. + envVars := map[string]string{ + "GITHUB_TOKEN": "leak1", + "CP_ADMIN_API_TOKEN": "leak2", + "MOLECULE_OPERATOR_HOST": "leak3", + "RAILWAY_TOKEN": "leak4", + "ANTHROPIC_API_KEY": "user-allowed", + } + globalKeys := map[string]struct{}{ + "GITHUB_TOKEN": {}, + "CP_ADMIN_API_TOKEN": {}, + "MOLECULE_OPERATOR_HOST": {}, + "RAILWAY_TOKEN": {}, + } + got := findForbiddenTenantEnvKeysFromGlobals(envVars, globalKeys) + want := []string{"CP_ADMIN_API_TOKEN", "GITHUB_TOKEN", "MOLECULE_OPERATOR_HOST", "RAILWAY_TOKEN"} + if len(got) != len(want) { + t.Fatalf("operator-leak multi: got %v; want %v", got, want) + } + for i := range want { + if got[i] != want[i] { + t.Errorf("operator-leak multi[%d] = %q; want %q (full got=%v)", i, got[i], want[i], got) + } + } +} + +func TestFindForbiddenTenantEnvKeysFromGlobals_EmptyInputs(t *testing.T) { + if got := findForbiddenTenantEnvKeysFromGlobals(nil, nil); len(got) != 0 { + t.Errorf("nil/nil: got %v; want empty", got) + } + if got := findForbiddenTenantEnvKeysFromGlobals(map[string]string{}, map[string]struct{}{}); len(got) != 0 { + t.Errorf("empty/empty: got %v; want empty", got) + } + // Non-empty envVars but no global provenance — nothing came from + // global_secrets, so nothing to block (even if a workspace_secrets + // row exists for GITHUB_TOKEN). + if got := findForbiddenTenantEnvKeysFromGlobals(map[string]string{"GITHUB_TOKEN": "ghp_user"}, map[string]struct{}{}); len(got) != 0 { + t.Errorf("workspace-only GITHUB_TOKEN: got %v; want empty", got) + } +} + func TestFormatForbiddenTenantEnvError_Phrasing(t *testing.T) { // Empty input — defensive total function. if msg := formatForbiddenTenantEnvError(nil); !strings.Contains(msg, "RFC#523") { diff --git a/workspace-server/internal/handlers/workspace_provision_shared.go b/workspace-server/internal/handlers/workspace_provision_shared.go index dda460b73..80677623e 100644 --- a/workspace-server/internal/handlers/workspace_provision_shared.go +++ b/workspace-server/internal/handlers/workspace_provision_shared.go @@ -120,38 +120,52 @@ func (h *WorkspaceHandler) prepareProvisionContext( payload models.CreateWorkspacePayload, resetClaudeSession bool, ) (*preparedProvisionContext, *provisionAbort) { - envVars, decryptErr := loadWorkspaceSecrets(ctx, workspaceID) + envVars, globalSecretKeys, decryptErr := loadWorkspaceSecrets(ctx, workspaceID) if decryptErr != "" { return nil, &provisionAbort{Msg: decryptErr} } - // RFC#523 Layer 1 (task #146): refuse to start a tenant workspace - // when any forbidden operator-scope env var is present in the - // resolved secret-load env-set. Runs IMMEDIATELY after - // loadWorkspaceSecrets and BEFORE applyAgentGitHTTPCreds — the - // per-agent persona injection sets a fallback GITEA_USER/GITEA_TOKEN - // pair that the buildContainerEnv forensic #145 guard will strip - // later. We want THIS layer to catch leaks from the operator- - // controlled stores (global_secrets, workspace_secrets) only, not - // the deliberate per-agent platform injection that lives downstream. + // RFC#523 Layer 1 (issue molecule-ai/internal#523): refuse to start a + // tenant workspace when any forbidden operator-scope env var is + // present in the operator-controlled store (global_secrets). // - // Threat model is "an upstream secret-writer accidentally widened - // the propagation set" — e.g. an operator pastes GITEA_TOKEN into - // a workspace_secrets row. Caught here, surfaced loudly to the - // canvas Events tab, fail-closed. The existing forensic #145 guard - // in provisioner.buildContainerEnv / CPProvisioner.Start stays as - // defense-in-depth: it silently strips at container-env-build time. + // PROVENANCE-AWARE — fix for the over-fire reported by CTO empirical + // 2026-05-20: the original implementation ran this check on the + // merged env-set, which conflated two very different sources: + // + // 1. global_secrets — operator-side store. ANY operator-scope token + // here is an upstream bleed (e.g. tenant_secrets_seed.go pre- + // 4f45d37 propagating CP-env GITHUB_TOKEN into every fresh + // tenant's row). RFC#523's literal threat model. + // + // 2. workspace_secrets — user-set via the canvas Secrets tab, + // authenticated as the workspace owner. If the user pastes + // their own scoped GitHub PAT under GITHUB_TOKEN so the agent + // can push to their personal repos, that is the system working + // as designed — not the leak RFC#523 was written to catch. + // + // The provenance side-channel from loadWorkspaceSecrets tells us + // which keys came from global_secrets (workspace_secrets writes + // override and clear the flag, since the user explicitly re-set + // the value). We restrict the abort to that set. + // + // Defense-in-depth NOT removed: provisioner.buildContainerEnv still + // runs the forensic #145 silent-strip (lower-confidence late layer), + // and workspace/entrypoint.sh has Layer 2 inside the container. If a + // real operator-scope token slips into workspace_secrets some other + // way, the later layers (and the per-workspace SG, and the per-tenant + // VPC isolation) are still in force. // // Key names (not values) are echoed in the user-facing error so // the operator can locate and remove the offending row. Per memory // `feedback_passwords_in_chat_are_burned`, key names are not // secret; values would be. - if forbidden := findForbiddenTenantEnvKeys(envVars); len(forbidden) > 0 { + if forbidden := findForbiddenTenantEnvKeysFromGlobals(envVars, globalSecretKeys); len(forbidden) > 0 { msg := formatForbiddenTenantEnvError(forbidden) - log.Printf("Provisioner: ABORT workspace=%s — forbidden operator-scope env keys present: %v (RFC#523)", workspaceID, forbidden) + log.Printf("Provisioner: ABORT workspace=%s — forbidden operator-scope env keys present in global_secrets: %v (RFC#523)", workspaceID, forbidden) return nil, &provisionAbort{ Msg: msg, - Extra: map[string]interface{}{"error": msg, "forbidden_env_keys": forbidden, "rfc": "523"}, + Extra: map[string]interface{}{"error": msg, "forbidden_env_keys": forbidden, "rfc": "523", "source": "global_secrets"}, } } -- 2.52.0