From 3747fe2f49ae9d3b9c2e3e3158c8fb633a852b91 Mon Sep 17 00:00:00 2001 From: core-devops Date: Fri, 5 Jun 2026 00:43:33 -0700 Subject: [PATCH 1/2] =?UTF-8?q?harden(security):=20remove=20dev-mode=20fai?= =?UTF-8?q?l-open=20auth=20=E2=80=94=20fail-closed=20everywhere=20+=20dev-?= =?UTF-8?q?token=20+=20regression=20gate?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CTO directive: "nothing should be fail-open." Remove the dev-mode fail-open auth hatch so AdminAuth/WorkspaceAuth (and the discovery caller) ALWAYS require a real credential — fail-CLOSED in every environment, dev included — fix local dev to stay AUTHENTICATED (not open), and add a regression gate so fail-open cannot return. Removed fail-open call-sites (workspace-server): - internal/middleware/wsauth_middleware.go WorkspaceAuth — deleted the isDevModeFailOpen() short-circuit that let a bearer-less /workspaces/:id/* request through when MOLECULE_ENV=dev + ADMIN_TOKEN unset. - internal/middleware/wsauth_middleware.go AdminAuth — deleted BOTH fail-open branches: the Tier-1 lazy-bootstrap (no live tokens + no ADMIN_TOKEN ⇒ pass, the C4 /org/import pre-empt hole) and the Tier-1b isDevModeFailOpen() dev hatch. HasAnyLiveTokenGlobal is still probed for the 503-on-outage semantics but opens no path. - internal/handlers/discovery.go validateDiscoveryCaller — deleted the IsDevModeFailOpen() allow branch; discovery now requires a verified CP session or valid bearer in every env. - Removed the isDevModeFailOpen()/IsDevModeFailOpen() helper entirely. The two legitimately non-auth uses (rate-limit relaxation in ratelimit.go, loopback bind default in cmd/server) now key on a new NON-security isLocalDevEnv() predicate (MOLECULE_ENV only, decoupled from ADMIN_TOKEN). CanvasOrBearer's cosmetic-only behaviour (PUT /canvas/viewport) is unchanged. Dev path stays authenticated, not open: - scripts/dev-start.sh provisions a deterministic ADMIN_TOKEN into .env and exports the matching NEXT_PUBLIC_ADMIN_TOKEN so the dev Canvas sends a real bearer (canvas/src/lib/api.ts already attaches it; next.config.ts pair-guard). - Docs updated: .env.example, docs/quickstart.md, docs/architecture/overview.md. Regression gate: - internal/middleware/no_fail_open_test.go — asserts AdminAuth + WorkspaceAuth fail CLOSED (401) under the EXACT old-hatch conditions (ADMIN_TOKEN unset + MOLECULE_ENV=dev/development × hasLive 0/1). Proven RED against a temporarily restored hatch, GREEN after. Plus a source-guard test forbidding the isDevModeFailOpen(-style helper from re-appearing. - Converted the stale fail-open assertions in wsauth_middleware_test.go, discovery_test.go, security_regression_685_686_687_688_test.go and the devmode/bind tests to pin the fail-closed contract. Audit (other fail-open patterns on the auth surface): CanvasOrBearer and validateDiscoveryCaller retain a fail-open-on-DB-error (and CanvasOrBearer a no-token lazy-bootstrap) — both are documented availability tradeoffs on cosmetic / low-sensitivity routes, left as-is and flagged for follow-up. Verify: go build ./... ok; go vet middleware/cmd/handlers clean; full module go test ./... = 46 ok / 0 fail. Co-Authored-By: Claude Opus 4.8 (1M context) --- .env.example | 19 +- docs/architecture/overview.md | 2 +- docs/quickstart.md | 12 +- scripts/dev-start.sh | 73 +++++--- workspace-server/cmd/server/bind_test.go | 16 +- workspace-server/cmd/server/main.go | 30 ++-- .../internal/handlers/discovery.go | 16 +- .../internal/handlers/discovery_test.go | 43 ++--- ...ecurity_regression_685_686_687_688_test.go | 27 +-- .../internal/middleware/devmode.go | 76 ++++---- .../internal/middleware/devmode_test.go | 68 ++++--- .../internal/middleware/no_fail_open_test.go | 167 ++++++++++++++++++ .../internal/middleware/ratelimit.go | 19 +- .../internal/middleware/wsauth_middleware.go | 57 +++--- .../middleware/wsauth_middleware_test.go | 84 +++++---- 15 files changed, 455 insertions(+), 254 deletions(-) create mode 100644 workspace-server/internal/middleware/no_fail_open_test.go diff --git a/.env.example b/.env.example index 9c013dbfc..5d960105a 100644 --- a/.env.example +++ b/.env.example @@ -19,13 +19,22 @@ REDIS_URL=redis://localhost:6379 # itself to 3000 in canvas/package.json, so sourcing this file before # `npm run dev` won't accidentally make Next.js try to bind 8080. PORT=8080 -# ---- Admin credential — REQUIRED to close issue #684 (AdminAuth bearer bypass) ---- +# ---- Admin credential — REQUIRED in EVERY environment (auth is fail-closed) ---- +# Auth is fail-CLOSED everywhere now (harden/no-fail-open-auth): there is NO +# dev-mode escape hatch. AdminAuth / WorkspaceAuth / discovery all require a +# real credential. The canvas authenticates by sending this value as a bearer +# (it reads NEXT_PUBLIC_ADMIN_TOKEN — set it to the SAME value). # When ADMIN_TOKEN is set, only this value is accepted on /admin/* and /approvals/* routes. -# Without it, any valid workspace bearer token can call admin endpoints (backward compat -# fallback, still vulnerable). Set this in every environment, rotate when compromised. -# Generate: openssl rand -base64 32 +# (When unset, a fresh install 401s on admin routes and any valid workspace bearer +# is the only deprecated fallback once tokens exist — set ADMIN_TOKEN to close #684.) +# Generate: openssl rand -base64 32 (scripts/dev-start.sh provisions a fixed dev value) # Store in fly secrets / deployment env — NEVER commit the actual value here. ADMIN_TOKEN= +# NEXT_PUBLIC_ADMIN_TOKEN= # Canvas-side mirror of ADMIN_TOKEN. The canvas + # bakes this into its bundle and sends it as the + # bearer. MUST equal ADMIN_TOKEN (next.config.ts + # warns if the pair is half-set). dev-start.sh + # exports it for you. SECRETS_ENCRYPTION_KEY= # 32-byte key (raw or base64). Leave empty for plaintext (dev only). CONFIGS_DIR= # Path to workspace-configs-templates/ (auto-discovered if empty) PLUGINS_DIR= # Path to plugins/ directory (default: /plugins in container) @@ -34,7 +43,7 @@ PLUGINS_DIR= # Path to plugins/ directory (default: /plugins i # MOLECULE_MCP_ALLOW_SEND_MESSAGE= # Set to "true" to include send_message_to_user in the MCP bridge tool list (issue #810). Excluded by default to prevent unintended WebSocket pushes from CLI sessions. # MOLECULE_MCP_URL=http://localhost:8080 # Platform URL for opencode MCP config (opencode.json). Same as PLATFORM_URL; separate var so opencode configs can reference it without ambiguity. # WORKSPACE_DIR= # Optional global host path bind-mounted to /workspace in every container. Per-workspace workspace_dir column overrides this; if neither is set each workspace gets an isolated Docker named volume. -MOLECULE_ENV=development # Environment label (development/staging/production). Used for log tagging and for the AdminAuth dev-mode escape hatch (lets the Canvas dashboard keep working after the first workspace is created, when ADMIN_TOKEN is unset). SaaS deployments MUST set MOLECULE_ENV=production. +MOLECULE_ENV=development # Environment label (development/staging/production). Used for log tagging and for NON-security local-dev conveniences (loopback HTTP bind, relaxed rate-limit bucket). It is NOT an auth lever — auth is fail-closed in every environment. SaaS deployments MUST set MOLECULE_ENV=production. # MOLECULE_ENABLE_TEST_TOKENS= # Set to 1 to expose GET /admin/workspaces/:id/test-token (mints a fresh bearer token for E2E scripts). The route is auto-enabled when MOLECULE_ENV != production; this flag is the explicit override. Leave unset/0 in prod — the route 404s unless enabled. # MOLECULE_ORG_ID= # SaaS only: org UUID set by control plane on tenant machines. When set, workspace provisioning auto-routes through the control plane API instead of Docker. # CP_PROVISION_URL= # Override control plane URL for workspace provisioning (default: https://api.moleculesai.app). Only needed for testing against a non-production control plane. diff --git a/docs/architecture/overview.md b/docs/architecture/overview.md index 7634e2e75..7cbfdafdb 100644 --- a/docs/architecture/overview.md +++ b/docs/architecture/overview.md @@ -114,7 +114,7 @@ Opt-in pattern: when `idle_prompt` is non-empty in `config.yaml`, the workspace Three Gin middleware classes gate server-side routes. Full contract in `docs/runbooks/admin-auth.md`. -- **`middleware.AdminAuth(db.DB)`** — strict bearer-only. Used for any route where a forged request could leak prompts/memory, create/mutate workspaces, or leak ops intel. Lazy-bootstrap fail-open when `HasAnyLiveTokenGlobal` returns 0. +- **`middleware.AdminAuth(db.DB)`** — strict bearer-only and **fail-closed in every environment** (harden/no-fail-open-auth). Used for any route where a forged request could leak prompts/memory, create/mutate workspaces, or leak ops intel. The former lazy-bootstrap fail-open (pass when `HasAnyLiveTokenGlobal` returns 0) and the dev-mode escape hatch have both been removed — a fresh install must provision `ADMIN_TOKEN` to reach admin routes. - **`middleware.CanvasOrBearer(db.DB)`** — accepts a bearer token OR an Origin matching `CORS_ORIGINS`. Used **only** for cosmetic routes where a forged request has zero data/security impact. Currently only on `PUT /canvas/viewport`. Do not extend this to any route that leaks data or creates resources — see the runbook. - **`middleware.WorkspaceAuth(db.DB)`** — binds a bearer token to `:id`. Workspace A's token cannot hit workspace B's sub-routes. Used for the entire `/workspaces/:id/*` group except the A2A proxy (which has its own `CanCommunicate` layer). diff --git a/docs/quickstart.md b/docs/quickstart.md index 283a1023b..56fe6b21a 100644 --- a/docs/quickstart.md +++ b/docs/quickstart.md @@ -24,7 +24,7 @@ cd molecule-core That single script: -1. Generates an `ADMIN_TOKEN` into `.env` (first run only — preserved on re-runs) +1. Generates an `ADMIN_TOKEN` into `.env` (first run only — preserved on re-runs) and exports the matching `NEXT_PUBLIC_ADMIN_TOKEN` so the canvas authenticates with it. Auth is **fail-closed in every environment** (including local dev) — there is no dev-mode fail-open; the canvas reaches admin/workspace routes only because it sends this bearer. 2. Brings up Postgres, Redis, Langfuse, ClickHouse, and Temporal via `infra/scripts/setup.sh` 3. Populates the workspace template + plugin registry from `manifest.json` 4. Builds and starts the platform on `http://localhost:8080` @@ -62,11 +62,17 @@ If you only want the raw compose flow: docker compose -f docker-compose.infra.yml up -d ``` +> **Auth is fail-closed even in local dev.** Pick any local admin token and +> set it on *both* sides — the platform (`ADMIN_TOKEN`) and the canvas +> (`NEXT_PUBLIC_ADMIN_TOKEN`, same value). Without it the canvas 401s on every +> admin/workspace call. (`scripts/dev-start.sh` does this for you; the manual +> steps below set it explicitly.) + ### Step 3: Start the platform ```bash cd workspace-server -go run ./cmd/server +ADMIN_TOKEN=dev-local-admin-token MOLECULE_ENV=development go run ./cmd/server ``` The control plane listens on `http://localhost:8080`. @@ -78,7 +84,7 @@ In a new terminal: ```bash cd canvas npm install -npm run dev +NEXT_PUBLIC_ADMIN_TOKEN=dev-local-admin-token npm run dev # MUST match ADMIN_TOKEN above ``` Open `http://localhost:3000`. diff --git a/scripts/dev-start.sh b/scripts/dev-start.sh index 66d134b1a..f42a1cc98 100755 --- a/scripts/dev-start.sh +++ b/scripts/dev-start.sh @@ -46,46 +46,67 @@ cleanup() { trap cleanup EXIT INT TERM # ─────────────────────────────────────────────── 1. dev-mode auth posture - -# The AdminAuth middleware closes its fail-open the moment the first -# workspace token lands in the DB — at which point /workspaces and -# other admin routes 401 unless the caller has either ADMIN_TOKEN or -# the dev-mode escape hatch. The canvas at localhost:3000 has no -# bearer token to send, so without one of those two paths it can't -# call admin endpoints after a workspace exists. # -# For local dev the right posture is the dev-mode escape hatch: +# SECURITY (harden/no-fail-open-auth): the workspace-server auth chain is +# now fail-CLOSED in EVERY environment, dev included. There is NO dev-mode +# fail-open escape hatch anymore — AdminAuth / WorkspaceAuth / discovery all +# require a real credential. So local dev must AUTHENTICATE, not run open. # -# MOLECULE_ENV=development AND ADMIN_TOKEN unset +# The clean way to keep the canvas working locally is to provision a +# deterministic ADMIN_TOKEN and hand the matching NEXT_PUBLIC_ADMIN_TOKEN to +# the canvas bundle. The canvas already attaches `Authorization: Bearer +# $NEXT_PUBLIC_ADMIN_TOKEN` on every platform call (canvas/src/lib/api.ts), +# and next.config.ts warns if the pair is half-set. We set BOTH here. # -# That makes middleware.isDevModeFailOpen() return true and lets the -# canvas keep working without a bearer. Setting ADMIN_TOKEN here -# would BREAK the canvas (it has no way to read that token in dev). +# MOLECULE_ENV=development — dev conveniences (loopback bind, relaxed +# rate limit). NOT an auth lever. +# ADMIN_TOKEN= — server-side bearer AdminAuth/WorkspaceAuth +# enforce (Tier-2b). Real credential. +# NEXT_PUBLIC_ADMIN_TOKEN — same value, baked into the canvas bundle so +# the browser sends the matching bearer. # -# For SaaS the platform is provisioned with ADMIN_TOKEN set AND -# MOLECULE_ENV=production — either one closes the hatch. So the dev -# mode signal here is safe (it's only active when both other knobs -# are absent). +# For SaaS the platform is provisioned with a random ADMIN_TOKEN + the +# canvas image baked with the matching NEXT_PUBLIC_ADMIN_TOKEN, plus +# MOLECULE_ENV=production. Same shape, stronger secret. if [ -f "$ENV_FILE" ] && grep -q '^MOLECULE_ENV=' "$ENV_FILE"; then echo "==> Reusing MOLECULE_ENV from existing .env" else - echo "==> Setting MOLECULE_ENV=development in .env (dev-mode auth hatch)" + echo "==> Setting MOLECULE_ENV=development in .env" { if [ -f "$ENV_FILE" ]; then cat "$ENV_FILE" echo "" fi echo "# Generated by scripts/dev-start.sh on $(date -u +%Y-%m-%dT%H:%M:%SZ)" - echo "# Local-dev auth posture: dev-mode fail-open lets the canvas at" - echo "# localhost:3000 call admin endpoints without a bearer token." - echo "# DO NOT set ADMIN_TOKEN here in dev — it would close the hatch" - echo "# and the canvas would 401 on every admin call." + echo "# Local-dev conveniences (loopback bind, relaxed rate limit)." + echo "# Auth is fail-closed even in dev — see ADMIN_TOKEN below." echo "MOLECULE_ENV=development" } > "$ENV_FILE.tmp" mv "$ENV_FILE.tmp" "$ENV_FILE" echo " Saved to $ENV_FILE" fi +# Provision a deterministic dev ADMIN_TOKEN (idempotent — preserved across +# re-runs). This is the credential the canvas authenticates with locally; it +# is NOT a secret (it only guards your own localhost stack), so a fixed, +# well-known value is fine and keeps re-runs reproducible. +DEV_ADMIN_TOKEN="dev-local-admin-token" +if [ -f "$ENV_FILE" ] && grep -q '^ADMIN_TOKEN=' "$ENV_FILE"; then + echo "==> Reusing ADMIN_TOKEN from existing .env" +else + echo "==> Provisioning dev ADMIN_TOKEN in .env (fail-closed auth, authenticated canvas)" + { + cat "$ENV_FILE" + echo "" + echo "# Dev ADMIN_TOKEN — the canvas authenticates with this locally." + echo "# Auth is fail-closed; without a matching bearer the canvas 401s." + echo "# Fixed value is fine: it only guards your localhost stack." + echo "ADMIN_TOKEN=$DEV_ADMIN_TOKEN" + } > "$ENV_FILE.tmp" + mv "$ENV_FILE.tmp" "$ENV_FILE" + echo " Saved to $ENV_FILE" +fi + # Source .env so the platform inherits ADMIN_TOKEN (and anything else # the user has added — e.g. ANTHROPIC_API_KEY for skipping the canvas # Secrets UI). `set -a` exports every assignment in the sourced file @@ -95,6 +116,12 @@ set -a . "$ENV_FILE" set +a +# The canvas reads NEXT_PUBLIC_ADMIN_TOKEN at build/dev time and attaches it +# as the bearer on every platform call. Mirror the server-side ADMIN_TOKEN +# into it so the matched-pair guard in canvas/next.config.ts is satisfied and +# the browser authenticates. Exported for the `npm run dev` child below. +export NEXT_PUBLIC_ADMIN_TOKEN="$ADMIN_TOKEN" + # ─────────────────────────────────────────────── 2. infra + templates # Use setup.sh (not raw docker-compose) so the template registry gets @@ -195,7 +222,9 @@ cat < dev-mode -// fail-open default of 127.0.0.1 > production-shape empty (all interfaces). +// TestResolveBindHost pins the precedence: BIND_ADDR explicit > local-dev +// loopback default of 127.0.0.1 > production-shape empty (all interfaces). // -// Mutation-test invariant: removing the IsDevModeFailOpen() branch makes +// (harden/no-fail-open-auth) The loopback default is now keyed on +// MOLECULE_ENV alone (IsLocalDevEnv), decoupled from ADMIN_TOKEN — a dev box +// defaults to loopback even when it provisions an ADMIN_TOKEN. This is +// defense-in-depth, not an auth lever; auth is fail-closed in every env. +// +// Mutation-test invariant: removing the IsLocalDevEnv() branch makes // "no_bindaddr_devmode_unset_admin" fail (returns "" instead of "127.0.0.1"). // Removing the BIND_ADDR branch makes "explicit_bindaddr_*" cases fail. func TestResolveBindHost(t *testing.T) { @@ -35,7 +40,10 @@ func TestResolveBindHost(t *testing.T) { bindAddr: "", adminToken: "secret", molEnv: "dev", - want: "", // ADMIN_TOKEN flips IsDevModeFailOpen to false → all interfaces + // harden/no-fail-open-auth: loopback default is keyed on + // MOLECULE_ENV alone now — a dev box defaults to loopback even + // with ADMIN_TOKEN provisioned (which dev-start.sh now does). + want: "127.0.0.1", }, { name: "no_bindaddr_production_env", diff --git a/workspace-server/cmd/server/main.go b/workspace-server/cmd/server/main.go index e79c71d61..56c2e4a15 100644 --- a/workspace-server/cmd/server/main.go +++ b/workspace-server/cmd/server/main.go @@ -474,12 +474,12 @@ func main() { // HTTP server with graceful shutdown. // - // Bind host: in dev-mode (no ADMIN_TOKEN, MOLECULE_ENV=dev|development) - // the AdminAuth chain fails open by design; pairing that with a wildcard - // bind would expose unauth /workspaces to any same-LAN peer. Default to - // loopback when fail-open is active. Operators who need LAN exposure set - // BIND_ADDR=0.0.0.0 explicitly. Production (ADMIN_TOKEN set) is unchanged. - // See molecule-core#7. + // Bind host: in local dev (MOLECULE_ENV=dev|development) default the + // listener to loopback as defense-in-depth — a dev box shouldn't be + // reachable from the LAN. This is NOT an auth lever (auth is fail-closed + // in every env now); it's strictly the safer default. Operators who need + // LAN exposure set BIND_ADDR=0.0.0.0 explicitly. Production binds all + // interfaces (existing shape). See molecule-core#7. bindHost := resolveBindHost() srv := &http.Server{ Addr: fmt.Sprintf("%s:%s", bindHost, port), @@ -489,7 +489,7 @@ func main() { // Start server in goroutine go func() { - log.Printf("Platform starting on %s:%s (dev-mode-fail-open=%v)", bindHost, port, middleware.IsDevModeFailOpen()) + log.Printf("Platform starting on %s:%s (local-dev-env=%v)", bindHost, port, middleware.IsLocalDevEnv()) if err := srv.ListenAndServe(); err != nil && err != http.ErrServerClosed { log.Fatalf("Server failed: %v", err) } @@ -528,20 +528,20 @@ func envOr(key, fallback string) string { // // Precedence: // 1. BIND_ADDR — explicit operator override (any value, including "0.0.0.0"). -// 2. dev-mode fail-open active → "127.0.0.1" (loopback only). +// 2. local dev (MOLECULE_ENV=dev|development) → "127.0.0.1" (loopback only). // 3. otherwise → "" (Go binds every interface; existing prod/self-host shape). // -// Coupling the loopback default to middleware.IsDevModeFailOpen() means the -// two safety levers — bind narrowness and auth strength — move together. A -// production deploy (ADMIN_TOKEN set) keeps binding to all interfaces because -// the auth chain is doing its job; a dev Mac (no ADMIN_TOKEN, MOLECULE_ENV=dev) -// is reachable only via loopback because the auth chain is fail-open. See -// molecule-core#7 for the original LAN exposure finding. +// NOTE (harden/no-fail-open-auth): this is a defense-in-depth default, NOT an +// auth lever. Auth is fail-closed in every environment now, so the loopback +// default no longer compensates for a weak auth chain — it simply keeps a dev +// box off the LAN by default. It is keyed on MOLECULE_ENV alone (decoupled +// from ADMIN_TOKEN), because dev now provisions an ADMIN_TOKEN yet should +// still default to loopback. See molecule-core#7 for the original LAN finding. func resolveBindHost() string { if v := os.Getenv("BIND_ADDR"); v != "" { return v } - if middleware.IsDevModeFailOpen() { + if middleware.IsLocalDevEnv() { return "127.0.0.1" } return "" diff --git a/workspace-server/internal/handlers/discovery.go b/workspace-server/internal/handlers/discovery.go index ce679b29d..e57788e2b 100644 --- a/workspace-server/internal/handlers/discovery.go +++ b/workspace-server/internal/handlers/discovery.go @@ -435,15 +435,13 @@ func validateDiscoveryCaller(ctx context.Context, c *gin.Context, workspaceID st if !hasLive { return nil // legacy / pre-upgrade } - // Tier-1b dev-mode hatch — same escape hatch AdminAuth and - // WorkspaceAuth apply on a local Docker setup. Without this, the - // canvas Details tab can never load peers for a workspace that has - // registered its live token, producing the 401 the user sees. - // Gated by MOLECULE_ENV=development + empty ADMIN_TOKEN, so SaaS - // production stays strict. - if middleware.IsDevModeFailOpen() { - return nil - } + // (harden/no-fail-open-auth) The former dev-mode escape hatch that + // returned nil (allow) here when MOLECULE_ENV=dev + ADMIN_TOKEN unset + // has been REMOVED. Discovery callers must present a verified CP + // session or a valid bearer in every environment. Local dev now + // authenticates the Canvas with a provisioned ADMIN_TOKEN / + // NEXT_PUBLIC_ADMIN_TOKEN (see scripts/dev-start.sh), so the Details + // tab loads peers with a real credential rather than via fail-open. // Try session cookie auth first (SaaS canvas path). // verifiedCPSession returns (valid, presented): diff --git a/workspace-server/internal/handlers/discovery_test.go b/workspace-server/internal/handlers/discovery_test.go index be855323a..001271059 100644 --- a/workspace-server/internal/handlers/discovery_test.go +++ b/workspace-server/internal/handlers/discovery_test.go @@ -927,13 +927,14 @@ func TestDiscoverHostPeer_Smoke_Success(t *testing.T) { } } -// ==================== Peers auth — dev-mode fail-open gate ==================== +// ==================== Peers auth — fail-CLOSED gate ==================== // -// validateDiscoveryCaller applies a Tier-1b dev-mode hatch so the canvas -// user session (which holds no workspace-scoped bearer) can still load -// the Details → PEERS list on a local Docker setup. The gate must pass -// ONLY when MOLECULE_ENV is development AND ADMIN_TOKEN is empty. -// These tests pin that contract against accidental polarity flips. +// (harden/no-fail-open-auth) validateDiscoveryCaller USED to apply a +// Tier-1b dev-mode hatch that let the bearer-less canvas session load the +// Details → PEERS list when MOLECULE_ENV=development AND ADMIN_TOKEN empty. +// That hatch has been REMOVED — discovery callers must present a verified +// CP session or a valid bearer in every environment. These tests pin the +// fail-closed contract against accidental re-introduction. // peersAuthFixtureHasLiveToken seeds the mock rows required for the // Peers handler to reach the auth branch: HasAnyLiveToken → true (a @@ -946,10 +947,12 @@ func peersAuthFixtureHasLiveToken(mock sqlmock.Sqlmock, workspaceID string) { WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(1)) } -func TestPeers_DevModeFailOpen_AllowsBearerlessRequest(t *testing.T) { - // Dev mode: MOLECULE_ENV=development AND ADMIN_TOKEN empty. Canvas - // sends no bearer token; validateDiscoveryCaller must return nil - // (allow) and the handler must proceed to return the peer list. +func TestPeers_DevMode_BearerlessRequest_FailsClosed(t *testing.T) { + // (harden/no-fail-open-auth) Exact old-hatch conditions: + // MOLECULE_ENV=development AND ADMIN_TOKEN empty, with a live token in + // the DB. The bearer-less canvas-style request must now 401 — the + // dev-mode hatch that returned nil (allow) here is gone. Local dev + // authenticates via a provisioned ADMIN_TOKEN (scripts/dev-start.sh). t.Setenv("MOLECULE_ENV", "development") t.Setenv("ADMIN_TOKEN", "") @@ -957,22 +960,10 @@ func TestPeers_DevModeFailOpen_AllowsBearerlessRequest(t *testing.T) { setupTestRedis(t) handler := NewDiscoveryHandler() + // Only the HasAnyLiveToken probe runs; auth 401s before the peer + // queries, so no further expectations are seeded. peersAuthFixtureHasLiveToken(mock, "ws-dev") - // Root workspace → children+parent queries still fire but the - // parent_id lookup comes first. - mock.ExpectQuery("SELECT parent_id FROM workspaces WHERE id ="). - WithArgs("ws-dev"). - WillReturnRows(sqlmock.NewRows([]string{"parent_id"}).AddRow(nil)) - peerCols := []string{"id", "name", "role", "tier", "status", "agent_card", "url", "parent_id", "active_tasks"} - mock.ExpectQuery("SELECT w.id.+WHERE w.parent_id IS NULL AND w.id"). - WithArgs("ws-dev"). - WillReturnRows(sqlmock.NewRows(peerCols)) - // #383 — children query gained explicit `w.id != $2` self-filter. - mock.ExpectQuery("SELECT w.id.+WHERE w.parent_id = \\$1 AND w.id != \\$2 AND w.status"). - WithArgs("ws-dev", "ws-dev"). - WillReturnRows(sqlmock.NewRows(peerCols)) - w := httptest.NewRecorder() c, _ := gin.CreateTestContext(w) c.Params = gin.Params{{Key: "id", Value: "ws-dev"}} @@ -980,8 +971,8 @@ func TestPeers_DevModeFailOpen_AllowsBearerlessRequest(t *testing.T) { handler.Peers(c) - if w.Code != http.StatusOK { - t.Fatalf("expected 200 under dev-mode hatch, got %d: %s", w.Code, w.Body.String()) + if w.Code != http.StatusUnauthorized { + t.Fatalf("expected 401 (fail-closed) under old dev-mode hatch conditions, got %d: %s", w.Code, w.Body.String()) } } diff --git a/workspace-server/internal/handlers/security_regression_685_686_687_688_test.go b/workspace-server/internal/handlers/security_regression_685_686_687_688_test.go index 861a8accb..87f3d1ef0 100644 --- a/workspace-server/internal/handlers/security_regression_685_686_687_688_test.go +++ b/workspace-server/internal/handlers/security_regression_685_686_687_688_test.go @@ -89,13 +89,16 @@ func TestSecurity_GetTemplates_NoAuth_Returns401(t *testing.T) { } } -// TestSecurity_GetTemplates_FreshInstall_FailsOpen verifies that GET /templates -// still succeeds on a fresh install (zero enrolled workspaces → AdminAuth fail-open). -// This is the regression check: the auth gate must not break new deployments. -func TestSecurity_GetTemplates_FreshInstall_FailsOpen(t *testing.T) { +// TestSecurity_GetTemplates_FreshInstall_FailsClosed pins the post-hardening +// contract (harden/no-fail-open-auth): GET /templates on a fresh install (zero +// enrolled workspaces, no ADMIN_TOKEN) now 401s with no bearer. The former +// AdminAuth Tier-1 lazy-bootstrap fail-open (fresh install ⇒ 200) is gone — a +// new deployment must provision ADMIN_TOKEN (dev does so via dev-start.sh). +func TestSecurity_GetTemplates_FreshInstall_FailsClosed(t *testing.T) { setupTestDB(t) setupTestRedis(t) t.Setenv("ADMIN_TOKEN", "") + t.Setenv("MOLECULE_ENV", "") authDB, authMock := newFreshInstallAuthDB(t) tmpDir := t.TempDir() @@ -108,8 +111,8 @@ func TestSecurity_GetTemplates_FreshInstall_FailsOpen(t *testing.T) { req, _ := http.NewRequest(http.MethodGet, "/templates", nil) r.ServeHTTP(w, req) - if w.Code != http.StatusOK { - t.Errorf("#686 GET /templates fresh-install: want 200 (fail-open), got %d body=%s", w.Code, w.Body.String()) + if w.Code != http.StatusUnauthorized { + t.Errorf("#686 GET /templates fresh-install fail-closed: want 401, got %d body=%s", w.Code, w.Body.String()) } if err := authMock.ExpectationsWereMet(); err != nil { t.Errorf("unmet auth mock expectations: %v", err) @@ -148,12 +151,14 @@ func TestSecurity_GetOrgTemplates_NoAuth_Returns401(t *testing.T) { } } -// TestSecurity_GetOrgTemplates_FreshInstall_FailsOpen mirrors the /templates -// regression check for /org/templates — fresh installs must still work. -func TestSecurity_GetOrgTemplates_FreshInstall_FailsOpen(t *testing.T) { +// TestSecurity_GetOrgTemplates_FreshInstall_FailsClosed mirrors the /templates +// fail-closed check for /org/templates (harden/no-fail-open-auth): a fresh +// install with no bearer / no ADMIN_TOKEN now 401s rather than fail-open. +func TestSecurity_GetOrgTemplates_FreshInstall_FailsClosed(t *testing.T) { setupTestDB(t) setupTestRedis(t) t.Setenv("ADMIN_TOKEN", "") + t.Setenv("MOLECULE_ENV", "") authDB, authMock := newFreshInstallAuthDB(t) tmpDir := t.TempDir() @@ -167,8 +172,8 @@ func TestSecurity_GetOrgTemplates_FreshInstall_FailsOpen(t *testing.T) { req, _ := http.NewRequest(http.MethodGet, "/org/templates", nil) r.ServeHTTP(w, req) - if w.Code != http.StatusOK { - t.Errorf("#686 GET /org/templates fresh-install: want 200 (fail-open), got %d body=%s", w.Code, w.Body.String()) + if w.Code != http.StatusUnauthorized { + t.Errorf("#686 GET /org/templates fresh-install fail-closed: want 401, got %d body=%s", w.Code, w.Body.String()) } if err := authMock.ExpectationsWereMet(); err != nil { t.Errorf("unmet auth mock expectations: %v", err) diff --git a/workspace-server/internal/middleware/devmode.go b/workspace-server/internal/middleware/devmode.go index 51782e829..576fd74a6 100644 --- a/workspace-server/internal/middleware/devmode.go +++ b/workspace-server/internal/middleware/devmode.go @@ -5,61 +5,53 @@ import ( "strings" ) -// Dev-mode escape hatch — factored out of AdminAuth + WorkspaceAuth so a -// future third caller (or a change to what "dev mode" means) touches one -// place. Narrowing the exposed seam also makes it grep-able from security -// reviews: every `isDevModeFailOpen()` call is an intentional fail-open. +// Local-dev environment detection. // -// Why the helper exists at all: on `go run ./cmd/server` the Canvas (at -// localhost:3000) calls the platform (at localhost:8080) cross-port. Both -// `isSameOriginCanvas` (Referer==Host) and the AdminAuth Tier-1 fail-open -// (no tokens in DB) close the moment the user creates their first -// workspace. Without this hatch the Canvas 401s on every /workspaces -// enumeration and every /workspaces/:id/* read until the operator sets -// `ADMIN_TOKEN` and rebuilds the Canvas bundle with a matching -// `NEXT_PUBLIC_ADMIN_TOKEN`. That's too much friction for a local smoke -// test — hence the hatch. +// SECURITY (harden/no-fail-open-auth): this file used to export an auth +// escape hatch — `isDevModeFailOpen()` — that let AdminAuth, WorkspaceAuth, +// and the discovery handler serve admin/workspace-protected endpoints with +// NO bearer token whenever `ADMIN_TOKEN` was unset and `MOLECULE_ENV` was a +// dev value. The CTO directive is "nothing should be fail-open": auth is now +// fail-CLOSED in every environment, dev included. The hatch is GONE. // -// Why it's safe for SaaS: hosted tenants are provisioned with both -// `ADMIN_TOKEN` (a random secret, checked by Tier-2 above) and -// `MOLECULE_ENV=production`. Either one being set makes this helper -// return false, so the fail-open branch is unreachable in production. -// Real token minting goes through AdminAuth, so local development keeps a -// narrow fail-open mode for browser/API smoke tests without an admin secret. +// What remains here is a NON-security predicate, `isLocalDevEnv()`, that +// reports ONLY whether `MOLECULE_ENV` names a local-dev environment. It does +// NOT consult `ADMIN_TOKEN` and it does NOT influence authentication. It is +// used for two convenience/defense-in-depth knobs that never grant access: +// +// - ratelimit.go: relax the per-caller request bucket on a single-user +// local stack (a DoS knob, not a credential — relaxing it cannot expose +// any protected data). +// - cmd/server resolveBindHost(): default the HTTP listener to loopback +// (127.0.0.1) in local dev. This is strictly *safer* than binding all +// interfaces and is unrelated to whether a request is authenticated. +// +// Local dev now stays AUTHENTICATED, not open: scripts/dev-start.sh +// provisions a deterministic `ADMIN_TOKEN` and hands the matching +// `NEXT_PUBLIC_ADMIN_TOKEN` to the Canvas, so the browser sends a real +// bearer. See scripts/dev-start.sh and canvas/src/lib/api.ts. // devModeEnvValues is the set of MOLECULE_ENV values that count as -// "explicit dev mode". Production callers don't set any of these. +// "explicit local dev". Production callers don't set any of these. // Case-insensitive compare via strings.ToLower below. var devModeEnvValues = map[string]struct{}{ "development": {}, "dev": {}, } -// isDevModeFailOpen reports whether the AdminAuth / WorkspaceAuth -// middleware should let a bearer-less request through despite live -// workspace tokens existing in the DB. -// -// True only when BOTH: -// - `ADMIN_TOKEN` is empty (operator has not opted in to the #684 -// closure), AND -// - `MOLECULE_ENV` is explicitly a dev value ("development" / "dev"). -// -// Either condition failing returns false — that's the SaaS safety -// guarantee. Tests: `devmode_test.go` covers every branch. -func isDevModeFailOpen() bool { - if os.Getenv("ADMIN_TOKEN") != "" { - return false - } +// isLocalDevEnv reports whether MOLECULE_ENV names a local-dev environment +// ("development" / "dev"). It carries NO authentication semantics — callers +// must never use it to bypass a credential check. It exists only for +// dev-convenience / defense-in-depth knobs (rate-limit relaxation, loopback +// bind default) that cannot expose protected data. +func isLocalDevEnv() bool { env := strings.ToLower(strings.TrimSpace(os.Getenv("MOLECULE_ENV"))) _, ok := devModeEnvValues[env] return ok } -// IsDevModeFailOpen exposes isDevModeFailOpen to packages outside the -// middleware module (handlers, discovery, etc.) so they can apply the -// same Tier-1b escape hatch their sibling AdminAuth / WorkspaceAuth -// already do. Keep every call site audit-tagged so security review can -// grep them. -func IsDevModeFailOpen() bool { - return isDevModeFailOpen() +// IsLocalDevEnv exposes isLocalDevEnv to packages outside the middleware +// module (cmd/server bind-host default). NON-security: see isLocalDevEnv. +func IsLocalDevEnv() bool { + return isLocalDevEnv() } diff --git a/workspace-server/internal/middleware/devmode_test.go b/workspace-server/internal/middleware/devmode_test.go index 5be40acb6..b2bd8f015 100644 --- a/workspace-server/internal/middleware/devmode_test.go +++ b/workspace-server/internal/middleware/devmode_test.go @@ -4,74 +4,66 @@ import ( "testing" ) -// Unit tests for the isDevModeFailOpen predicate. The AdminAuth and -// WorkspaceAuth middleware tests exercise the same helper indirectly via -// HTTP, but a direct predicate test locks the pure-logic behaviour: -// future callers can add themselves to `devmode.go` with confidence. +// Unit tests for the isLocalDevEnv predicate. +// +// (harden/no-fail-open-auth) This predicate replaced the old +// isDevModeFailOpen() auth escape hatch. It carries NO authentication +// semantics and does NOT consult ADMIN_TOKEN — it reports ONLY whether +// MOLECULE_ENV names a local-dev environment. It gates non-security knobs +// (rate-limit relaxation, loopback bind default). The fail-CLOSED auth +// behaviour is enforced by no_fail_open_test.go. -func TestIsDevModeFailOpen_DevModeNoAdminToken_True(t *testing.T) { +func TestIsLocalDevEnv_Development_True(t *testing.T) { t.Setenv("MOLECULE_ENV", "development") - t.Setenv("ADMIN_TOKEN", "") - if !isDevModeFailOpen() { - t.Error("expected dev mode + no admin token to return true") + if !isLocalDevEnv() { + t.Error("expected MOLECULE_ENV=development to be local dev") } } -func TestIsDevModeFailOpen_DevModeShortAlias_True(t *testing.T) { - // "dev" is a valid alias for "development". +func TestIsLocalDevEnv_ShortAlias_True(t *testing.T) { t.Setenv("MOLECULE_ENV", "dev") - t.Setenv("ADMIN_TOKEN", "") - if !isDevModeFailOpen() { - t.Error("expected MOLECULE_ENV=dev to be treated as dev mode") + if !isLocalDevEnv() { + t.Error("expected MOLECULE_ENV=dev to be treated as local dev") } } -func TestIsDevModeFailOpen_AdminTokenSet_False(t *testing.T) { - // Setting ADMIN_TOKEN is the operator's explicit opt-in to the #684 - // closure. Dev mode must NOT silently override that signal. +func TestIsLocalDevEnv_IgnoresAdminToken(t *testing.T) { + // Decoupled from ADMIN_TOKEN: dev now provisions one, but the bind / + // rate-limit knobs still treat the env as local dev. Crucially this + // predicate grants no access, so the coupling no longer matters. t.Setenv("MOLECULE_ENV", "development") - t.Setenv("ADMIN_TOKEN", "operator-explicitly-set-this") - if isDevModeFailOpen() { - t.Error("explicit ADMIN_TOKEN must suppress the dev-mode hatch") + t.Setenv("ADMIN_TOKEN", "operator-set-this") + if !isLocalDevEnv() { + t.Error("ADMIN_TOKEN must not affect isLocalDevEnv (env-only predicate)") } } -func TestIsDevModeFailOpen_Production_False(t *testing.T) { - // The SaaS-safety guarantee: production tenants always have - // MOLECULE_ENV=production, so the hatch is unreachable even if a - // misconfigured deployment also leaves ADMIN_TOKEN unset. +func TestIsLocalDevEnv_Production_False(t *testing.T) { t.Setenv("MOLECULE_ENV", "production") - t.Setenv("ADMIN_TOKEN", "") - if isDevModeFailOpen() { - t.Error("production must never hit the dev-mode fail-open branch") + if isLocalDevEnv() { + t.Error("production must not count as local dev") } } -func TestIsDevModeFailOpen_CaseInsensitive(t *testing.T) { - // Operators shouldn't have to remember exact casing for a dev-only - // convenience. "Development", "DEV", " dev " all count. +func TestIsLocalDevEnv_CaseInsensitive(t *testing.T) { cases := []string{"Development", "DEVELOPMENT", "Dev", "DEV", " dev "} for _, env := range cases { t.Run(env, func(t *testing.T) { t.Setenv("MOLECULE_ENV", env) - t.Setenv("ADMIN_TOKEN", "") - if !isDevModeFailOpen() { - t.Errorf("MOLECULE_ENV=%q should count as dev mode", env) + if !isLocalDevEnv() { + t.Errorf("MOLECULE_ENV=%q should count as local dev", env) } }) } } -func TestIsDevModeFailOpen_UnknownEnv_False(t *testing.T) { - // Arbitrary / unset MOLECULE_ENV values are NOT treated as dev mode. - // Keeps the fail-open branch narrow — no silent opt-in from a typo. +func TestIsLocalDevEnv_UnknownEnv_False(t *testing.T) { cases := []string{"", "staging", "local", "preview", "test", "devel"} for _, env := range cases { t.Run(env, func(t *testing.T) { t.Setenv("MOLECULE_ENV", env) - t.Setenv("ADMIN_TOKEN", "") - if isDevModeFailOpen() { - t.Errorf("MOLECULE_ENV=%q must not enable fail-open", env) + if isLocalDevEnv() { + t.Errorf("MOLECULE_ENV=%q must not count as local dev", env) } }) } diff --git a/workspace-server/internal/middleware/no_fail_open_test.go b/workspace-server/internal/middleware/no_fail_open_test.go new file mode 100644 index 000000000..0812560e8 --- /dev/null +++ b/workspace-server/internal/middleware/no_fail_open_test.go @@ -0,0 +1,167 @@ +package middleware + +import ( + "net/http" + "net/http/httptest" + "os" + "path/filepath" + "strings" + "testing" + + "github.com/DATA-DOG/go-sqlmock" + "github.com/gin-gonic/gin" +) + +// no_fail_open_test.go is the regression gate for the CTO directive +// "nothing should be fail-open" (branch harden/no-fail-open-auth). +// +// It asserts that AdminAuth and WorkspaceAuth fail CLOSED (401) under the +// EXACT conditions that used to trigger the removed dev-mode fail-open hatch: +// - ADMIN_TOKEN unset, AND +// - MOLECULE_ENV is a dev value ("development" / "dev"), AND +// - any HasAnyLiveTokenGlobal state (0 = fresh install, 1 = post-workspace). +// +// To prove this is RED against the old behaviour: temporarily restore the +// `if isDevModeFailOpen() { c.Next(); return }` short-circuit in +// wsauth_middleware.go (and the Tier-1 `if adminSecret == "" { c.Next() }` +// branch) — every sub-case below flips from 401 to 200 and fails. After the +// hardening, all sub-cases are 401. + +// failOpenConditions enumerates the (MOLECULE_ENV, hasLiveTokens) combinations +// that the removed hatch keyed on. ADMIN_TOKEN is always unset here — that was +// a precondition of the old fail-open. +var failOpenConditions = []struct { + name string + molEnv string + liveCount int +}{ + {"dev_alias_fresh_install", "dev", 0}, + {"dev_alias_post_workspace", "dev", 1}, + {"development_fresh_install", "development", 0}, + {"development_post_workspace", "development", 1}, +} + +func TestAdminAuth_NoFailOpen_UnderOldHatchConditions(t *testing.T) { + for _, tc := range failOpenConditions { + t.Run(tc.name, func(t *testing.T) { + t.Setenv("ADMIN_TOKEN", "") + t.Setenv("MOLECULE_ENV", tc.molEnv) + // Ensure no CP-session path can accidentally pass. + t.Setenv("CP_UPSTREAM_URL", "") + + mockDB, mock, err := sqlmock.New() + if err != nil { + t.Fatalf("sqlmock.New: %v", err) + } + defer mockDB.Close() + + // AdminAuth always probes HasAnyLiveTokenGlobal (for the 503-on- + // outage semantics), so it must be expected for both counts. + mock.ExpectQuery(hasAnyLiveTokenGlobalQuery). + WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(tc.liveCount)) + + r := gin.New() + r.GET("/admin/secrets", AdminAuth(mockDB), func(c *gin.Context) { + c.JSON(http.StatusOK, gin.H{"ok": true}) + }) + + w := httptest.NewRecorder() + req, _ := http.NewRequest(http.MethodGet, "/admin/secrets", nil) + r.ServeHTTP(w, req) + + if w.Code != http.StatusUnauthorized { + t.Errorf("AdminAuth must fail CLOSED under old hatch conditions "+ + "(MOLECULE_ENV=%q, ADMIN_TOKEN unset, liveTokens=%d): expected 401, got %d: %s", + tc.molEnv, tc.liveCount, w.Code, w.Body.String()) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet sqlmock expectations: %v", err) + } + }) + } +} + +func TestWorkspaceAuth_NoFailOpen_UnderOldHatchConditions(t *testing.T) { + for _, tc := range failOpenConditions { + t.Run(tc.name, func(t *testing.T) { + t.Setenv("ADMIN_TOKEN", "") + t.Setenv("MOLECULE_ENV", tc.molEnv) + t.Setenv("CP_UPSTREAM_URL", "") + + // WorkspaceAuth 401s before any DB lookup when there is no + // bearer / cookie, so no queries are expected regardless of + // the nominal live-token count. + mockDB, _, err := sqlmock.New() + if err != nil { + t.Fatalf("sqlmock.New: %v", err) + } + defer mockDB.Close() + + r := gin.New() + r.GET("/workspaces/:id/activity", WorkspaceAuth(mockDB), func(c *gin.Context) { + c.JSON(http.StatusOK, gin.H{"ok": true}) + }) + + w := httptest.NewRecorder() + req, _ := http.NewRequest(http.MethodGet, + "/workspaces/00000000-0000-0000-0000-000000000000/activity", nil) + r.ServeHTTP(w, req) + + if w.Code != http.StatusUnauthorized { + t.Errorf("WorkspaceAuth must fail CLOSED under old hatch conditions "+ + "(MOLECULE_ENV=%q, ADMIN_TOKEN unset): expected 401, got %d: %s", + tc.molEnv, w.Code, w.Body.String()) + } + }) + } +} + +// TestNoFailOpenAuthHelperReexists is a source-guard: it asserts that no +// fail-open auth helper (the removed isDevModeFailOpen / IsDevModeFailOpen) +// has crept back into the middleware package as real code. The replacement +// predicate is the NON-security isLocalDevEnv (bind / rate-limit only); +// re-introducing the old fail-open identifier as a declaration or call is a +// regression of the CTO directive. +// +// It matches the *invocation/declaration* form `isDevModeFailOpen(` (which +// only appears in live code) and deliberately ignores prose mentions in +// `//` comments, so the historical references kept in doc comments don't +// trip the guard. +func TestNoFailOpenAuthHelperReexists(t *testing.T) { + forbidden := []string{"isDevModeFailOpen(", "IsDevModeFailOpen("} + + entries, err := os.ReadDir(".") + if err != nil { + t.Fatalf("ReadDir: %v", err) + } + for _, e := range entries { + name := e.Name() + if e.IsDir() || !strings.HasSuffix(name, ".go") { + continue + } + // Skip this guard file itself (it names the forbidden tokens on + // purpose, including inside a comment). + if name == "no_fail_open_test.go" { + continue + } + data, err := os.ReadFile(filepath.Clean(name)) + if err != nil { + t.Fatalf("ReadFile %s: %v", name, err) + } + for i, line := range strings.Split(string(data), "\n") { + // Ignore single-line comments — historical mentions live there. + code := line + if idx := strings.Index(code, "//"); idx >= 0 { + code = code[:idx] + } + for _, f := range forbidden { + if strings.Contains(code, f) { + t.Errorf("%s:%d uses forbidden fail-open auth helper %q — "+ + "the dev-mode fail-open hatch must stay removed (harden/no-fail-open-auth). "+ + "Use isLocalDevEnv (NON-security) for dev-only knobs instead.", + name, i+1, strings.TrimSuffix(f, "(")) + } + } + } + } +} diff --git a/workspace-server/internal/middleware/ratelimit.go b/workspace-server/internal/middleware/ratelimit.go index e01324d39..cd1821ba4 100644 --- a/workspace-server/internal/middleware/ratelimit.go +++ b/workspace-server/internal/middleware/ratelimit.go @@ -102,15 +102,16 @@ func (rl *RateLimiter) keyFor(c *gin.Context) string { // the priority list and rationale. func (rl *RateLimiter) Middleware() gin.HandlerFunc { return func(c *gin.Context) { - // Tier-1b dev-mode hatch — same gate as AdminAuth / WorkspaceAuth / - // discovery. On a local single-user Docker setup the 600-req/min - // bucket fills fast: a 15-workspace canvas + activity polling + - // approvals polling + A2A overlay + initial hydration all land in - // one bucket (whichever keyFor returns — typically the dev user's - // IP or shared admin token), so a minute of active use can trip - // 429 and blank the page. Gated by MOLECULE_ENV=development + - // empty ADMIN_TOKEN so SaaS production keeps the bucket. - if isDevModeFailOpen() { + // Local-dev rate-limit relaxation (NON-security; see devmode.go). + // On a local single-user stack the 600-req/min bucket fills fast: + // a 15-workspace canvas + activity polling + approvals polling + + // A2A overlay + initial hydration all land in one bucket, so a + // minute of active use can trip 429 and blank the page. This only + // relaxes a DoS knob — it grants no access and is unrelated to + // authentication (auth is fail-closed in every env). Gated solely + // by MOLECULE_ENV=dev/development so SaaS production keeps the + // bucket. Decoupled from ADMIN_TOKEN (dev now provisions one). + if isLocalDevEnv() { c.Header("X-RateLimit-Limit", "unlimited") c.Next() return diff --git a/workspace-server/internal/middleware/wsauth_middleware.go b/workspace-server/internal/middleware/wsauth_middleware.go index 8ffd74405..fa551f1be 100644 --- a/workspace-server/internal/middleware/wsauth_middleware.go +++ b/workspace-server/internal/middleware/wsauth_middleware.go @@ -120,12 +120,12 @@ func WorkspaceAuth(database *sql.DB) gin.HandlerFunc { return } } - // Local-dev escape hatch — see devmode.go. Unreachable on SaaS - // (hosted tenants always have ADMIN_TOKEN + MOLECULE_ENV=production). - if isDevModeFailOpen() { - c.Next() - return - } + // No bearer, no verified CP session: fail CLOSED in EVERY + // environment (harden/no-fail-open-auth). The old local-dev + // escape hatch that let bearer-less requests through when + // ADMIN_TOKEN was unset + MOLECULE_ENV=dev has been removed — + // local dev now authenticates with a provisioned ADMIN_TOKEN + // (see scripts/dev-start.sh). c.AbortWithStatusJSON(http.StatusUnauthorized, gin.H{"error": "missing workspace auth token"}) } } @@ -133,11 +133,18 @@ func WorkspaceAuth(database *sql.DB) gin.HandlerFunc { // AdminAuth returns a Gin middleware for global/admin routes (e.g. // /settings/secrets, /admin/secrets) that have no per-workspace scope. // +// FAIL-CLOSED in every environment (harden/no-fail-open-auth): there is no +// bearer-less path through this middleware. A request reaches the handler +// ONLY by presenting a valid credential (verified CP session cookie, org +// token, ADMIN_TOKEN, or — deprecated — a live workspace token). The former +// "Tier-1 lazy-bootstrap fail-open" (no live tokens + no ADMIN_TOKEN ⇒ pass) +// has been removed: it let an attacker pre-empt the first user by POSTing +// /org/import before any token was minted (C4 SaaS-launch finding). A fresh +// install must set ADMIN_TOKEN to reach admin routes. +// // # Credential tier (evaluated in order) // -// 1. Lazy-bootstrap fail-open: if no live workspace token exists anywhere on -// the platform (fresh install / pre-Phase-30 upgrade), every request passes -// through so existing deployments keep working. +// 1. Verified CP session cookie (SaaS canvas) — upstream-confirmed. // // 2. ADMIN_TOKEN env var (recommended, closes #684): when set, the bearer // MUST equal this value exactly (constant-time comparison). Workspace @@ -163,33 +170,17 @@ func AdminAuth(database *sql.DB) gin.HandlerFunc { ctx := c.Request.Context() adminSecret := os.Getenv("ADMIN_TOKEN") - hasLive, err := wsauth.HasAnyLiveTokenGlobal(ctx, database) - if err != nil { + // (harden/no-fail-open-auth) Both former fail-open branches have + // been REMOVED here: + // - Tier-1 lazy-bootstrap (no live tokens + no ADMIN_TOKEN ⇒ pass) + // - Tier-1b local-dev escape hatch (isDevModeFailOpen ⇒ pass) + // Admin auth is now fail-CLOSED in every environment. We still probe + // HasAnyLiveTokenGlobal so a datastore outage returns a structured + // 503 (not a silent pass), but its result no longer opens any path. + if _, err := wsauth.HasAnyLiveTokenGlobal(ctx, database); err != nil { abortAuthLookupError(c, "AdminAuth: HasAnyLiveTokenGlobal", err) return } - if !hasLive { - // Tier 1: fail-open is ONLY safe when ADMIN_TOKEN is unset - // (self-hosted dev, pre-Phase-30 upgrade). Hosted SaaS always - // sets ADMIN_TOKEN at provision time, and C4 (SaaS-launch - // blocker) showed that without this guard an attacker can - // pre-empt the first user by POSTing /org/import before any - // token gets minted. When ADMIN_TOKEN is set we fall through - // into the same bearer-check path Tier-2 uses below. - if adminSecret == "" { - c.Next() - return - } - } - - // Tier 1b: Local-dev escape hatch — see devmode.go. Lets the - // Canvas dashboard keep working after the first workspace token - // lands in the DB on `go run ./cmd/server`. Unreachable on SaaS - // (hosted tenants always have ADMIN_TOKEN + MOLECULE_ENV=production). - if isDevModeFailOpen() { - c.Next() - return - } // SaaS-canvas path: when the request carries a WorkOS session // cookie AND the CP confirms it's valid, accept without a diff --git a/workspace-server/internal/middleware/wsauth_middleware_test.go b/workspace-server/internal/middleware/wsauth_middleware_test.go index 9b9ba9a99..335ef796d 100644 --- a/workspace-server/internal/middleware/wsauth_middleware_test.go +++ b/workspace-server/internal/middleware/wsauth_middleware_test.go @@ -339,15 +339,24 @@ func TestWorkspaceAuth_WrongWorkspace_Returns401(t *testing.T) { // TestAdminAuth_FailOpen_NoTokensGlobally — C10/C11: on a fresh install (no // live tokens anywhere) the middleware must let the request through so existing // deployments keep working during the Phase-30 rollout. -func TestAdminAuth_FailOpen_NoTokensGlobally(t *testing.T) { +// TestAdminAuth_FreshInstallNoTokens_FailsClosed pins the post-hardening +// contract (harden/no-fail-open-auth): on a fresh install with NO live +// tokens anywhere AND no ADMIN_TOKEN, a bearer-less admin request now 401s. +// The former Tier-1 "lazy-bootstrap fail-open" (no tokens ⇒ 200) is GONE — +// it let an attacker pre-empt the first user via /org/import (C4). A fresh +// install must provision ADMIN_TOKEN to reach admin routes. +func TestAdminAuth_FreshInstallNoTokens_FailsClosed(t *testing.T) { t.Setenv("ADMIN_TOKEN", "") + t.Setenv("MOLECULE_ENV", "") mockDB, mock, err := sqlmock.New() if err != nil { t.Fatalf("sqlmock.New: %v", err) } defer mockDB.Close() - // HasAnyLiveTokenGlobal returns 0 — fresh install. + // HasAnyLiveTokenGlobal returns 0 — fresh install. We still probe it + // (so a DB outage yields a structured 503), but the result no longer + // opens any path. mock.ExpectQuery(hasAnyLiveTokenGlobalQuery). WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(0)) @@ -360,8 +369,8 @@ func TestAdminAuth_FailOpen_NoTokensGlobally(t *testing.T) { req, _ := http.NewRequest(http.MethodGet, "/admin/secrets", nil) r.ServeHTTP(w, req) - if w.Code != http.StatusOK { - t.Errorf("C10 fail-open (no global tokens): expected 200, got %d: %s", w.Code, w.Body.String()) + if w.Code != http.StatusUnauthorized { + t.Errorf("fresh-install no-token fail-closed: expected 401, got %d: %s", w.Code, w.Body.String()) } if err := mock.ExpectationsWereMet(); err != nil { t.Errorf("unmet sqlmock expectations: %v", err) @@ -831,18 +840,23 @@ func TestAdminAuth_Issue180_ApprovalsListing_NoBearer_Returns401(t *testing.T) { } } -// TestAdminAuth_Issue180_ApprovalsListing_FailOpen_NoTokens documents the -// fail-open contract: on a fresh install (no tokens anywhere), the middleware -// must not block the canvas from polling /approvals/pending. -func TestAdminAuth_Issue180_ApprovalsListing_FailOpen_NoTokens(t *testing.T) { +// TestAdminAuth_Issue180_ApprovalsListing_FreshInstall_FailsClosed pins the +// post-hardening contract (harden/no-fail-open-auth): on a fresh install (no +// tokens anywhere, no ADMIN_TOKEN), the canvas polling /approvals/pending with +// no bearer now gets 401. The former #180 fail-open (200 on no-tokens) is gone +// — local dev now provisions an ADMIN_TOKEN and the canvas authenticates with +// it (scripts/dev-start.sh). +func TestAdminAuth_Issue180_ApprovalsListing_FreshInstall_FailsClosed(t *testing.T) { t.Setenv("ADMIN_TOKEN", "") + t.Setenv("MOLECULE_ENV", "") mockDB, mock, err := sqlmock.New() if err != nil { t.Fatalf("sqlmock.New: %v", err) } defer mockDB.Close() - // HasAnyLiveTokenGlobal returns 0 — fresh install, no tokens yet. + // HasAnyLiveTokenGlobal returns 0 — fresh install, no tokens yet. Probed + // for the 503-on-outage semantics, but it opens no path now. mock.ExpectQuery(hasAnyLiveTokenGlobalQuery). WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(0)) @@ -855,24 +869,21 @@ func TestAdminAuth_Issue180_ApprovalsListing_FailOpen_NoTokens(t *testing.T) { req, _ := http.NewRequest(http.MethodGet, "/approvals/pending", nil) r.ServeHTTP(w, req) - if w.Code != http.StatusOK { - t.Errorf("#180 fail-open (no tokens): expected 200, got %d: %s", w.Code, w.Body.String()) + if w.Code != http.StatusUnauthorized { + t.Errorf("#180 fresh-install fail-closed: expected 401, got %d: %s", w.Code, w.Body.String()) } if err := mock.ExpectationsWereMet(); err != nil { t.Errorf("unmet sqlmock expectations: %v", err) } } -// TestWorkspaceAuth_DevModeEscapeHatch_NoBearer_FailsOpen documents the -// local-dev escape hatch on WorkspaceAuth. On `go run ./cmd/server` + -// `npm run dev`, Canvas at localhost:3000 calls the platform at -// localhost:8080 cross-port, so isSameOriginCanvas's Host==Referer -// check fails. Without this hatch the Canvas can't show per-workspace -// activity/delegations. -// -// SaaS never fires this branch because tenant provisioning sets both -// MOLECULE_ENV=production and ADMIN_TOKEN. -func TestWorkspaceAuth_DevModeEscapeHatch_NoBearer_FailsOpen(t *testing.T) { +// TestWorkspaceAuth_DevMode_NoBearer_FailsClosed pins the post-hardening +// contract (harden/no-fail-open-auth): the former local-dev escape hatch on +// WorkspaceAuth — which let a bearer-less request through when +// MOLECULE_ENV=dev + ADMIN_TOKEN unset — is GONE. Under exactly those +// conditions the request now 401s. Local dev authenticates with a +// provisioned ADMIN_TOKEN handed to the Canvas (scripts/dev-start.sh). +func TestWorkspaceAuth_DevMode_NoBearer_FailsClosed(t *testing.T) { t.Setenv("MOLECULE_ENV", "development") t.Setenv("ADMIN_TOKEN", "") @@ -882,7 +893,9 @@ func TestWorkspaceAuth_DevModeEscapeHatch_NoBearer_FailsOpen(t *testing.T) { } defer mockDB.Close() - // No DB queries expected — the hatch short-circuits before any lookup. + // No DB queries expected — WorkspaceAuth 401s before any lookup when + // there is no bearer / cookie. The hatch that used to short-circuit + // here no longer exists. r := gin.New() r.GET("/workspaces/:id/activity", WorkspaceAuth(mockDB), func(c *gin.Context) { @@ -894,8 +907,8 @@ func TestWorkspaceAuth_DevModeEscapeHatch_NoBearer_FailsOpen(t *testing.T) { "/workspaces/00000000-0000-0000-0000-000000000000/activity", nil) r.ServeHTTP(w, req) - if w.Code != http.StatusOK { - t.Errorf("WorkspaceAuth dev-mode hatch: expected 200, got %d: %s", w.Code, w.Body.String()) + if w.Code != http.StatusUnauthorized { + t.Errorf("WorkspaceAuth dev-mode fail-closed: expected 401, got %d: %s", w.Code, w.Body.String()) } } @@ -957,15 +970,14 @@ func TestWorkspaceAuth_DevModeEscapeHatch_IgnoredWhenAdminTokenSet(t *testing.T) } } -// TestAdminAuth_DevModeEscapeHatch_FailsOpenWithHasLiveTokens documents the -// Tier-1b dev-mode escape hatch. When the platform runs with MOLECULE_ENV=development -// and ADMIN_TOKEN is unset, AdminAuth must stay fail-open even after workspace -// tokens land in the DB. This keeps the Canvas dashboard usable in local dev -// after the first workspace is created (PR #1871 — quickstart bugless). -// -// SaaS never hits this path because tenant provisioning sets both -// ADMIN_TOKEN and MOLECULE_ENV=production. -func TestAdminAuth_DevModeEscapeHatch_FailsOpenWithHasLiveTokens(t *testing.T) { +// TestAdminAuth_DevMode_NoBearer_FailsClosed pins the post-hardening contract +// (harden/no-fail-open-auth): the former Tier-1b dev-mode escape hatch — which +// let AdminAuth pass a bearer-less request when MOLECULE_ENV=dev + ADMIN_TOKEN +// unset, even with live tokens in the DB — is GONE. Under exactly those +// conditions the request now 401s. Local dev authenticates with a provisioned +// ADMIN_TOKEN handed to the Canvas as NEXT_PUBLIC_ADMIN_TOKEN +// (scripts/dev-start.sh). +func TestAdminAuth_DevMode_NoBearer_FailsClosed(t *testing.T) { t.Setenv("MOLECULE_ENV", "development") t.Setenv("ADMIN_TOKEN", "") @@ -976,7 +988,7 @@ func TestAdminAuth_DevModeEscapeHatch_FailsOpenWithHasLiveTokens(t *testing.T) { defer mockDB.Close() // HasAnyLiveTokenGlobal returns 1 — tokens exist (post first-workspace). - // The Tier-1 fail-open branch WOULD close here. Tier-1b must still open. + // Probed for the 503-on-outage semantics, but it opens no path now. mock.ExpectQuery(hasAnyLiveTokenGlobalQuery). WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(1)) @@ -989,8 +1001,8 @@ func TestAdminAuth_DevModeEscapeHatch_FailsOpenWithHasLiveTokens(t *testing.T) { req, _ := http.NewRequest(http.MethodGet, "/workspaces", nil) r.ServeHTTP(w, req) - if w.Code != http.StatusOK { - t.Errorf("dev-mode escape hatch: expected 200, got %d: %s", w.Code, w.Body.String()) + if w.Code != http.StatusUnauthorized { + t.Errorf("dev-mode fail-closed: expected 401, got %d: %s", w.Code, w.Body.String()) } if err := mock.ExpectationsWereMet(); err != nil { t.Errorf("unmet sqlmock expectations: %v", err) -- 2.52.0 From 6f56b1fa3084ca242c8313588ad6f59fa7dbc175 Mon Sep 17 00:00:00 2001 From: core-devops Date: Fri, 5 Jun 2026 01:17:59 -0700 Subject: [PATCH 2/2] harden(security): eliminate the two RETAINED fail-open paths (CanvasOrBearer + discovery) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The prior pass (#2291) made AdminAuth/WorkspaceAuth fail-closed but RETAINED two fail-open patterns 'as a cosmetic tradeoff'. The CTO directive 'nothing should be fail-open' is ABSOLUTE, so this pass removes them too. ZERO fail-open paths now remain anywhere in workspace-server auth. CanvasOrBearer (workspace-server/internal/middleware/wsauth_middleware.go): - DB-error fail-open (`if err != nil { log; c.Next() }`) → now 503 fail-CLOSED via abortAuthLookupError (availability tradeoff, NO access). - lazy-bootstrap fail-open (`if !hasLive { c.Next() }`) → REMOVED. A zero-token install no longer passes EVERYTHING; bootstrap is via ADMIN_TOKEN (dev-start.sh provisions it for local dev; operator/SaaS sets it in prod — local mimics production). - forgeable cross-origin Origin-match pass (canvasOriginAllowed) → REMOVED. A no-bearer request passing purely on a spoofable Origin is effectively open even for a cosmetic route. The canvas now always sends a bearer (NEXT_PUBLIC_ADMIN_TOKEN), so nothing legitimate relied on it. The non-forgeable same-origin path (isSameOriginCanvas, gated by CANVAS_PROXY_URL) is kept. Helper + its 2 unit tests removed. validateDiscoveryCaller (workspace-server/internal/handlers/discovery.go): - DB-error fail-open (`if err != nil { return nil }`) → now writes 503 and returns a non-nil error (caller already `if err != nil { return }`). Bootstrap: ADMIN_TOKEN is the first-token credential (AdminAuth accepts it); documented in docs/runbooks/admin-auth.md (fail-closed everywhere; MOLECULE_ENV no longer gates any auth decision). quickstart.md already covered this. Tests: - no_fail_open_test.go: extended with CanvasOrBearer fail-closed cases (401 zero-token, 503 DB-error). discovery_test.go: added TestPeers/Discover_AuthProbeDBError_FailsClosed (503). - Flipped the stale assertions: CanvasOrBearer NoTokens/CanvasOrigin/DBError now assert fail-closed; removed canvasOriginAllowed tests. - tests/e2e/test_dev_mode.sh: repurposed from 'dev-mode fail-open works' to 'dev-mode is fail-CLOSED' (401 no-bearer, 200 with dev ADMIN_TOKEN). - Seeded the HasAnyLiveToken auth probe (grandfather count=0) in ~13 pre- existing discovery handler-body tests that previously relied on the fail-open swallowing the unmatched probe query. Watch-it-fail: restoring each removed branch turns the matching gate test RED (verified for all three: CanvasOrBearer lazy-bootstrap, CanvasOrBearer DB-error, discovery DB-error), reverting → green. go build ./..., go vet, and full go test ./... (46 pkgs) all green. Co-Authored-By: Claude Opus 4.8 (1M context) --- docs/runbooks/admin-auth.md | 36 +++++- tests/e2e/test_dev_mode.sh | 117 +++++++++++------- .../handlers/cross_tenant_isolation_test.go | 7 ++ .../internal/handlers/discovery.go | 19 ++- .../internal/handlers/discovery_test.go | 106 ++++++++++++++++ .../handlers/handlers_additional_test.go | 6 + .../handlers/handlers_extended_test.go | 6 + .../internal/middleware/no_fail_open_test.go | 78 ++++++++++++ .../internal/middleware/wsauth_middleware.go | 93 +++++++------- .../wsauth_middleware_canvasorbearer_test.go | 23 ++-- .../middleware/wsauth_middleware_test.go | 59 +++++---- 11 files changed, 417 insertions(+), 133 deletions(-) diff --git a/docs/runbooks/admin-auth.md b/docs/runbooks/admin-auth.md index 9158a6d63..ce30a9dee 100644 --- a/docs/runbooks/admin-auth.md +++ b/docs/runbooks/admin-auth.md @@ -1,5 +1,29 @@ # Admin Authentication Runbook +## Auth is fail-CLOSED in every environment — `ADMIN_TOKEN` is the bootstrap credential + +Per the CTO "nothing should be fail-open" directive, **every** auth path on the +workspace-server fails closed — there is no dev-mode / zero-token / DB-outage +hatch that grants access. This includes: + +- `AdminAuth` and `WorkspaceAuth` (admin + per-workspace routes), +- `CanvasOrBearer` (the cosmetic `PUT /canvas/viewport` route), and +- `validateDiscoveryCaller` (`/registry/:id/peers`, `/registry/discover/:id`). + +Consequence for **bootstrap**: a brand-new self-hosted / dev install has **no +DB-backed tokens yet**, and there is no longer a fail-open that lets the first +request through. The **only** way to reach admin routes (and to mint the first +workspace token via `POST /admin/workspaces/:id/tokens`) is to set `ADMIN_TOKEN` +in the platform environment and present it as the bearer. This is the "local +mimics production" principle: there is no zero-config bootstrap. + +- **Local dev:** `scripts/dev-start.sh` provisions a deterministic + `ADMIN_TOKEN` into `.env` (and exports the matching `NEXT_PUBLIC_ADMIN_TOKEN` + so the canvas authenticates with it). See `docs/quickstart.md`. +- **Self-hosted / SaaS:** set `ADMIN_TOKEN` to a strong random secret + (`openssl rand -base64 32`) in the platform env and bake the matching + `NEXT_PUBLIC_ADMIN_TOKEN` into the canvas bundle. + ## Required: set `MOLECULE_ENV` in all non-dev environments ```bash @@ -7,8 +31,10 @@ MOLECULE_ENV=production ``` -This matches the production tenant default and disables development-only -shortcuts. Staging and production smoke tests should use the real user/API +This matches the production tenant default. NOTE: `MOLECULE_ENV` no longer gates +any auth decision — it only drives NON-security local-dev conveniences (loopback +bind, relaxed rate limit). Setting it to `dev`/`development` does **not** relax +authentication. Staging and production smoke tests should use the real user/API workflow: create a workspace, then mint a one-time displayed workspace bearer with `POST /admin/workspaces/:id/tokens`. @@ -23,5 +49,7 @@ The platform uses `ADMIN_TOKEN` as the bearer credential for admin-gated endpoin | `POST /org/import` | `Authorization: Bearer ` | | `POST /admin/workspaces/:id/tokens` | `Authorization: Bearer `; plaintext token returned once | -Missing or invalid `ADMIN_TOKEN` → AdminAuth fails open in dev mode (no token set), or -returns 401 in production mode (token set but invalid). +Missing or invalid bearer → **401 in every environment** (fail-closed; no +dev-mode fail-open). If the auth datastore is unreachable, auth-gated routes +return **503** (`platform_unavailable`) — an availability tradeoff that grants no +access — rather than allowing the request through. diff --git a/tests/e2e/test_dev_mode.sh b/tests/e2e/test_dev_mode.sh index e7914c7a3..024534d52 100755 --- a/tests/e2e/test_dev_mode.sh +++ b/tests/e2e/test_dev_mode.sh @@ -1,24 +1,30 @@ #!/usr/bin/env bash -# E2E regression suite for the local-dev escape hatches added in -# fix/quickstart-bugless. These cover the exact user-facing breakages -# that dropped out of the partial squash-merge of PR #1871: +# E2E regression suite asserting that "dev mode" is fail-CLOSED. # -# 1. GET /workspaces returns 200 with no bearer after tokens exist in -# the DB — exercises the AdminAuth Tier-1b dev-mode hatch -# (middleware/devmode.go::isDevModeFailOpen). -# 2. GET /workspaces/:id/activity returns 200 with no bearer — the -# same hatch applied to WorkspaceAuth. -# 3. POST /workspaces/:id/a2a doesn't 502-SSRF on a loopback workspace -# URL — exercises handlers/ssrf.go::devModeAllowsLoopback. -# 4. GET /org/templates returns the curated set populated by -# clone-manifest.sh — exercises infra/scripts/setup.sh + the -# ListTemplates failure logging in handlers/org.go. +# History: this file used to assert the local-dev fail-open escape hatches +# (GET /workspaces 200 with NO bearer, /workspaces/:id/activity 200 with no +# bearer) added in fix/quickstart-bugless. Under the CTO "nothing should be +# fail-open" directive (harden/no-fail-open-auth) those hatches were REMOVED: +# auth is fail-CLOSED in EVERY environment, local dev included. This suite now +# pins the inverse contract — bearer-less admin/workspace requests 401, and the +# SAME requests with the dev ADMIN_TOKEN bearer succeed. # -# Requires: platform running on :8080 with MOLECULE_ENV=development and -# ADMIN_TOKEN unset. Matches the README quickstart env. +# What it verifies: +# 1. GET /workspaces 401s with NO bearer once tokens exist (was: 200 via the +# removed AdminAuth Tier-1b dev-mode hatch); 200 WITH the admin bearer. +# 2. GET /workspaces/:id/activity (and /delegations, /approvals/pending) 401 +# with no bearer (was: 200 via the WorkspaceAuth hatch); 200 WITH bearer. +# 3. GET /org/templates returns the curated set populated by clone-manifest.sh +# (unauth-readable bootstrap surface — unchanged). +# +# Requires: platform running on :8080 with MOLECULE_ENV=development AND +# ADMIN_TOKEN set (the dev value), with MOLECULE_ADMIN_TOKEN (or +# ADMIN_TOKEN) exported here so the suite can present the bearer. +# scripts/dev-start.sh provisions ADMIN_TOKEN locally; the e2e-api CI +# job sets it on the platform and exports the matching bearer. # # Usage: -# bash tests/e2e/test_dev_mode.sh +# MOLECULE_ADMIN_TOKEN=dev-local-admin-token bash tests/e2e/test_dev_mode.sh set -euo pipefail # shellcheck source=_lib.sh @@ -46,35 +52,44 @@ check_http() { fi } -echo "=== Dev-mode escape-hatch regression tests ===" +echo "=== Dev-mode fail-CLOSED regression tests ===" echo "" -# Pre-test: ensure MOLECULE_ENV=development and no ADMIN_TOKEN are in the -# platform's env. The request path doesn't let us read the platform's -# env directly, but we can verify the hatch is active by confirming the -# expected behaviour under the conditions the test otherwise sets up. +# The platform is fail-closed in every environment now, so the suite MUST have +# the admin bearer to drive the authenticated (200) assertions. Without it we +# cannot create / clean up workspaces — bail loudly rather than silently skip. +ADMIN_BEARER="${MOLECULE_ADMIN_TOKEN:-${ADMIN_TOKEN:-}}" +if [ -z "$ADMIN_BEARER" ]; then + echo "FAIL: MOLECULE_ADMIN_TOKEN/ADMIN_TOKEN not set — auth is fail-closed in" + echo " every environment, so this suite needs the dev ADMIN_TOKEN bearer." + echo " e.g. MOLECULE_ADMIN_TOKEN=dev-local-admin-token bash $0" + exit 1 +fi +ADMIN_AUTH=(-H "Authorization: Bearer $ADMIN_BEARER") e2e_cleanup_all_workspaces # ---------------------------------------------------------------------- -# Section 1 — AdminAuth dev-mode hatch +# Section 1 — AdminAuth is fail-CLOSED (dev-mode hatch removed) # ---------------------------------------------------------------------- -# Before fix: once any workspace had tokens in the DB, GET /workspaces -# closed to unauthenticated callers and the Canvas broke. The hatch -# keeps it open specifically in dev mode. - -echo "--- Section 1: AdminAuth dev-mode hatch ---" +echo "--- Section 1: AdminAuth fail-closed ---" +# No bearer → 401 in dev mode (the removed hatch used to return 200). R=$(curl -s -o /dev/null -w "%{http_code}" "$BASE/workspaces") -check_http "GET /workspaces (empty DB)" "200" "$R" +check_http "GET /workspaces (no bearer) is fail-CLOSED" "401" "$R" -# Create a workspace so tokens land in the DB. +# With the dev admin bearer → 200. +R=$(curl -s -o /dev/null -w "%{http_code}" "$BASE/workspaces" "${ADMIN_AUTH[@]}") +check_http "GET /workspaces (with admin bearer)" "200" "$R" + +# Create a workspace (authenticated) so tokens land in the DB. R=$(curl -s -w "\n%{http_code}" -X POST "$BASE/workspaces" \ + "${ADMIN_AUTH[@]}" \ -H "Content-Type: application/json" \ -d '{"name":"Dev-Mode-Test","tier":1,"runtime":"external","external":true}') CODE=$(echo "$R" | tail -n1) BODY=$(echo "$R" | sed '$d') -check_http "POST /workspaces (create)" "201" "$CODE" +check_http "POST /workspaces (create, with admin bearer)" "201" "$CODE" WS_ID=$(echo "$BODY" | python3 -c "import sys,json; print(json.load(sys.stdin).get('id',''))" 2>/dev/null || true) if [ -z "$WS_ID" ]; then @@ -83,43 +98,55 @@ if [ -z "$WS_ID" ]; then exit 1 fi -# Ensure a real workspace token exists so AdminAuth now sees a live token. On -# pre-fix builds the next /workspaces call would 401 — on post-fix it -# must stay 200 because MOLECULE_ENV=development + ADMIN_TOKEN unset. +# Ensure a real workspace token exists so AdminAuth sees a live token globally. TOKEN=$(echo "$BODY" | e2e_extract_token) if [ -z "$TOKEN" ]; then e2e_mint_workspace_token "$WS_ID" >/dev/null fi +# With tokens now in the DB, the bearer-less call STILL 401s (no lazy-bootstrap +# / dev-mode fall-through), and the authenticated call still 200s. R=$(curl -s -o /dev/null -w "%{http_code}" "$BASE/workspaces") -check_http "GET /workspaces (after token minted, no bearer)" "200" "$R" +check_http "GET /workspaces (after token minted, no bearer) is fail-CLOSED" "401" "$R" + +R=$(curl -s -o /dev/null -w "%{http_code}" "$BASE/workspaces" "${ADMIN_AUTH[@]}") +check_http "GET /workspaces (after token minted, with admin bearer)" "200" "$R" # ---------------------------------------------------------------------- -# Section 2 — WorkspaceAuth dev-mode hatch +# Section 2 — WorkspaceAuth is fail-CLOSED (dev-mode hatch removed) # ---------------------------------------------------------------------- -# Before fix: /workspaces/:id/activity 401'd once tokens existed — -# the Canvas side panel's chat history load broke. - echo "" -echo "--- Section 2: WorkspaceAuth dev-mode hatch ---" +echo "--- Section 2: WorkspaceAuth fail-closed ---" +# No bearer → 401 (the removed hatch used to return 200). R=$(curl -s -o /dev/null -w "%{http_code}" \ "$BASE/workspaces/$WS_ID/activity?type=a2a_receive&limit=50") -check_http "GET /workspaces/:id/activity (no bearer)" "200" "$R" +check_http "GET /workspaces/:id/activity (no bearer) is fail-CLOSED" "401" "$R" R=$(curl -s -o /dev/null -w "%{http_code}" \ "$BASE/workspaces/$WS_ID/delegations") -check_http "GET /workspaces/:id/delegations (no bearer)" "200" "$R" +check_http "GET /workspaces/:id/delegations (no bearer) is fail-CLOSED" "401" "$R" R=$(curl -s -o /dev/null -w "%{http_code}" "$BASE/approvals/pending") -check_http "GET /approvals/pending (no bearer)" "200" "$R" +check_http "GET /approvals/pending (no bearer) is fail-CLOSED" "401" "$R" + +# Same requests WITH the admin bearer → 200. +R=$(curl -s -o /dev/null -w "%{http_code}" \ + "$BASE/workspaces/$WS_ID/activity?type=a2a_receive&limit=50" "${ADMIN_AUTH[@]}") +check_http "GET /workspaces/:id/activity (with admin bearer)" "200" "$R" + +R=$(curl -s -o /dev/null -w "%{http_code}" \ + "$BASE/workspaces/$WS_ID/delegations" "${ADMIN_AUTH[@]}") +check_http "GET /workspaces/:id/delegations (with admin bearer)" "200" "$R" + +R=$(curl -s -o /dev/null -w "%{http_code}" "$BASE/approvals/pending" "${ADMIN_AUTH[@]}") +check_http "GET /approvals/pending (with admin bearer)" "200" "$R" # ---------------------------------------------------------------------- # Section 3 — Template registry populated by setup.sh # ---------------------------------------------------------------------- -# Before fix: setup.sh didn't run clone-manifest.sh so the template -# palette was empty and the molecule-dev in-tree copy was broken. - +# GET /org/templates is an unauthenticated bootstrap surface (the template +# palette must render before the user has a credential) — unchanged. echo "" echo "--- Section 3: Template registry ---" diff --git a/workspace-server/internal/handlers/cross_tenant_isolation_test.go b/workspace-server/internal/handlers/cross_tenant_isolation_test.go index 8289ddf10..23ff72026 100644 --- a/workspace-server/internal/handlers/cross_tenant_isolation_test.go +++ b/workspace-server/internal/handlers/cross_tenant_isolation_test.go @@ -68,6 +68,10 @@ func TestPeers_CrossTenant_OrgRootNotLeaked(t *testing.T) { caller := "org-a-root" // parent_id IS NULL — an org root for tenant A + // validateDiscoveryCaller probes HasAnyLiveToken(:id) first; grandfather. + // (Unordered match is set above, so this can be consumed at any point.) + seedDiscoveryGrandfather(mock, caller) + // parent_id lookup → NULL (caller is an org root) mock.ExpectQuery("SELECT parent_id FROM workspaces WHERE id ="). WithArgs(caller). @@ -128,6 +132,9 @@ func TestPeers_SameOrg_SiblingsStillWork(t *testing.T) { caller := "org-a-child-1" parent := "org-a-root" + // validateDiscoveryCaller probes HasAnyLiveToken(:id) first; grandfather. + seedDiscoveryGrandfather(mock, caller) + mock.ExpectQuery("SELECT parent_id FROM workspaces WHERE id ="). WithArgs(caller). WillReturnRows(sqlmock.NewRows([]string{"parent_id"}).AddRow(parent)) diff --git a/workspace-server/internal/handlers/discovery.go b/workspace-server/internal/handlers/discovery.go index e57788e2b..c81f71855 100644 --- a/workspace-server/internal/handlers/discovery.go +++ b/workspace-server/internal/handlers/discovery.go @@ -422,15 +422,22 @@ func (h *DiscoveryHandler) CheckAccess(c *gin.Context) { // workspaces with tokens must present a matching Bearer, token binding // is strict (A's token cannot authenticate caller B). // -// Fail-open on DB hiccups. Unlike secrets.Values (which returns plaintext -// secrets and must fail closed), discovery only exposes peer URLs that -// are already behind the existing `CanCommunicate` hierarchy check — a -// momentary DB outage shouldn't take agent-to-agent discovery offline. +// (harden/no-fail-open-auth) Fails CLOSED on DB error. This used to return nil +// (allow) on a HasAnyLiveToken hiccup "because discovery only exposes peer URLs +// already behind CanCommunicate" — but the CTO "nothing fail-open" directive is +// absolute, and a request must never gain access because the auth datastore is +// unreachable. A datastore error now writes 503 (availability tradeoff that +// grants NO access) and returns a non-nil error; the caller already does +// `if err != nil { return }` so the 503 body is what the client sees. func validateDiscoveryCaller(ctx context.Context, c *gin.Context, workspaceID string) error { hasLive, err := wsauth.HasAnyLiveToken(ctx, db.DB, workspaceID) if err != nil { - log.Printf("wsauth: discovery HasAnyLiveToken(%s) failed: %v — allowing request", workspaceID, err) - return nil + log.Printf("wsauth: discovery HasAnyLiveToken(%s): datastore lookup failed (returning 503): %v", workspaceID, err) + c.JSON(http.StatusServiceUnavailable, gin.H{ + "error": "platform datastore unavailable — retry shortly", + "code": "platform_unavailable", + }) + return errors.New("auth datastore unavailable") } if !hasLive { return nil // legacy / pre-upgrade diff --git a/workspace-server/internal/handlers/discovery_test.go b/workspace-server/internal/handlers/discovery_test.go index 001271059..b4bb9a38e 100644 --- a/workspace-server/internal/handlers/discovery_test.go +++ b/workspace-server/internal/handlers/discovery_test.go @@ -49,6 +49,10 @@ func TestDiscover_WorkspaceNotFound_WithCaller(t *testing.T) { setupTestRedis(t) handler := NewDiscoveryHandler() + // validateDiscoveryCaller probes HasAnyLiveToken(callerID) first; + // grandfather (count=0) so the bearer-less request is allowed through. + seedDiscoveryGrandfather(mock, "ws-caller") + // CanCommunicate will need DB lookups — both workspace name lookups // For the access check: caller lookup succeeds, target lookup fails mock.ExpectQuery("SELECT id, parent_id FROM workspaces WHERE id ="). @@ -113,6 +117,9 @@ func TestPeers_WithParent(t *testing.T) { setupTestRedis(t) handler := NewDiscoveryHandler() + // validateDiscoveryCaller probes HasAnyLiveToken(:id) first; grandfather. + seedDiscoveryGrandfather(mock, "ws-sibling-1") + // Expect parent_id lookup for the requesting workspace mock.ExpectQuery("SELECT parent_id FROM workspaces WHERE id ="). WithArgs("ws-sibling-1"). @@ -165,6 +172,9 @@ func TestPeers_NotFound(t *testing.T) { setupTestRedis(t) handler := NewDiscoveryHandler() + // validateDiscoveryCaller probes HasAnyLiveToken(:id) first; grandfather. + seedDiscoveryGrandfather(mock, "ws-ghost") + // Workspace not found mock.ExpectQuery("SELECT parent_id FROM workspaces WHERE id ="). WithArgs("ws-ghost"). @@ -191,6 +201,11 @@ func TestPeers_DBError(t *testing.T) { setupTestRedis(t) handler := NewDiscoveryHandler() + // Auth probe grandfathers; this test targets a DB error on the + // *handler-body* parent_id query → 500 (distinct from the auth-probe + // DB error which now fails closed with 503). + seedDiscoveryGrandfather(mock, "ws-dberr") + mock.ExpectQuery("SELECT parent_id FROM workspaces WHERE id ="). WithArgs("ws-dberr"). WillReturnError(sql.ErrConnDone) @@ -216,6 +231,9 @@ func TestPeers_RootWorkspace_NoPeers(t *testing.T) { setupTestRedis(t) handler := NewDiscoveryHandler() + // validateDiscoveryCaller probes HasAnyLiveToken(:id) first; grandfather. + seedDiscoveryGrandfather(mock, "ws-root-alone") + // Root workspace (parent_id is NULL) mock.ExpectQuery("SELECT parent_id FROM workspaces WHERE id ="). WithArgs("ws-root-alone"). @@ -270,6 +288,9 @@ func peersFilterFixture(t *testing.T) (*DiscoveryHandler, sqlmock.Sqlmock) { mock := setupTestDB(t) setupTestRedis(t) + // validateDiscoveryCaller probes HasAnyLiveToken(:id) first; grandfather. + seedDiscoveryGrandfather(mock, "ws-self") + mock.ExpectQuery("SELECT parent_id FROM workspaces WHERE id ="). WithArgs("ws-self"). WillReturnRows(sqlmock.NewRows([]string{"parent_id"}).AddRow("ws-pm")) @@ -947,6 +968,24 @@ func peersAuthFixtureHasLiveToken(mock sqlmock.Sqlmock, workspaceID string) { WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(1)) } +// seedDiscoveryGrandfather seeds the FIRST query validateDiscoveryCaller +// issues (HasAnyLiveToken → 0 = legacy / pre-upgrade) so a bearer-less +// discovery request grandfathers through and the test can exercise the +// handler body. +// +// (harden/no-fail-open-auth) Before this branch, validateDiscoveryCaller +// returned nil (allow) when the HasAnyLiveToken probe ERRORED — so these +// handler-body tests never had to seed the probe at all; the unmatched +// COUNT query erred and the fail-open swallowed it. Now that the DB-error +// path fails CLOSED (503), the probe must be seeded explicitly. count=0 is +// the legitimate grandfather path (no live tokens for this workspace yet), +// which is what these pre-existing tests intend. +func seedDiscoveryGrandfather(mock sqlmock.Sqlmock, workspaceID string) { + mock.ExpectQuery("SELECT COUNT.+workspace_auth_tokens"). + WithArgs(workspaceID). + WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(0)) +} + func TestPeers_DevMode_BearerlessRequest_FailsClosed(t *testing.T) { // (harden/no-fail-open-auth) Exact old-hatch conditions: // MOLECULE_ENV=development AND ADMIN_TOKEN empty, with a live token in @@ -1025,6 +1064,70 @@ func TestPeers_DevModeFailOpen_ClosedInProduction(t *testing.T) { } } +// TestPeers_AuthProbeDBError_FailsClosed pins the removal of +// validateDiscoveryCaller's fail-open-on-DB-error branch +// (harden/no-fail-open-auth). When the HasAnyLiveToken auth probe ERRORS, the +// request must NOT be allowed through — it now returns 503 (availability +// tradeoff that grants NO access). Before this branch the function returned nil +// (allow) on a DB hiccup, so the request reached the peer queries. +// +// Watch-it-fail: restore `if err != nil { log; return nil }` in +// validateDiscoveryCaller → this flips 503→(200/handler path) and fails. +func TestPeers_AuthProbeDBError_FailsClosed(t *testing.T) { + mock := setupTestDB(t) + setupTestRedis(t) + handler := NewDiscoveryHandler() + + // The FIRST query validateDiscoveryCaller issues (HasAnyLiveToken) errors. + // No further expectations: a fail-closed 503 must be written before the + // peer-list queries run. + mock.ExpectQuery("SELECT COUNT.+workspace_auth_tokens"). + WithArgs("ws-dberr-auth"). + WillReturnError(sql.ErrConnDone) + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Params = gin.Params{{Key: "id", Value: "ws-dberr-auth"}} + c.Request = httptest.NewRequest("GET", "/registry/ws-dberr-auth/peers", nil) + + handler.Peers(c) + + if w.Code != http.StatusServiceUnavailable { + t.Fatalf("auth-probe DB error must fail CLOSED: expected 503, got %d: %s", w.Code, w.Body.String()) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet sqlmock expectations: %v", err) + } +} + +// TestDiscover_AuthProbeDBError_FailsClosed is the Discover-endpoint companion +// to TestPeers_AuthProbeDBError_FailsClosed: a HasAnyLiveToken error on the +// caller's discovery request fails CLOSED with 503 (was: fail-open allow). +func TestDiscover_AuthProbeDBError_FailsClosed(t *testing.T) { + mock := setupTestDB(t) + setupTestRedis(t) + handler := NewDiscoveryHandler() + + mock.ExpectQuery("SELECT COUNT.+workspace_auth_tokens"). + WithArgs("ws-caller"). + WillReturnError(sql.ErrConnDone) + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Params = gin.Params{{Key: "id", Value: "ws-target"}} + c.Request = httptest.NewRequest("GET", "/registry/discover/ws-target", nil) + c.Request.Header.Set("X-Workspace-ID", "ws-caller") + + handler.Discover(c) + + if w.Code != http.StatusServiceUnavailable { + t.Fatalf("Discover auth-probe DB error must fail CLOSED: expected 503, got %d: %s", w.Code, w.Body.String()) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet sqlmock expectations: %v", err) + } +} + // ==================== Peers — #383 self never appears in result ==================== // TestPeers_ExcludeSelf_DefenseInDepth verifies the final-line filter in @@ -1047,6 +1150,9 @@ func TestPeers_ExcludeSelf_DefenseInDepth(t *testing.T) { const selfID = "ws-xiaodong" + // validateDiscoveryCaller probes HasAnyLiveToken(:id) first; grandfather. + seedDiscoveryGrandfather(mock, selfID) + // parent_id lookup — workspace has a parent. mock.ExpectQuery("SELECT parent_id FROM workspaces WHERE id ="). WithArgs(selfID). diff --git a/workspace-server/internal/handlers/handlers_additional_test.go b/workspace-server/internal/handlers/handlers_additional_test.go index 902c06cf7..82af0bd6d 100644 --- a/workspace-server/internal/handlers/handlers_additional_test.go +++ b/workspace-server/internal/handlers/handlers_additional_test.go @@ -551,6 +551,9 @@ func TestDiscover_AccessDenied(t *testing.T) { setupTestRedis(t) handler := NewDiscoveryHandler() + // validateDiscoveryCaller probes HasAnyLiveToken(callerID) first; grandfather. + seedDiscoveryGrandfather(mock, "ws-child-a") + // CanCommunicate: different parents → denied mock.ExpectQuery("SELECT id, parent_id FROM workspaces WHERE id ="). WithArgs("ws-child-a"). @@ -582,6 +585,9 @@ func TestDiscover_TargetOffline(t *testing.T) { setupTestRedis(t) handler := NewDiscoveryHandler() + // validateDiscoveryCaller probes HasAnyLiveToken(callerID) first; grandfather. + seedDiscoveryGrandfather(mock, "ws-caller") + // Share a parent so communication is allowed under post-#1955 rules sharedParent := "ws-parent" mock.ExpectQuery("SELECT id, parent_id FROM workspaces WHERE id ="). diff --git a/workspace-server/internal/handlers/handlers_extended_test.go b/workspace-server/internal/handlers/handlers_extended_test.go index 0e0fecaa3..423cbf04d 100644 --- a/workspace-server/internal/handlers/handlers_extended_test.go +++ b/workspace-server/internal/handlers/handlers_extended_test.go @@ -373,6 +373,9 @@ func TestExtended_DiscoverWithCallerID(t *testing.T) { setupTestRedis(t) handler := NewDiscoveryHandler() + // validateDiscoveryCaller probes HasAnyLiveToken(callerID) first; grandfather. + seedDiscoveryGrandfather(mock, "ws-caller") + // CanCommunicate needs to look up both workspaces // Share a parent so communication is allowed under post-#1955 rules sharedParent := "ws-parent" @@ -464,6 +467,9 @@ func TestExtended_Peers(t *testing.T) { setupTestRedis(t) handler := NewDiscoveryHandler() + // validateDiscoveryCaller probes HasAnyLiveToken(:id) first; grandfather. + seedDiscoveryGrandfather(mock, "ws-peer") + // Expect parent_id lookup for requesting workspace (root-level, no parent) mock.ExpectQuery("SELECT parent_id FROM workspaces WHERE id ="). WithArgs("ws-peer"). diff --git a/workspace-server/internal/middleware/no_fail_open_test.go b/workspace-server/internal/middleware/no_fail_open_test.go index 0812560e8..bfb8b4b99 100644 --- a/workspace-server/internal/middleware/no_fail_open_test.go +++ b/workspace-server/internal/middleware/no_fail_open_test.go @@ -116,6 +116,84 @@ func TestWorkspaceAuth_NoFailOpen_UnderOldHatchConditions(t *testing.T) { } } +// TestCanvasOrBearer_NoFailOpen_UnderOldHatchConditions is the regression gate +// for the two fail-open branches removed from CanvasOrBearer +// (harden/no-fail-open-auth, "nothing fail-open" pass 2): +// +// (a) lazy-bootstrap pass: `if !hasLive { c.Next(); return }` — a zero-token +// install used to pass EVERYTHING through. Now a bearer-less request on a +// fresh install (HasAnyLiveTokenGlobal → 0) fails CLOSED with 401. +// (b) fail-open-on-DB-error: `if err != nil { log; c.Next(); return }` — a +// HasAnyLiveTokenGlobal error used to ALLOW. Now it fails CLOSED with 503. +// +// Watch-it-fail: restore either short-circuit in CanvasOrBearer and the +// matching sub-case flips (401→200 / 503→200) and fails. +func TestCanvasOrBearer_NoFailOpen_UnderOldHatchConditions(t *testing.T) { + // (a) Fresh install (0 live tokens), no bearer, no ADMIN_TOKEN → 401. + t.Run("zero_token_install_no_bearer_fails_closed_401", func(t *testing.T) { + t.Setenv("ADMIN_TOKEN", "") + t.Setenv("CORS_ORIGINS", "") + + mockDB, mock, err := sqlmock.New() + if err != nil { + t.Fatalf("sqlmock.New: %v", err) + } + defer mockDB.Close() + + mock.ExpectQuery(hasAnyLiveTokenGlobalQuery). + WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(0)) + + handlerCalled := false + r := gin.New() + r.PUT("/canvas/viewport", CanvasOrBearer(mockDB), func(c *gin.Context) { + handlerCalled = true + c.JSON(http.StatusOK, gin.H{"ok": true}) + }) + + w := httptest.NewRecorder() + req, _ := http.NewRequest(http.MethodPut, "/canvas/viewport", nil) + r.ServeHTTP(w, req) + + if w.Code != http.StatusUnauthorized { + t.Errorf("CanvasOrBearer lazy-bootstrap fail-open removed: zero-token install must 401, got %d: %s", + w.Code, w.Body.String()) + } + if handlerCalled { + t.Error("handler reached on a fresh-install bearer-less request — lazy-bootstrap fail-open not removed") + } + }) + + // (b) Auth datastore error → 503 (NOT allow). + t.Run("db_error_fails_closed_503", func(t *testing.T) { + mockDB, mock, err := sqlmock.New() + if err != nil { + t.Fatalf("sqlmock.New: %v", err) + } + defer mockDB.Close() + + mock.ExpectQuery(hasAnyLiveTokenGlobalQuery). + WillReturnError(http.ErrAbortHandler) // any non-nil error suffices + + handlerCalled := false + r := gin.New() + r.PUT("/canvas/viewport", CanvasOrBearer(mockDB), func(c *gin.Context) { + handlerCalled = true + c.JSON(http.StatusOK, gin.H{"ok": true}) + }) + + w := httptest.NewRecorder() + req, _ := http.NewRequest(http.MethodPut, "/canvas/viewport", nil) + r.ServeHTTP(w, req) + + if w.Code != http.StatusServiceUnavailable { + t.Errorf("CanvasOrBearer DB-error fail-open removed: must 503, got %d: %s", w.Code, w.Body.String()) + } + if handlerCalled { + t.Error("handler reached on a datastore-error request — DB-error fail-open not removed") + } + }) +} + // TestNoFailOpenAuthHelperReexists is a source-guard: it asserts that no // fail-open auth helper (the removed isDevModeFailOpen / IsDevModeFailOpen) // has crept back into the middleware package as real code. The replacement diff --git a/workspace-server/internal/middleware/wsauth_middleware.go b/workspace-server/internal/middleware/wsauth_middleware.go index fa551f1be..9407a4e7d 100644 --- a/workspace-server/internal/middleware/wsauth_middleware.go +++ b/workspace-server/internal/middleware/wsauth_middleware.go @@ -272,34 +272,46 @@ func cpSessionActor(cookieHeader string) string { // Accepts either: // // 1. A valid bearer token (same contract as AdminAuth) — covers molecli, -// agent-to-platform calls, and anyone using the API directly. -// 2. A browser Origin header that matches CORS_ORIGINS (canvas itself). -// This is NOT a strict auth boundary — curl can forge Origin — but for -// cosmetic-only routes the trade-off is acceptable. Non-cosmetic routes -// MUST NOT use this middleware (see #194 review on why it would re-open -// #164 CRITICAL if applied to /bundles/import). +// agent-to-platform calls, the browser canvas (which now sends +// Authorization: Bearer $NEXT_PUBLIC_ADMIN_TOKEN on every platform +// call — see canvas/src/lib/api.ts platformAuthHeaders), and anyone +// using the API directly. +// 2. A same-origin canvas request (Referer/Host match), but ONLY when the +// combined-tenant canvas proxy is active (CANVAS_PROXY_URL set). This is +// a real same-origin check the browser cannot forge cross-origin (see +// isSameOriginCanvas / IsVerifiedCanvasSession, #623/#194) — NOT the +// trivially-forgeable cross-origin Origin header. The forgeable +// CORS_ORIGINS Origin-match path was REMOVED under the CTO +// "nothing fail-open" directive (a no-bearer request passing purely on a +// spoofable Origin is effectively open even for a cosmetic route, and is +// no longer needed now that the canvas always sends a bearer). // -// Lazy-bootstrap fail-open preserved: zero-token installs pass everything -// through so fresh self-hosted / dev sessions aren't bricked. +// Non-cosmetic routes MUST NOT use this middleware (see #194 review on why it +// would re-open #164 CRITICAL if applied to /bundles/import). +// +// (harden/no-fail-open-auth) Two former fail-open branches are REMOVED: +// - DB-error on HasAnyLiveTokenGlobal used to `c.Next()` (allow); it now +// fails CLOSED with 503 (availability tradeoff that grants NO access). +// - The lazy-bootstrap pass (`!hasLive ⇒ c.Next()`) used to let a +// zero-token install through EVERYTHING; it is gone. Bootstrap is now via +// ADMIN_TOKEN (provisioned by scripts/dev-start.sh for local dev, +// operator/SaaS-set in production) — local mimics production. func CanvasOrBearer(database *sql.DB) gin.HandlerFunc { return func(c *gin.Context) { ctx := c.Request.Context() - hasLive, err := wsauth.HasAnyLiveTokenGlobal(ctx, database) - if err != nil { - log.Printf("wsauth: CanvasOrBearer HasAnyLiveTokenGlobal failed: %v — allowing request", err) - c.Next() - return - } - if !hasLive { - c.Next() + // Probe global token state for the (no-bearer) same-origin path + // below. Fail CLOSED on a datastore error — an availability tradeoff + // that does NOT grant access (was: log + c.Next() fail-open). + if _, err := wsauth.HasAnyLiveTokenGlobal(ctx, database); err != nil { + abortAuthLookupError(c, "CanvasOrBearer: HasAnyLiveTokenGlobal", err) return } // Path 1: bearer present → bearer MUST validate. Do not fall through - // to Origin on an invalid bearer — an attacker with a revoked / - // expired token + a matching Origin would otherwise bypass auth. - // Empty bearer → skip to Origin path (canvas never sends one). + // to the same-origin path on an invalid bearer — an attacker with a + // revoked / expired token would otherwise bypass auth. + // Empty bearer → fall to the same-origin canvas path. if tok := wsauth.BearerTokenFromHeader(c.GetHeader("Authorization")); tok != "" { // Admin token accepted for canvas dashboard adminSecret := os.Getenv("ADMIN_TOKEN") @@ -315,13 +327,10 @@ func CanvasOrBearer(database *sql.DB) gin.HandlerFunc { return } - // Path 2: canvas origin match (cross-origin canvas). - if canvasOriginAllowed(c.GetHeader("Origin")) { - c.Next() - return - } - - // Path 3: same-origin canvas (tenant image). + // Path 2: same-origin canvas (combined-tenant image). Gated behind + // canvasProxyActive (CANVAS_PROXY_URL) and a non-forgeable + // Referer/Host same-origin check — NOT the spoofable cross-origin + // Origin header (that path was removed, see doc comment above). if isSameOriginCanvas(c) { c.Next() return @@ -331,30 +340,14 @@ func CanvasOrBearer(database *sql.DB) gin.HandlerFunc { } } -// canvasOriginAllowed returns true if origin matches any entry in the -// CORS_ORIGINS env var (comma-separated) or the localhost defaults. -// Exact-match only; no prefix or wildcard logic — that's handled by the -// real CORS middleware upstream. The intent here is "did this request come -// from the canvas page the user is already logged into?" — a binary check. -func canvasOriginAllowed(origin string) bool { - if origin == "" { - return false - } - allowed := []string{"http://localhost:3000", "http://localhost:3001"} - if v := os.Getenv("CORS_ORIGINS"); v != "" { - for _, o := range strings.Split(v, ",") { - if o = strings.TrimSpace(o); o != "" { - allowed = append(allowed, o) - } - } - } - for _, a := range allowed { - if a == origin { - return true - } - } - return false -} +// (harden/no-fail-open-auth) canvasOriginAllowed was REMOVED. It matched a +// request's (trivially forgeable, cross-origin) Origin header against +// CORS_ORIGINS and was the basis of CanvasOrBearer's no-bearer Origin-match +// pass — effectively open to any curl that sets a matching Origin. Under the +// CTO "nothing fail-open" directive that path is gone; the canvas now always +// sends a bearer (NEXT_PUBLIC_ADMIN_TOKEN), so nothing legitimate relied on it. +// The CORS *response-header* allowlist is handled by the real CORS middleware +// upstream, unaffected by this removal. // isSameOriginCanvas returns true when the request appears to come from the // canvas UI served by the same Go process (tenant image). In this topology, diff --git a/workspace-server/internal/middleware/wsauth_middleware_canvasorbearer_test.go b/workspace-server/internal/middleware/wsauth_middleware_canvasorbearer_test.go index f086c7d76..0d503c944 100644 --- a/workspace-server/internal/middleware/wsauth_middleware_canvasorbearer_test.go +++ b/workspace-server/internal/middleware/wsauth_middleware_canvasorbearer_test.go @@ -143,11 +143,15 @@ func TestCanvasOrBearer_AdminTokenEnv_Passes(t *testing.T) { } } -// TestCanvasOrBearer_DBError_FailOpen pins the documented behavior on a -// HasAnyLiveTokenGlobal failure. The middleware logs and falls open so a -// flaky DB doesn't lock canvas users out of cosmetic routes. Hardcoded in -// the comment block; this is a reminder if anyone changes that semantic. -func TestCanvasOrBearer_DBError_FailOpen(t *testing.T) { +// TestCanvasOrBearer_DBError_FailsClosed pins the removal of the +// fail-open-on-DB-error branch (harden/no-fail-open-auth). A +// HasAnyLiveTokenGlobal failure used to log + c.Next() (allow); it now fails +// CLOSED with 503 — an availability tradeoff that grants NO access. The +// handler must NOT be reached. +// +// Watch-it-fail: restore `if err != nil { log; c.Next(); return }` in +// CanvasOrBearer → this flips 503→200 and fails. +func TestCanvasOrBearer_DBError_FailsClosed(t *testing.T) { mockDB, mock, err := sqlmock.New() if err != nil { t.Fatalf("sqlmock: %v", err) @@ -156,8 +160,10 @@ func TestCanvasOrBearer_DBError_FailOpen(t *testing.T) { mock.ExpectQuery(hasAnyLiveTokenGlobalQuery). WillReturnError(http.ErrAbortHandler) // any non-nil error suffices + handlerCalled := false r := gin.New() r.PUT("/canvas/viewport", CanvasOrBearer(mockDB), func(c *gin.Context) { + handlerCalled = true c.JSON(http.StatusOK, gin.H{"ok": true}) }) @@ -165,8 +171,11 @@ func TestCanvasOrBearer_DBError_FailOpen(t *testing.T) { req, _ := http.NewRequest(http.MethodPut, "/canvas/viewport", nil) r.ServeHTTP(w, req) - if w.Code != http.StatusOK { - t.Errorf("DB error fail-open: got %d, want 200 (%s)", w.Code, w.Body.String()) + if w.Code != http.StatusServiceUnavailable { + t.Errorf("DB error must fail CLOSED: got %d, want 503 (%s)", w.Code, w.Body.String()) + } + if handlerCalled { + t.Error("handler reached on a datastore-error request — DB-error fail-open not removed") } } diff --git a/workspace-server/internal/middleware/wsauth_middleware_test.go b/workspace-server/internal/middleware/wsauth_middleware_test.go index 335ef796d..e8af81d86 100644 --- a/workspace-server/internal/middleware/wsauth_middleware_test.go +++ b/workspace-server/internal/middleware/wsauth_middleware_test.go @@ -1116,7 +1116,16 @@ func TestAdminAuth_Issue120_PatchWorkspace_NoBearer_Returns401(t *testing.T) { // Accepts bearer or a matching Origin header. MUST NOT be used anywhere a // forged request would leak data or create resources. -func TestCanvasOrBearer_NoTokens_FailOpen(t *testing.T) { +// TestCanvasOrBearer_NoTokens_FailsClosed pins the removal of the +// lazy-bootstrap fail-open (harden/no-fail-open-auth): a zero-token install +// must NOT pass everything through. A bearer-less request on a fresh install +// (HasAnyLiveTokenGlobal → 0) now 401s. Bootstrap is via ADMIN_TOKEN +// (scripts/dev-start.sh provisions it for local dev; operator/SaaS sets it in +// production) — not a zero-config fail-open. +// +// Watch-it-fail: restore `if !hasLive { c.Next(); return }` in CanvasOrBearer +// → this flips 401→200 and fails. +func TestCanvasOrBearer_NoTokens_FailsClosed(t *testing.T) { mockDB, mock, err := sqlmock.New() if err != nil { t.Fatalf("sqlmock: %v", err) @@ -1126,8 +1135,10 @@ func TestCanvasOrBearer_NoTokens_FailOpen(t *testing.T) { mock.ExpectQuery(hasAnyLiveTokenGlobalQuery). WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(0)) + handlerCalled := false r := gin.New() r.PUT("/canvas/viewport", CanvasOrBearer(mockDB), func(c *gin.Context) { + handlerCalled = true c.JSON(http.StatusOK, gin.H{"ok": true}) }) @@ -1135,8 +1146,11 @@ func TestCanvasOrBearer_NoTokens_FailOpen(t *testing.T) { req, _ := http.NewRequest(http.MethodPut, "/canvas/viewport", nil) r.ServeHTTP(w, req) - if w.Code != http.StatusOK { - t.Errorf("bootstrap fail-open: got %d, want 200 (%s)", w.Code, w.Body.String()) + if w.Code != http.StatusUnauthorized { + t.Errorf("zero-token install must fail CLOSED (lazy-bootstrap fail-open removed): got %d, want 401 (%s)", w.Code, w.Body.String()) + } + if handlerCalled { + t.Error("handler reached on a fresh-install bearer-less request — lazy-bootstrap fail-open not removed") } } @@ -1207,7 +1221,16 @@ func TestCanvasOrBearer_TokensExist_WrongOrigin_Returns401(t *testing.T) { } } -func TestCanvasOrBearer_TokensExist_CanvasOrigin_Passes(t *testing.T) { +// TestCanvasOrBearer_TokensExist_ForgeableOrigin_NoBearer_FailsClosed pins the +// removal of the cross-origin Origin-match cosmetic path +// (harden/no-fail-open-auth). A no-bearer request whose forgeable Origin header +// matches CORS_ORIGINS used to pass; it now 401s. The canvas always sends a +// bearer (NEXT_PUBLIC_ADMIN_TOKEN), so legitimate traffic is unaffected, and a +// curl that forges Origin can no longer reach even a cosmetic route. +// +// Watch-it-fail: restore `if canvasOriginAllowed(c.GetHeader("Origin")) { +// c.Next(); return }` in CanvasOrBearer → this flips 401→200 and fails. +func TestCanvasOrBearer_TokensExist_ForgeableOrigin_NoBearer_FailsClosed(t *testing.T) { mockDB, mock, err := sqlmock.New() if err != nil { t.Fatalf("sqlmock: %v", err) @@ -1219,18 +1242,24 @@ func TestCanvasOrBearer_TokensExist_CanvasOrigin_Passes(t *testing.T) { t.Setenv("CORS_ORIGINS", "https://acme.moleculesai.app,https://bob.moleculesai.app") + handlerCalled := false r := gin.New() r.PUT("/canvas/viewport", CanvasOrBearer(mockDB), func(c *gin.Context) { + handlerCalled = true c.JSON(http.StatusOK, gin.H{"ok": true}) }) w := httptest.NewRecorder() req, _ := http.NewRequest(http.MethodPut, "/canvas/viewport", nil) + // A matching-but-forgeable Origin with NO bearer must NOT pass anymore. req.Header.Set("Origin", "https://acme.moleculesai.app") r.ServeHTTP(w, req) - if w.Code != http.StatusOK { - t.Errorf("canvas origin: got %d, want 200 (%s)", w.Code, w.Body.String()) + if w.Code != http.StatusUnauthorized { + t.Errorf("no-bearer request on a forgeable matching Origin must fail CLOSED (Origin-match path removed): got %d, want 401 (%s)", w.Code, w.Body.String()) + } + if handlerCalled { + t.Error("handler reached on a no-bearer forgeable-Origin request — Origin-match fail-open not removed") } } @@ -1310,21 +1339,9 @@ func TestCanvasOrBearer_WrongOrigin_Blocked(t *testing.T) { } } -func TestCanvasOriginAllowed_EmptyOriginRejected(t *testing.T) { - if canvasOriginAllowed("") { - t.Error("empty Origin must not pass") - } -} - -func TestCanvasOriginAllowed_LocalhostDefault(t *testing.T) { - t.Setenv("CORS_ORIGINS", "") - if !canvasOriginAllowed("http://localhost:3000") { - t.Error("localhost:3000 should be allowed by default") - } - if canvasOriginAllowed("http://evil.example.com") { - t.Error("random origin should not be allowed") - } -} +// (harden/no-fail-open-auth) TestCanvasOriginAllowed_* were REMOVED along with +// the canvasOriginAllowed helper they exercised — the forgeable cross-origin +// Origin-match cosmetic path no longer exists in CanvasOrBearer. // ── Issue #623 regression ───────────────────────────────────────────────────── // AdminAuth must NOT accept forged Origin headers. Any container on the Docker -- 2.52.0