From 5a3dbb95e1c94b11bad7eaa57426e6c931490193 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Sat, 25 Apr 2026 23:49:28 -0700 Subject: [PATCH] fix(api): probe /cp/auth/me before redirecting on 401 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The actual cause-fix for the staging-tabs E2E saga (#2073/#2074/#2075). Old behaviour: ANY 401 from any fetch on a SaaS tenant subdomain called redirectToLogin → window.location.href = AuthKit. This is wrong. Plenty of 401s don't mean "session is dead": - workspace-scoped endpoints (/workspaces/:id/peers, /plugins) require a workspace-scoped token, not the tenant admin bearer - resource-permission mismatches (user has tenant access but not this specific workspace) - misconfigured proxies returning 401 spuriously A single transient one of those yanked authenticated users back to AuthKit. Same bug yanked the staging-tabs E2E off the tenant origin mid-test for 6+ hours tonight, leading to the cascade of test-side mocks (#2073/#2074/#2075) that worked around the symptom without fixing the cause. This PR fixes it at the source. The new logic: - 401 on /cp/auth/* path → that IS the canonical session-dead signal → redirect (unchanged) - 401 on any other path with slug present → probe /cp/auth/me: probe 401 → session genuinely dead → redirect probe 200 → session fine, endpoint refused this token → throw a real Error, caller renders error state probe network err → assume session-fine (conservative) → throw real Error - slug empty (localhost / LAN / reserved subdomain) → throw without redirect (unchanged) The probe adds one extra fetch on a 401, only when slug is set and the path isn't already auth-scoped. That's rare and worthwhile — a transient probe round-trip is cheap; an unwanted auth redirect is a UX disaster. Tests: - api-401.test.ts rewritten with the full matrix: * /cp/auth/me 401 → redirect (no probe, that IS the signal) * non-auth 401 + probe 401 → redirect * non-auth 401 + probe 200 → throw, no redirect ← the fix * non-auth 401 + probe network err → throw, no redirect * empty slug paths (localhost/LAN/reserved) → throw, no probe - 43 tests in canvas/src/lib/__tests__/api*.test.ts all pass - tsc clean The staging-tabs E2E spec's universal-401 route handler stays as defense-in-depth (silences resource-load console noise + guards against panels without try/catch), but the comment now describes its role honestly: api.ts is the primary fix, the route is the safety net. Co-Authored-By: Claude Opus 4.7 (1M context) --- canvas/e2e/staging-tabs.spec.ts | 39 ++++++----- canvas/src/lib/__tests__/api-401.test.ts | 87 +++++++++++++++++++----- canvas/src/lib/api.ts | 48 ++++++++++--- 3 files changed, 129 insertions(+), 45 deletions(-) diff --git a/canvas/e2e/staging-tabs.spec.ts b/canvas/e2e/staging-tabs.spec.ts index 5c5273f6..bfc788ce 100644 --- a/canvas/e2e/staging-tabs.spec.ts +++ b/canvas/e2e/staging-tabs.spec.ts @@ -87,27 +87,30 @@ test.describe("staging canvas tabs", () => { }), ); - // Universal 401 → empty-200 fallback for any fetch. + // Universal 401 → empty-200 fallback (defense-in-depth). // - // The narrow first pass (#2073, scoped to /workspaces//*) didn't - // catch all the redirect triggers — SkillsTab.tsx alone fetches - // /plugins and /plugins/sources outside the /workspaces/ tree, and - // each of those 401s with the tenant admin bearer in SaaS mode. - // canvas/src/lib/api.ts:62-74 calls `redirectToLogin` on ANY 401, - // so a single non-workspace-scoped 401 yanks the page off the - // tenant origin and breaks every locator that runs after. + // The original product bug was canvas/src/lib/api.ts:62-74 calling + // `redirectToLogin` on EVERY 401 — a single workspace-scoped 401 + // (e.g. /workspaces/:id/peers, /plugins) yanked the user (and the + // test) to AuthKit. That's now fixed at the source: api.ts probes + // /cp/auth/me before redirecting, so a 401 from a non-auth path + // with a live session throws a regular error instead. // - // Broaden the route to ALL fetches: pass-through real responses, - // swap 401s for 200 + empty body. Skip `/cp/auth/me` and the - // tenant-origin HTML/JS bundle requests (resourceType !== fetch); - // those are already handled or shouldn't be intercepted. + // This route handler stays as a SAFETY NET, not the primary + // defense: + // 1. It silences resource-load console noise from the browser + // (those messages don't include the URL — useless in + // diagnostics, captured by the filter in the assertion + // block but having no 401s reach the network is cleaner). + // 2. It guards against panels that DON'T have try/catch around + // their api calls — an unhandled rejection would surface + // as console.error → fail the assertion. Panels SHOULD + // handle errors, but until they're all audited, this is + // the test's belt to api.ts's braces. // - // For tab-render tests we don't need real data — the gate is - // "panel mounts without crashing, no Failed-to-load toast". Body - // shape is best-effort by URL: list endpoints (paths not ending - // in a UUID-shaped segment) get `[]`; single-resource endpoints - // get `{}`. Both are valid JSON; well-written panels render an - // empty state for either rather than throwing. + // Pass-through real responses; swap 401s for 200 + empty body. + // Skip /cp/auth/me (mocked above) and non-fetch resources + // (HTML/JS/CSS bundles that should NOT be intercepted). await context.route("**", async (route, request) => { if (request.resourceType() !== "fetch") { return route.fallback(); diff --git a/canvas/src/lib/__tests__/api-401.test.ts b/canvas/src/lib/__tests__/api-401.test.ts index b3589d12..ad41af35 100644 --- a/canvas/src/lib/__tests__/api-401.test.ts +++ b/canvas/src/lib/__tests__/api-401.test.ts @@ -6,32 +6,44 @@ import { describe, it, expect, beforeEach, afterEach, vi } from "vitest"; // runs happily in node. Splitting keeps the node tests fast. // --------------------------------------------------------------------------- -// 401 handling — gated on SaaS-tenant hostname +// 401 handling — session-probe-before-redirect // --------------------------------------------------------------------------- // -// Before fix/quickstart-bugless, any 401 from any endpoint triggered -// `redirectToLogin()`, navigating to `/cp/auth/login`. That route -// exists only on SaaS (mounted by cp_proxy when CP_UPSTREAM_URL is -// set). On localhost / self-hosted / Vercel preview it 404s, so the -// user lands on a broken login page instead of seeing the actual error. +// History: +// 1. fix/quickstart-bugless: gated redirect on SaaS hostname (slug). +// 2. fix/api-401-probe-before-redirect (this file): probe /cp/auth/me +// before redirecting on a 401 from a non-auth path. The earlier +// behaviour redirected on EVERY 401, so a single 401 from +// /workspaces/:id/plugins (workspace-scoped — refused by the +// tenant admin bearer) yanked the user to AuthKit even when +// the session was fine. The probe lets us tell "session dead" +// from "endpoint refused this token." // -// These tests lock in: -// - SaaS tenant hostname (*.moleculesai.app) → 401 still redirects. -// - non-SaaS hostname (localhost, LAN IP, apex) → 401 throws, no -// redirect, so the caller renders a real error affordance. +// Matrix: +// slug | path | probe → me | expected +// --- | --- | --- | --- +// acme | /cp/auth/me | (n/a) | redirect (path IS auth) +// acme | /workspaces/... | 401 | redirect (session dead) +// acme | /workspaces/... | 200 | throw, no redirect +// acme | /workspaces/... | network err| throw, no redirect +// "" | /workspaces/... | (n/a) | throw, no redirect (no slug) const mockFetch = vi.fn(); globalThis.fetch = mockFetch; -function mockFailure(status: number, text: string) { +function mockNextResponse(status: number, text = "") { mockFetch.mockResolvedValueOnce({ - ok: false, + ok: status >= 200 && status < 300, status, json: () => Promise.reject(new Error("no json")), text: () => Promise.resolve(text), } as unknown as Response); } +function mockNextNetworkError() { + mockFetch.mockRejectedValueOnce(new Error("network")); +} + function setHostname(host: string) { Object.defineProperty(window, "location", { configurable: true, @@ -59,27 +71,66 @@ describe("api 401 handling", () => { vi.resetModules(); }); - it("redirects to login on SaaS tenant hostname", async () => { + it("redirects when /cp/auth/me itself 401s — that IS the session-dead signal", async () => { setHostname("acme.moleculesai.app"); - mockFailure(401, '{"error":"admin auth required"}'); + // Single fetch: the /cp/auth/me call itself. + mockNextResponse(401, '{"error":"unauthenticated"}'); const { api } = await import("../api"); - await expect(api.get("/workspaces")).rejects.toThrow(/Session expired/); + await expect(api.get("/cp/auth/me")).rejects.toThrow(/Session expired/); expect(redirectSpy).toHaveBeenCalledWith("sign-in"); + // No probe fired — we already know the session is dead. + expect(mockFetch).toHaveBeenCalledTimes(1); + }); + + it("redirects when /cp/auth/me probe ALSO 401s — session genuinely dead", async () => { + setHostname("acme.moleculesai.app"); + // First call: the workspace-scoped fetch returns 401. + mockNextResponse(401, '{"error":"workspace token required"}'); + // Second call: the probe to /cp/auth/me also 401s. + mockNextResponse(401, '{"error":"unauthenticated"}'); + + const { api } = await import("../api"); + await expect(api.get("/workspaces/abc/plugins")).rejects.toThrow(/Session expired/); + expect(redirectSpy).toHaveBeenCalledWith("sign-in"); + }); + + it("does NOT redirect when probe returns 200 — endpoint refused this token, session fine", async () => { + setHostname("acme.moleculesai.app"); + // First call: workspace-scoped 401. + mockNextResponse(401, '{"error":"workspace token required"}'); + // Second call: probe shows the session is alive. + mockNextResponse(200, '{"user_id":"u1","org_id":"o1","email":"x@y"}'); + + const { api } = await import("../api"); + await expect(api.get("/workspaces/abc/plugins")).rejects.toThrow(/401/); + expect(redirectSpy).not.toHaveBeenCalled(); + }); + + it("does NOT redirect when probe network-errors — conservative fallback", async () => { + setHostname("acme.moleculesai.app"); + mockNextResponse(401, '{"error":"workspace token required"}'); + mockNextNetworkError(); + + const { api } = await import("../api"); + await expect(api.get("/workspaces/abc/plugins")).rejects.toThrow(/401/); + expect(redirectSpy).not.toHaveBeenCalled(); }); it("does NOT redirect on localhost — throws a real error instead", async () => { setHostname("localhost"); - mockFailure(401, '{"error":"admin auth required"}'); + mockNextResponse(401, '{"error":"admin auth required"}'); const { api } = await import("../api"); await expect(api.get("/workspaces")).rejects.toThrow(/401/); expect(redirectSpy).not.toHaveBeenCalled(); + // No slug → no probe fires either. + expect(mockFetch).toHaveBeenCalledTimes(1); }); it("does NOT redirect on a LAN hostname", async () => { setHostname("192.168.1.74"); - mockFailure(401, '{"error":"missing workspace auth token"}'); + mockNextResponse(401, '{"error":"missing workspace auth token"}'); const { api } = await import("../api"); await expect(api.get("/workspaces/abc/activity")).rejects.toThrow(/401/); @@ -91,7 +142,7 @@ describe("api 401 handling", () => { // Users landing on app.moleculesai.app (pre-tenant-selection) must // see the real 401 error rather than loop on login. setHostname("app.moleculesai.app"); - mockFailure(401, '{"error":"admin auth required"}'); + mockNextResponse(401, '{"error":"admin auth required"}'); const { api } = await import("../api"); await expect(api.get("/workspaces")).rejects.toThrow(/401/); diff --git a/canvas/src/lib/api.ts b/canvas/src/lib/api.ts index e65d92fd..edd1d696 100644 --- a/canvas/src/lib/api.ts +++ b/canvas/src/lib/api.ts @@ -60,15 +60,45 @@ async function request( return request(method, path, body, retryCount + 1, options); } if (res.status === 401) { - // Session expired or credentials lost. On SaaS (tenant subdomain) - // the login page lives at /cp/auth/login and is mounted by the - // control-plane reverse proxy — redirect. On self-hosted / local - // dev / Vercel preview there IS no /cp/* mount, so redirecting - // would navigate to a 404 ("404 page not found") instead of the - // real error the user should see. In that case, throw instead - // and let the caller render a meaningful failure (retry button, - // error banner, etc.). - if (slug) { + // Distinguish "session is dead" from "this endpoint refused this + // token." Old behaviour blanket-redirected on every 401, so a + // single transient 401 from a workspace-scoped endpoint + // (/workspaces/:id/peers, /plugins, etc. that need a workspace + // token rather than the tenant admin bearer) yanked the user + // back to AuthKit even when their session was perfectly fine. + // That broke the staging-tabs E2E for the entire 2026-04-25 + // night; #2073/#2074 worked around the symptom in the test by + // mocking 401→200 for every fetch, but the user-facing bug + // stayed. + // + // The canonical "session is dead" signal is /cp/auth/me + // returning 401. For any 401 on a non-auth path, probe + // /cp/auth/me before deciding to redirect: + // - probe 401 → session is actually dead → redirect + // - probe 200 → session is fine, the endpoint just refused + // our specific token → throw a real error, + // caller renders an error state + // - probe network error → assume session-fine (conservative; + // better to throw than to redirect on a + // transient probe failure) + // + // Self-hosted / localhost / reserved subdomains still throw + // without redirecting (slug is empty in those cases) — same + // policy as before. + const isAuthPath = path.startsWith("/cp/auth/"); + let sessionDead = isAuthPath; + if (!isAuthPath && slug) { + try { + const probe = await fetch(`${PLATFORM_URL}/cp/auth/me`, { + credentials: "include", + signal: AbortSignal.timeout(5000), + }); + sessionDead = probe.status === 401; + } catch { + // Probe failed (network/timeout) — fall through to throw. + } + } + if (sessionDead && slug) { const { redirectToLogin } = await import("./auth"); redirectToLogin("sign-in"); throw new Error("Session expired — redirecting to login");