From 5e36c6638cd37e45f574d52b25d433e8ee6a3138 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Sun, 26 Apr 2026 00:01:56 -0700 Subject: [PATCH] feat(platform,canvas): classify "datastore unavailable" as 503 + dedicated UI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit User reported the canvas threw a generic "API GET /workspaces: 500 {auth check failed}" error when local Postgres + Redis were both down. Two problems: 1. The error code (500) and message ("auth check failed") said nothing useful. The actual condition was "platform can't reach its datastore to validate your token" — a Service Unavailable class, not Internal Server Error. 2. The canvas had no way to distinguish infra-down from a real auth bug, so it rendered the raw API string in the same generic-error overlay it uses for everything. Fix in two layers: Server (wsauth_middleware.go): - New abortAuthLookupError helper centralises all three sites that previously returned `500 {"error":"auth check failed"}` when HasAnyLiveTokenGlobal or orgtoken.Validate hit a DB error. - Now returns 503 + structured body `{"error": "...", "code": "platform_unavailable"}`. 503 is the correct semantic ("retry shortly, infra is unavailable") and the code field is the contract the canvas reads. - Body deliberately excludes the underlying DB error string — production hostnames / connection-string fragments must not leak into a user-visible error toast. Canvas (api.ts): - New PlatformUnavailableError class. api.ts inspects 503 responses for the platform_unavailable code and throws the typed error instead of the generic "API GET /…: 503 …" message. Generic 503s (upstream-busy, etc.) keep the legacy path so existing busy-retry UX isn't disrupted. Canvas (page.tsx): - New PlatformDownDiagnostic component renders when the initial hydration catches PlatformUnavailableError. 
Surfaces the actual condition with operator-actionable copy ("brew services start postgresql@14 / redis") + pointer to the platform log + a Reload button. Tests: - Go: TestAdminAuth_DatastoreError_Returns503PlatformUnavailable pins the response shape (status, code field, no DB-error leak) - Canvas: 5 tests for PlatformUnavailableError classification — typed throw on 503+code match, generic-Error fallback for 503-without-code (upstream busy), 500 stays generic, non-JSON body falls back to generic. 1015 canvas tests + full Go middleware suite pass. Co-Authored-By: Claude Opus 4.7 (1M context) --- canvas/src/app/page.tsx | 57 ++++++++++- canvas/src/lib/__tests__/api.test.ts | 98 ++++++++++++++++++- canvas/src/lib/api.ts | 28 ++++++ .../internal/middleware/wsauth_middleware.go | 33 +++++-- .../middleware/wsauth_middleware_test.go | 57 +++++++++++ 5 files changed, 264 insertions(+), 9 deletions(-) diff --git a/canvas/src/app/page.tsx b/canvas/src/app/page.tsx index 74291409..e64b5aba 100644 --- a/canvas/src/app/page.tsx +++ b/canvas/src/app/page.tsx @@ -7,13 +7,19 @@ import { CommunicationOverlay } from "@/components/CommunicationOverlay"; import { Spinner } from "@/components/Spinner"; import { connectSocket, disconnectSocket } from "@/store/socket"; import { useCanvasStore } from "@/store/canvas"; -import { api } from "@/lib/api"; +import { api, PlatformUnavailableError } from "@/lib/api"; import type { WorkspaceData } from "@/store/socket"; export default function Home() { const hydrationError = useCanvasStore((s) => s.hydrationError); const setHydrationError = useCanvasStore((s) => s.setHydrationError); const [hydrating, setHydrating] = useState(true); + // Distinct from hydrationError: platform-down is its own UX path + // (different copy, different action — the user's next step is to + // check local services, not to retry the API call). Tracked + // separately rather than encoded into hydrationError so the + // generic-error branch can stay simple. 
+ const [platformDown, setPlatformDown] = useState(false); useEffect(() => { connectSocket(); @@ -28,8 +34,11 @@ export default function Home() { useCanvasStore.getState().setViewport(viewport); } }).catch((err) => { - // Initial hydration failed — show error banner to user console.error("Canvas: initial hydration failed", err); + if (err instanceof PlatformUnavailableError) { + setPlatformDown(true); + return; + } useCanvasStore.getState().setHydrationError( err instanceof Error && err.message ? err.message : "Failed to load canvas" ); @@ -53,6 +62,10 @@ export default function Home() { ); } + if (platformDown) { + return <PlatformDownDiagnostic />; + } + return ( <> @@ -78,3 +91,43 @@ export default function Home() { ); } + +/** + * Dedicated diagnostic for the case where the platform reported its + * datastore (Postgres / Redis) is unreachable. Distinct from the + * generic API-error overlay: the user's next action is to check + * local services, not to retry the API call. Includes the exact + * commands for the common dev-host setup. + */ +function PlatformDownDiagnostic() { + return ( +
+
+ Platform infrastructure unreachable +
+

+ The platform server returned 503 platform_unavailable. + That means it can't reach Postgres or Redis to validate your session. + Most common cause on a dev host: one of those services stopped. +

+
+
Try first
+
{`brew services start postgresql@14
+brew services start redis`}
+
+

+ If both are running, check /tmp/molecule-server.log for + the underlying error. If you're on hosted SaaS, this is a platform incident — try again in a moment. +

+ +
+ ); +} diff --git a/canvas/src/lib/__tests__/api.test.ts b/canvas/src/lib/__tests__/api.test.ts index 09eb0eff..d95e367b 100644 --- a/canvas/src/lib/__tests__/api.test.ts +++ b/canvas/src/lib/__tests__/api.test.ts @@ -7,7 +7,7 @@ import { describe, it, expect, beforeEach, afterEach, vi } from "vitest"; const mockFetch = vi.fn(); globalThis.fetch = mockFetch; -import { api } from "../api"; +import { api, PlatformUnavailableError } from "../api"; // --------------------------------------------------------------------------- // Helpers @@ -380,3 +380,99 @@ describe("api – request timeout signal", () => { expect(sigA).not.toBe(sigB); }); }); + +// --------------------------------------------------------------------------- +// PlatformUnavailableError classification +// --------------------------------------------------------------------------- +// +// When the platform's wsauth middleware can't reach Postgres/Redis to +// validate a token, it returns 503 + {error, code:"platform_unavailable"}. +// api.ts must surface that as a typed error so the page-level renderer +// can show a dedicated diagnostic instead of a generic 5xx toast. 
+ +describe("PlatformUnavailableError classification", () => { + beforeEach(() => { + mockFetch.mockReset(); + }); + + function mock503Platform(detail = "platform datastore unavailable — retry shortly") { + const body = JSON.stringify({ error: detail, code: "platform_unavailable" }); + mockFetch.mockResolvedValueOnce({ + ok: false, + status: 503, + json: () => Promise.reject(new Error("not used")), + text: () => Promise.resolve(body), + } as unknown as Response); + } + + it("throws PlatformUnavailableError on 503 + code=platform_unavailable", async () => { + mock503Platform(); + let thrown: unknown; + try { + await api.get("/workspaces"); + } catch (e) { + thrown = e; + } + expect(thrown).toBeInstanceOf(PlatformUnavailableError); + expect((thrown as PlatformUnavailableError).code).toBe("platform_unavailable"); + }); + + it("preserves the server-provided error string as the Error message", async () => { + mock503Platform("Postgres unreachable"); + try { + await api.get("/workspaces"); + } catch (e) { + expect(e).toBeInstanceOf(PlatformUnavailableError); + expect((e as Error).message).toBe("Postgres unreachable"); + return; + } + throw new Error("expected to throw"); + }); + + it("does NOT classify a generic 503 (no platform_unavailable code) as PlatformUnavailableError", async () => { + // Generic upstream-busy 503 — should keep the legacy generic-Error + // path so existing busy-retry UX isn't disrupted. 
+ mockFetch.mockResolvedValueOnce({ + ok: false, + status: 503, + json: () => Promise.reject(new Error("not used")), + text: () => Promise.resolve(JSON.stringify({ error: "upstream busy" })), + } as unknown as Response); + try { + await api.get("/workspaces/x/a2a"); + } catch (e) { + expect(e).not.toBeInstanceOf(PlatformUnavailableError); + expect((e as Error).message).toContain("503"); + return; + } + throw new Error("expected to throw"); + }); + + it("does NOT classify on 500 (server kept legacy 500 for true internal errors)", async () => { + mockFailure(500, "boom"); + try { + await api.get("/workspaces"); + } catch (e) { + expect(e).not.toBeInstanceOf(PlatformUnavailableError); + return; + } + throw new Error("expected to throw"); + }); + + it("falls back to generic Error when 503 body isn't JSON", async () => { + mockFetch.mockResolvedValueOnce({ + ok: false, + status: 503, + json: () => Promise.reject(new Error("not used")), + text: () => Promise.resolve("Service Unavailable"), + } as unknown as Response); + try { + await api.get("/workspaces"); + } catch (e) { + expect(e).not.toBeInstanceOf(PlatformUnavailableError); + expect((e as Error).message).toContain("503"); + return; + } + throw new Error("expected to throw"); + }); +}); diff --git a/canvas/src/lib/api.ts b/canvas/src/lib/api.ts index e65d92fd..79f6b9f6 100644 --- a/canvas/src/lib/api.ts +++ b/canvas/src/lib/api.ts @@ -77,11 +77,39 @@ async function request( } if (!res.ok) { const text = await res.text(); + // Recognise the platform's structured "datastore unreachable" + // shape (returned by wsauth_middleware.abortAuthLookupError when + // Postgres/Redis is down). Surface as a typed error so callers + // can render a dedicated diagnostic instead of a generic toast. 
+ if (res.status === 503 && text) { + try { + const parsed = JSON.parse(text) as { code?: string; error?: string }; + if (parsed.code === "platform_unavailable") { + throw new PlatformUnavailableError(parsed.error || "platform datastore unavailable"); + } + } catch (err) { + // Re-throw the typed error if that's what we just constructed. + // JSON.parse failures fall through to the generic Error below. + if (err instanceof PlatformUnavailableError) throw err; + } + } throw new Error(`API ${method} ${path}: ${res.status} ${text}`); } return res.json(); } +/** Thrown when the platform reports its datastore (Postgres/Redis) is + * unreachable. Surface with a dedicated diagnostic UI rather than a + * generic API-error toast — the user's next action is to check local + * services, not to retry the API call. */ +export class PlatformUnavailableError extends Error { + readonly code = "platform_unavailable" as const; + constructor(message: string) { + super(message); + this.name = "PlatformUnavailableError"; + } +} + export const api = { get: (path: string, options?: RequestOptions) => request("GET", path, undefined, 0, options), post: (path: string, body?: unknown, options?: RequestOptions) => request("POST", path, body, 0, options), diff --git a/workspace-server/internal/middleware/wsauth_middleware.go b/workspace-server/internal/middleware/wsauth_middleware.go index 93538753..ef82d8e7 100644 --- a/workspace-server/internal/middleware/wsauth_middleware.go +++ b/workspace-server/internal/middleware/wsauth_middleware.go @@ -14,6 +14,30 @@ import ( "github.com/gin-gonic/gin" ) +// abortAuthLookupError is the single response shape for "the auth +// middleware tried to validate a token but the underlying datastore +// lookup failed." Returns 503 (not 500) because the right semantic +// is "platform infrastructure unavailable, retry shortly" — not +// "internal server error in our application logic". 
The structured +// `code` lets the canvas distinguish this from generic 5xx and +// surface a dedicated diagnostic ("Postgres/Redis unreachable — +// check local services") instead of a confusing +// `auth check failed` toast. +// +// `where` is included in the log line so the operator can grep +// which call site fired (WorkspaceAuth vs AdminAuth, the +// HasAnyLiveTokenGlobal probe vs orgtoken.Validate). The +// user-visible body deliberately does NOT include the underlying +// error string — that could leak DB hostnames, connection-string +// fragments, or internal code paths. +func abortAuthLookupError(c *gin.Context, where string, err error) { + log.Printf("wsauth: %s: datastore lookup failed (returning 503): %v", where, err) + c.AbortWithStatusJSON(http.StatusServiceUnavailable, gin.H{ + "error": "platform datastore unavailable — retry shortly", + "code": "platform_unavailable", + }) +} + // WorkspaceAuth returns a Gin middleware that enforces per-workspace bearer-token // authentication on /workspaces/:id/* sub-routes. // @@ -73,8 +97,7 @@ func WorkspaceAuth(database *sql.DB) gin.HandlerFunc { c.Next() return } else if !errors.Is(err, orgtoken.ErrInvalidToken) { - log.Printf("wsauth: WorkspaceAuth: orgtoken.Validate: %v", err) - c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": "auth check failed"}) + abortAuthLookupError(c, "WorkspaceAuth: orgtoken.Validate", err) return } // Per-workspace token — narrowest scope, bound to this :id. 
@@ -136,8 +159,7 @@ func AdminAuth(database *sql.DB) gin.HandlerFunc { hasLive, err := wsauth.HasAnyLiveTokenGlobal(ctx, database) if err != nil { - log.Printf("wsauth: AdminAuth: HasAnyLiveTokenGlobal failed: %v", err) - c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": "auth check failed"}) + abortAuthLookupError(c, "AdminAuth: HasAnyLiveTokenGlobal", err) return } if !hasLive { @@ -214,8 +236,7 @@ func AdminAuth(database *sql.DB) gin.HandlerFunc { return } else if !errors.Is(err, orgtoken.ErrInvalidToken) { // DB error — fail closed and log. Don't expose DB text. - log.Printf("wsauth: AdminAuth: orgtoken.Validate: %v", err) - c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": "auth check failed"}) + abortAuthLookupError(c, "AdminAuth: orgtoken.Validate", err) return } diff --git a/workspace-server/internal/middleware/wsauth_middleware_test.go b/workspace-server/internal/middleware/wsauth_middleware_test.go index edfd2230..6c802a79 100644 --- a/workspace-server/internal/middleware/wsauth_middleware_test.go +++ b/workspace-server/internal/middleware/wsauth_middleware_test.go @@ -2,8 +2,11 @@ package middleware import ( "crypto/sha256" + "encoding/json" + "errors" "net/http" "net/http/httptest" + "strings" "testing" "github.com/DATA-DOG/go-sqlmock" @@ -1699,3 +1702,57 @@ func TestAdminAuth_684_SpecificRoutes_NoBearer_Returns401(t *testing.T) { }) } } + +// ==================== platform-unavailable classification ==================== +// +// abortAuthLookupError replaces the prior opaque +// `500 {"error":"auth check failed"}` with a 503 + structured code so +// the canvas can render a dedicated diagnostic instead of a confusing +// toast. Pin both the status code and the body shape against +// regression — this is the contract the canvas's +// PlatformUnavailableError classifier reads at api.ts. 
+ +func TestAdminAuth_DatastoreError_Returns503PlatformUnavailable(t *testing.T) { + mockDB, mock, err := sqlmock.New() + if err != nil { + t.Fatalf("sqlmock.New: %v", err) + } + defer mockDB.Close() + + // Simulate Postgres being down — HasAnyLiveTokenGlobal's COUNT + // query returns a connection error. + mock.ExpectQuery(hasAnyLiveTokenGlobalQuery). + WillReturnError(errors.New("dial tcp [::1]:5432: connect: connection refused")) + + r := gin.New() + r.GET("/workspaces", AdminAuth(mockDB), func(c *gin.Context) { + c.JSON(http.StatusOK, gin.H{"ok": true}) + }) + + w := httptest.NewRecorder() + req, _ := http.NewRequest(http.MethodGet, "/workspaces", nil) + r.ServeHTTP(w, req) + + if w.Code != http.StatusServiceUnavailable { + t.Errorf("expected 503, got %d: %s", w.Code, w.Body.String()) + } + var resp map[string]interface{} + if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil { + t.Fatalf("response body must be JSON: %v (body=%s)", err, w.Body.String()) + } + if resp["code"] != "platform_unavailable" { + t.Errorf("response code = %v, want platform_unavailable (canvas reads this for the dedicated diagnostic)", resp["code"]) + } + if _, ok := resp["error"].(string); !ok { + t.Errorf("response must include human-readable error string, got %v", resp["error"]) + } + // The body must NOT leak the underlying DB error string — + // production hostnames / connection-string fragments could land + // in an error toast otherwise. + if errStr, _ := resp["error"].(string); strings.Contains(errStr, "dial tcp") { + t.Errorf("response leaks underlying DB error: %q", errStr) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet sqlmock expectations: %v", err) + } +}