feat(platform,canvas): classify "datastore unavailable" as 503 + dedicated UI
User reported the canvas threw a generic "API GET /workspaces: 500
{auth check failed}" error when local Postgres + Redis were both
down. Two problems:
1. The error code (500) and message ("auth check failed") said
nothing useful. The actual condition was "platform can't reach
its datastore to validate your token" — a Service Unavailable
class, not Internal Server Error.
2. The canvas had no way to distinguish infra-down from a real
auth bug, so it rendered the raw API string in the same
generic-error overlay it uses for everything.
Fix in two layers:
Server (wsauth_middleware.go):
- New abortAuthLookupError helper centralises all three sites
that previously returned `500 {"error":"auth check failed"}`
when HasAnyLiveTokenGlobal or orgtoken.Validate hit a DB error.
- Now returns 503 + structured body
`{"error": "...", "code": "platform_unavailable"}`. 503 is
the correct semantic ("retry shortly, infra is unavailable")
and the code field is the contract the canvas reads.
- Body deliberately excludes the underlying DB error string —
production hostnames / connection-string fragments must not
leak into a user-visible error toast.
Canvas (api.ts):
- New PlatformUnavailableError class. api.ts inspects 503
responses for the platform_unavailable code and throws the
typed error instead of the generic "API GET /…: 503 …"
message. Generic 503s (upstream-busy, etc.) keep the legacy
path so existing busy-retry UX isn't disrupted.
Canvas (page.tsx):
- New PlatformDownDiagnostic component renders when the
initial hydration catches PlatformUnavailableError.
Surfaces the actual condition with operator-actionable
copy ("brew services start postgresql@14 / redis") +
pointer to the platform log + a Reload button.
Tests:
- Go: TestAdminAuth_DatastoreError_Returns503PlatformUnavailable
pins the response shape (status, code field, no DB-error leak)
- Canvas: 5 tests for PlatformUnavailableError classification —
typed throw on 503+code match, server-provided error string
preserved as the Error message, generic-Error fallback for
503-without-code (upstream busy), 500 stays generic, non-JSON
body falls back to generic.
1015 canvas tests + full Go middleware suite pass.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
b47a1b87b0
commit
5e36c6638c
@ -7,13 +7,19 @@ import { CommunicationOverlay } from "@/components/CommunicationOverlay";
|
||||
import { Spinner } from "@/components/Spinner";
|
||||
import { connectSocket, disconnectSocket } from "@/store/socket";
|
||||
import { useCanvasStore } from "@/store/canvas";
|
||||
import { api } from "@/lib/api";
|
||||
import { api, PlatformUnavailableError } from "@/lib/api";
|
||||
import type { WorkspaceData } from "@/store/socket";
|
||||
|
||||
export default function Home() {
|
||||
const hydrationError = useCanvasStore((s) => s.hydrationError);
|
||||
const setHydrationError = useCanvasStore((s) => s.setHydrationError);
|
||||
const [hydrating, setHydrating] = useState(true);
|
||||
// Distinct from hydrationError: platform-down is its own UX path
|
||||
// (different copy, different action — the user's next step is to
|
||||
// check local services, not to retry the API call). Tracked
|
||||
// separately rather than encoded into hydrationError so the
|
||||
// generic-error branch can stay simple.
|
||||
const [platformDown, setPlatformDown] = useState(false);
|
||||
|
||||
useEffect(() => {
|
||||
connectSocket();
|
||||
@ -28,8 +34,11 @@ export default function Home() {
|
||||
useCanvasStore.getState().setViewport(viewport);
|
||||
}
|
||||
}).catch((err) => {
|
||||
// Initial hydration failed — show error banner to user
|
||||
console.error("Canvas: initial hydration failed", err);
|
||||
if (err instanceof PlatformUnavailableError) {
|
||||
setPlatformDown(true);
|
||||
return;
|
||||
}
|
||||
useCanvasStore.getState().setHydrationError(
|
||||
err instanceof Error && err.message ? err.message : "Failed to load canvas"
|
||||
);
|
||||
@ -53,6 +62,10 @@ export default function Home() {
|
||||
);
|
||||
}
|
||||
|
||||
if (platformDown) {
|
||||
return <PlatformDownDiagnostic />;
|
||||
}
|
||||
|
||||
return (
|
||||
<>
|
||||
<Canvas />
|
||||
@ -78,3 +91,43 @@ export default function Home() {
|
||||
</>
|
||||
);
|
||||
}
|
||||
|
||||
/**
 * Full-screen notice rendered when the platform answered with
 * `503 platform_unavailable`, i.e. it reported that it cannot reach
 * Postgres / Redis. Kept separate from the generic API-error overlay
 * because the useful next step is checking local services rather than
 * retrying the request. Lists the start commands for the common
 * dev-host setup and offers a full page reload.
 */
function PlatformDownDiagnostic() {
  // Rendered verbatim inside the <pre>, one command per line.
  const startCommands = ["brew services start postgresql@14", "brew services start redis"].join("\n");
  const reloadPage = () => window.location.reload();

  return (
    <div
      role="alert"
      className="fixed inset-0 flex flex-col items-center justify-center bg-zinc-950 text-zinc-300 gap-5 z-[9999] px-6"
    >
      <div className="text-amber-400 text-sm font-semibold uppercase tracking-wider">
        Platform infrastructure unreachable
      </div>
      <p className="text-zinc-400 text-sm max-w-lg text-center leading-relaxed">
        The platform server returned <code className="font-mono text-amber-300">503 platform_unavailable</code>.
        That means it can't reach Postgres or Redis to validate your session.
        Most common cause on a dev host: one of those services stopped.
      </p>
      <div className="bg-zinc-900/80 border border-zinc-700/50 rounded-lg px-4 py-3 max-w-lg w-full">
        <div className="text-[10px] uppercase tracking-wider text-zinc-500 mb-2">Try first</div>
        <pre className="text-[12px] text-zinc-300 font-mono whitespace-pre-wrap leading-relaxed">{startCommands}</pre>
      </div>
      <p className="text-[11px] text-zinc-500 max-w-lg text-center">
        If both are running, check <code className="font-mono">/tmp/molecule-server.log</code> for
        the underlying error. If you're on hosted SaaS, this is a platform incident — try again in a moment.
      </p>
      <button
        onClick={reloadPage}
        className="px-4 py-2 bg-blue-600 hover:bg-blue-500 text-white rounded-md text-sm mt-2"
      >
        Reload
      </button>
    </div>
  );
}
|
||||
|
||||
@ -7,7 +7,7 @@ import { describe, it, expect, beforeEach, afterEach, vi } from "vitest";
|
||||
const mockFetch = vi.fn();
|
||||
globalThis.fetch = mockFetch;
|
||||
|
||||
import { api } from "../api";
|
||||
import { api, PlatformUnavailableError } from "../api";
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Helpers
|
||||
@ -380,3 +380,99 @@ describe("api – request timeout signal", () => {
|
||||
expect(sigA).not.toBe(sigB);
|
||||
});
|
||||
});
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// PlatformUnavailableError classification
|
||||
// ---------------------------------------------------------------------------
|
||||
//
|
||||
// When the platform's wsauth middleware can't reach Postgres/Redis to
|
||||
// validate a token, it returns 503 + {error, code:"platform_unavailable"}.
|
||||
// api.ts must surface that as a typed error so the page-level renderer
|
||||
// can show a dedicated diagnostic instead of a generic 5xx toast.
|
||||
|
||||
describe("PlatformUnavailableError classification", () => {
  beforeEach(() => {
    mockFetch.mockReset();
  });

  /** Queue a single failed fetch response whose body is only readable via text(). */
  const queueFailedResponse = (status: number, bodyText: string) => {
    mockFetch.mockResolvedValueOnce({
      ok: false,
      status,
      json: () => Promise.reject(new Error("not used")),
      text: () => Promise.resolve(bodyText),
    } as unknown as Response);
  };

  /** Queue the structured 503 body the platform sends when its datastore is down. */
  const queuePlatformUnavailable = (detail = "platform datastore unavailable — retry shortly") => {
    queueFailedResponse(503, JSON.stringify({ error: detail, code: "platform_unavailable" }));
  };

  /** Resolve with the rejection reason; fail the test if the promise fulfils instead. */
  const rejectionOf = (pending: Promise<unknown>): Promise<unknown> =>
    pending.then(
      () => {
        throw new Error("expected to throw");
      },
      (reason: unknown) => reason,
    );

  it("throws PlatformUnavailableError on 503 + code=platform_unavailable", async () => {
    queuePlatformUnavailable();

    const thrown = await rejectionOf(api.get("/workspaces"));

    expect(thrown).toBeInstanceOf(PlatformUnavailableError);
    expect((thrown as PlatformUnavailableError).code).toBe("platform_unavailable");
  });

  it("preserves the server-provided error string as the Error message", async () => {
    queuePlatformUnavailable("Postgres unreachable");

    const thrown = await rejectionOf(api.get("/workspaces"));

    expect(thrown).toBeInstanceOf(PlatformUnavailableError);
    expect((thrown as Error).message).toBe("Postgres unreachable");
  });

  it("does NOT classify a generic 503 (no platform_unavailable code) as PlatformUnavailableError", async () => {
    // A plain upstream-busy 503 carries no `code`, so it must stay on the
    // legacy generic-Error path that the busy-retry UX relies on.
    queueFailedResponse(503, JSON.stringify({ error: "upstream busy" }));

    const thrown = await rejectionOf(api.get("/workspaces/x/a2a"));

    expect(thrown).not.toBeInstanceOf(PlatformUnavailableError);
    expect((thrown as Error).message).toContain("503");
  });

  it("does NOT classify on 500 (server kept legacy 500 for true internal errors)", async () => {
    mockFailure(500, "boom");

    const thrown = await rejectionOf(api.get("/workspaces"));

    expect(thrown).not.toBeInstanceOf(PlatformUnavailableError);
  });

  it("falls back to generic Error when 503 body isn't JSON", async () => {
    queueFailedResponse(503, "Service Unavailable");

    const thrown = await rejectionOf(api.get("/workspaces"));

    expect(thrown).not.toBeInstanceOf(PlatformUnavailableError);
    expect((thrown as Error).message).toContain("503");
  });
});
|
||||
|
||||
@ -77,11 +77,39 @@ async function request<T>(
|
||||
}
|
||||
if (!res.ok) {
|
||||
const text = await res.text();
|
||||
// Recognise the platform's structured "datastore unreachable"
|
||||
// shape (returned by wsauth_middleware.abortAuthLookupError when
|
||||
// Postgres/Redis is down). Surface as a typed error so callers
|
||||
// can render a dedicated diagnostic instead of a generic toast.
|
||||
if (res.status === 503 && text) {
|
||||
try {
|
||||
const parsed = JSON.parse(text) as { code?: string; error?: string };
|
||||
if (parsed.code === "platform_unavailable") {
|
||||
throw new PlatformUnavailableError(parsed.error || "platform datastore unavailable");
|
||||
}
|
||||
} catch (err) {
|
||||
// Re-throw the typed error if that's what we just constructed.
|
||||
// JSON.parse failures fall through to the generic Error below.
|
||||
if (err instanceof PlatformUnavailableError) throw err;
|
||||
}
|
||||
}
|
||||
throw new Error(`API ${method} ${path}: ${res.status} ${text}`);
|
||||
}
|
||||
return res.json();
|
||||
}
|
||||
|
||||
/**
 * Thrown when the platform reports its datastore (Postgres/Redis) is
 * unreachable. Surface with a dedicated diagnostic UI rather than a
 * generic API-error toast — the user's next action is to check local
 * services, not to retry the API call.
 *
 * Callers detect this condition with `instanceof`, and can also read
 * the fixed `code` discriminator.
 */
export class PlatformUnavailableError extends Error {
  /** Discriminator matching the `code` field of the platform's 503 body. */
  readonly code = "platform_unavailable" as const;

  /**
   * @param message Human-readable description, normally the `error`
   *   string from the server's response body.
   */
  constructor(message: string) {
    super(message);
    // When this class is compiled down to ES5, `super(...)` on Error returns
    // a plain Error and `instanceof PlatformUnavailableError` would be false.
    // Restoring the prototype keeps the instanceof check reliable on every
    // compile target; on ES2015+ targets it changes nothing.
    Object.setPrototypeOf(this, new.target.prototype);
    this.name = "PlatformUnavailableError";
  }
}
|
||||
|
||||
export const api = {
|
||||
get: <T>(path: string, options?: RequestOptions) => request<T>("GET", path, undefined, 0, options),
|
||||
post: <T>(path: string, body?: unknown, options?: RequestOptions) => request<T>("POST", path, body, 0, options),
|
||||
|
||||
@ -14,6 +14,30 @@ import (
|
||||
"github.com/gin-gonic/gin"
|
||||
)
|
||||
|
||||
// abortAuthLookupError is the single response shape for "the auth
|
||||
// middleware tried to validate a token but the underlying datastore
|
||||
// lookup failed." Returns 503 (not 500) because the right semantic
|
||||
// is "platform infrastructure unavailable, retry shortly" — not
|
||||
// "internal server error in our application logic". The structured
|
||||
// `code` lets the canvas distinguish this from generic 5xx and
|
||||
// surface a dedicated diagnostic ("Postgres/Redis unreachable —
|
||||
// check local services") instead of a confusing
|
||||
// `auth check failed` toast.
|
||||
//
|
||||
// `where` is included in the log line so the operator can grep
|
||||
// which call site fired (WorkspaceAuth vs AdminAuth, the
|
||||
// HasAnyLiveTokenGlobal probe vs orgtoken.Validate). The
|
||||
// user-visible body deliberately does NOT include the underlying
|
||||
// error string — that could leak DB hostnames, connection-string
|
||||
// fragments, or internal code paths.
|
||||
func abortAuthLookupError(c *gin.Context, where string, err error) {
|
||||
log.Printf("wsauth: %s: datastore lookup failed (returning 503): %v", where, err)
|
||||
c.AbortWithStatusJSON(http.StatusServiceUnavailable, gin.H{
|
||||
"error": "platform datastore unavailable — retry shortly",
|
||||
"code": "platform_unavailable",
|
||||
})
|
||||
}
|
||||
|
||||
// WorkspaceAuth returns a Gin middleware that enforces per-workspace bearer-token
|
||||
// authentication on /workspaces/:id/* sub-routes.
|
||||
//
|
||||
@ -73,8 +97,7 @@ func WorkspaceAuth(database *sql.DB) gin.HandlerFunc {
|
||||
c.Next()
|
||||
return
|
||||
} else if !errors.Is(err, orgtoken.ErrInvalidToken) {
|
||||
log.Printf("wsauth: WorkspaceAuth: orgtoken.Validate: %v", err)
|
||||
c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": "auth check failed"})
|
||||
abortAuthLookupError(c, "WorkspaceAuth: orgtoken.Validate", err)
|
||||
return
|
||||
}
|
||||
// Per-workspace token — narrowest scope, bound to this :id.
|
||||
@ -136,8 +159,7 @@ func AdminAuth(database *sql.DB) gin.HandlerFunc {
|
||||
|
||||
hasLive, err := wsauth.HasAnyLiveTokenGlobal(ctx, database)
|
||||
if err != nil {
|
||||
log.Printf("wsauth: AdminAuth: HasAnyLiveTokenGlobal failed: %v", err)
|
||||
c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": "auth check failed"})
|
||||
abortAuthLookupError(c, "AdminAuth: HasAnyLiveTokenGlobal", err)
|
||||
return
|
||||
}
|
||||
if !hasLive {
|
||||
@ -214,8 +236,7 @@ func AdminAuth(database *sql.DB) gin.HandlerFunc {
|
||||
return
|
||||
} else if !errors.Is(err, orgtoken.ErrInvalidToken) {
|
||||
// DB error — fail closed and log. Don't expose DB text.
|
||||
log.Printf("wsauth: AdminAuth: orgtoken.Validate: %v", err)
|
||||
c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": "auth check failed"})
|
||||
abortAuthLookupError(c, "AdminAuth: orgtoken.Validate", err)
|
||||
return
|
||||
}
|
||||
|
||||
|
||||
@ -2,8 +2,11 @@ package middleware
|
||||
|
||||
import (
|
||||
"crypto/sha256"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/DATA-DOG/go-sqlmock"
|
||||
@ -1699,3 +1702,57 @@ func TestAdminAuth_684_SpecificRoutes_NoBearer_Returns401(t *testing.T) {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// ==================== platform-unavailable classification ====================
|
||||
//
|
||||
// abortAuthLookupError replaces the prior opaque
|
||||
// `500 {"error":"auth check failed"}` with a 503 + structured code so
|
||||
// the canvas can render a dedicated diagnostic instead of a confusing
|
||||
// toast. Pin both the status code and the body shape against
|
||||
// regression — this is the contract the canvas's
|
||||
// PlatformUnavailableError classifier reads at api.ts.
|
||||
|
||||
func TestAdminAuth_DatastoreError_Returns503PlatformUnavailable(t *testing.T) {
|
||||
mockDB, mock, err := sqlmock.New()
|
||||
if err != nil {
|
||||
t.Fatalf("sqlmock.New: %v", err)
|
||||
}
|
||||
defer mockDB.Close()
|
||||
|
||||
// Simulate Postgres being down — HasAnyLiveTokenGlobal's COUNT
|
||||
// query returns a connection error.
|
||||
mock.ExpectQuery(hasAnyLiveTokenGlobalQuery).
|
||||
WillReturnError(errors.New("dial tcp [::1]:5432: connect: connection refused"))
|
||||
|
||||
r := gin.New()
|
||||
r.GET("/workspaces", AdminAuth(mockDB), func(c *gin.Context) {
|
||||
c.JSON(http.StatusOK, gin.H{"ok": true})
|
||||
})
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
req, _ := http.NewRequest(http.MethodGet, "/workspaces", nil)
|
||||
r.ServeHTTP(w, req)
|
||||
|
||||
if w.Code != http.StatusServiceUnavailable {
|
||||
t.Errorf("expected 503, got %d: %s", w.Code, w.Body.String())
|
||||
}
|
||||
var resp map[string]interface{}
|
||||
if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
|
||||
t.Fatalf("response body must be JSON: %v (body=%s)", err, w.Body.String())
|
||||
}
|
||||
if resp["code"] != "platform_unavailable" {
|
||||
t.Errorf("response code = %v, want platform_unavailable (canvas reads this for the dedicated diagnostic)", resp["code"])
|
||||
}
|
||||
if _, ok := resp["error"].(string); !ok {
|
||||
t.Errorf("response must include human-readable error string, got %v", resp["error"])
|
||||
}
|
||||
// The body must NOT leak the underlying DB error string —
|
||||
// production hostnames / connection-string fragments could land
|
||||
// in an error toast otherwise.
|
||||
if errStr, _ := resp["error"].(string); strings.Contains(errStr, "dial tcp") {
|
||||
t.Errorf("response leaks underlying DB error: %q", errStr)
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Errorf("unmet sqlmock expectations: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
Loading…
Reference in New Issue
Block a user