diff --git a/workspace-server/internal/handlers/a2a_proxy_helpers.go b/workspace-server/internal/handlers/a2a_proxy_helpers.go index ef6317a64..920e8b5ba 100644 --- a/workspace-server/internal/handlers/a2a_proxy_helpers.go +++ b/workspace-server/internal/handlers/a2a_proxy_helpers.go @@ -426,16 +426,34 @@ func nilIfEmpty(s string) *string { // (their next /registry/register will mint their first token, after // which this branch never fires again for them). // -// Post-RFC#637 addition: when the tokenless workspace is accompanied by -// canvas or admin auth (same-origin request, admin bearer, or org-level -// token), the caller is identified as a canvas-user identity rather than -// a legacy peer agent. The returned isCanvasUser flag lets the A2A proxy -// bypass CanCommunicate for human users, who sit outside the workspace -// hierarchy. +// Post-RFC#637 addition: a request may instead be carrying a HUMAN's +// canvas-user identity (e.g. the 344a2623-… identity workspace from the +// RFC#637 rollout). That human sits OUTSIDE the workspace org hierarchy, so +// the returned isCanvasUser flag lets the A2A proxy bypass CanCommunicate for +// it. Canvas-user classification is decided by isGenuineCanvasUser using +// NON-FORGEABLE credentials only (see that function) — never by the caller's +// X-Workspace-ID alone, and never by a bare same-origin Host/Referer in a +// SaaS image (those are forgeable; see middleware.IsSameOriginCanvas). +// +// #1673: this canvas-user check is now evaluated BEFORE the HasAnyLiveToken +// peer-token contract. Previously it lived only in the !hasLive branch, so a +// canvas-user identity workspace that had acquired live tokens fell into the +// hasLive=true branch, which demands a bearer the canvas frontend never sends +// → silent 401 → the message was dropped before logA2AReceiveQueued wrote the +// activity_logs row, breaking canvas chat for poll-mode workspaces. A genuine +// canvas user is identified by the human's session/admin/org credential, which +// is independent of whether the identity workspace happens to hold peer tokens. // // On auth failure this writes the 401 via c and returns an error so the // handler aborts without running the proxy. func validateCallerToken(ctx context.Context, c *gin.Context, callerID string) (isCanvasUser bool, err error) { + // Genuine canvas-user identity? Decided independently of the caller + // workspace's token state (the #1673 fix) and using only non-forgeable + // signals (the #1944 escalation guard). + if isGenuineCanvasUser(ctx, c) { + return true, nil + } + hasLive, dbErr := wsauth.HasAnyLiveToken(ctx, db.DB, callerID) if dbErr != nil { // Fail-open here matches the heartbeat path — A2A caller auth is @@ -446,22 +464,10 @@ func validateCallerToken(ctx context.Context, c *gin.Context, callerID string) ( return false, nil } if !hasLive { - // Tokenless workspace — could be legacy/pre-upgrade caller or - // canvas-user identity. Distinguish by request auth signals. - if middleware.IsSameOriginCanvas(c) { - return true, nil - } - tok := wsauth.BearerTokenFromHeader(c.GetHeader("Authorization")) - if tok != "" { - adminSecret := os.Getenv("ADMIN_TOKEN") - if adminSecret != "" && subtle.ConstantTimeCompare([]byte(tok), []byte(adminSecret)) == 1 { - return true, nil - } - if _, _, _, err := orgtoken.Validate(ctx, db.DB, tok); err == nil { - return true, nil - } - } - return false, nil // legacy / pre-upgrade caller + // Tokenless, non-canvas-user workspace — legacy / pre-upgrade peer. + // Grandfather it through (its next /registry/register mints its + // first token, after which it lands in the hasLive=true branch). + return false, nil } tok := wsauth.BearerTokenFromHeader(c.GetHeader("Authorization")) if tok == "" { @@ -475,6 +481,61 @@ func validateCallerToken(ctx context.Context, c *gin.Context, callerID string) ( return false, nil } +// isGenuineCanvasUser reports whether the request is a real human acting +// through the canvas UI (RFC#637 canvas-user identity), as opposed to a peer +// workspace agent. A true result lets the A2A proxy bypass CanCommunicate, so +// it MUST only accept signals an attacker on the platform network cannot forge: +// +// - A control-plane-verified canvas session: the WorkOS session cookie is +// confirmed upstream to belong to a MEMBER of THIS tenant's org +// (middleware.IsVerifiedCanvasSession → /cp/auth/tenant-member). This is +// the production SaaS canvas path. +// - An Authorization: Bearer matching ADMIN_TOKEN (break-glass / molecli). +// - An Authorization: Bearer matching a live org_api_tokens row (user-minted +// org-scoped API token). +// +// Deliberately NOT accepted as a canvas-user signal in a SaaS image: +// +// - A bare same-origin Host/Referer/Origin (middleware.IsSameOriginCanvas). +// Those headers are trivially forgeable by any container on the Docker +// network, and the combined-tenant image (CANVAS_PROXY_URL set) is exactly +// where a forged Referer + an arbitrary X-Workspace-ID could otherwise +// bypass CanCommunicate and reach cross-workspace A2A — the PR #1944 +// privilege escalation. Same-origin is only honored as a fallback when CP +// session verification is NOT configured (self-hosted / dev), a +// single-tenant topology with no cross-tenant boundary to escalate across; +// even there the org hierarchy still owns intra-org routing. +// +// Note this classification is about the human's credential, not the caller +// workspace's X-Workspace-ID — so it never trusts an attacker-supplied caller +// ID, and it is independent of whether that workspace holds peer tokens. +func isGenuineCanvasUser(ctx context.Context, c *gin.Context) bool { + // Production SaaS: control-plane-verified org-member session cookie. + if middleware.IsVerifiedCanvasSession(c) { + return true + } + + if tok := wsauth.BearerTokenFromHeader(c.GetHeader("Authorization")); tok != "" { + adminSecret := os.Getenv("ADMIN_TOKEN") + if adminSecret != "" && subtle.ConstantTimeCompare([]byte(tok), []byte(adminSecret)) == 1 { + return true + } + if _, _, _, err := orgtoken.Validate(ctx, db.DB, tok); err == nil { + return true + } + } + + // Self-hosted / dev fallback ONLY: when upstream session verification is + // not configured there is no verified-cookie signal to use, and the + // deployment is single-tenant, so the forgeable same-origin check is an + // acceptable canvas signal. In SaaS (CP session configured) this branch is + // skipped, closing the forged-same-origin escalation. + if !middleware.CPSessionConfigured() && middleware.IsSameOriginCanvas(c) { + return true + } + return false +} + // errInvalidCallerToken is a sentinel for validateCallerToken's "missing // token" branch so the handler-level guard can detect it without string // matching (the wsauth errors are typed for the invalid case). diff --git a/workspace-server/internal/handlers/a2a_proxy_test.go b/workspace-server/internal/handlers/a2a_proxy_test.go index 947b6f572..2e997d168 100644 --- a/workspace-server/internal/handlers/a2a_proxy_test.go +++ b/workspace-server/internal/handlers/a2a_proxy_test.go @@ -11,6 +11,7 @@ import ( "net/http" "net/http/httptest" "os" + "os/exec" "strings" "testing" "time" @@ -1244,13 +1245,12 @@ func TestValidateCallerToken_WrongWorkspaceBindingRejected(t *testing.T) { } func TestValidateCallerToken_CanvasUser_AdminToken(t *testing.T) { - mock := setupTestDB(t) + setupTestDB(t) setupTestRedis(t) - // Tokenless workspace - mock.ExpectQuery(`SELECT COUNT\(\*\) FROM workspace_auth_tokens`). - WithArgs("ws-canvas-admin"). - WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(0)) + // #1673/#1944: the genuine-canvas-user check (admin bearer here) now runs + // BEFORE HasAnyLiveToken, so no SELECT COUNT(*) is issued — the human's + // credential, not the caller workspace's token state, decides canvas-user. t.Setenv("ADMIN_TOKEN", "admin-secret-42") @@ -1276,10 +1276,9 @@ func TestValidateCallerToken_CanvasUser_OrgToken(t *testing.T) { mock := setupTestDB(t) setupTestRedis(t) - // Tokenless workspace - mock.ExpectQuery(`SELECT COUNT\(\*\) FROM workspace_auth_tokens`). - WithArgs("ws-canvas-org"). - WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(0)) + // #1673/#1944: the genuine-canvas-user check (org token here) now runs + // BEFORE HasAnyLiveToken, so the first DB query is orgtoken.Validate's + // lookup — there is no SELECT COUNT(*) expectation anymore. // orgtoken.Validate lookup mock.ExpectQuery(`SELECT id, prefix, org_id FROM org_api_tokens WHERE token_hash = .* AND revoked_at IS NULL`). @@ -2341,6 +2340,197 @@ func TestProxyA2A_PollMode_ShortCircuits_NoSSRF_NoDispatch(t *testing.T) { } } +// stubVerifiedCPSession points VerifiedCPSession at a stub control-plane that +// confirms the given cookie belongs to a tenant-member, so tests can exercise +// the genuine (non-forgeable) canvas-session path end-to-end without a live CP. +// It sets CP_UPSTREAM_URL + MOLECULE_ORG_SLUG for the test's lifetime; the +// real middleware.VerifiedCPSession HTTP+cache code path runs unchanged. +func stubVerifiedCPSession(t *testing.T, member bool) { + t.Helper() + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + if member { + fmt.Fprint(w, `{"member":true,"user_id":"user-canvas-1"}`) + } else { + w.WriteHeader(http.StatusForbidden) + fmt.Fprint(w, `{"member":false}`) + } + })) + t.Cleanup(srv.Close) + t.Setenv("CP_UPSTREAM_URL", srv.URL) + t.Setenv("MOLECULE_ORG_SLUG", "test-tenant") +} + +// TestProxyA2A_PollMode_CanvasUserWithVerifiedSession is the #1673 regression +// guard. A poll-mode canvas-user identity workspace that HAS acquired live +// tokens (the exact condition that made #1673 fire) sends a canvas message +// carrying a control-plane-verified session cookie but no bearer token. The +// fix must classify it as a canvas user BEFORE the HasAnyLiveToken peer-token +// contract, so the request is queued (200) and logA2AReceiveQueued writes the +// activity_logs row — instead of the pre-fix silent 401 that dropped the +// message before any row landed (breaking canvas chat + chat-history). +// +// Runs in a subprocess with CANVAS_PROXY_URL set so middleware.canvasProxyActive +// is true at package-init time (matching the combined-tenant image), proving the +// fix does not depend on disabling same-origin detection. +func TestProxyA2A_PollMode_CanvasUserWithVerifiedSession(t *testing.T) { + if os.Getenv("CANVAS_PROXY_URL") == "" { + cmd := exec.Command(os.Args[0], "-test.run=^TestProxyA2A_PollMode_CanvasUserWithVerifiedSession$", "-test.v") + cmd.Env = append(os.Environ(), "CANVAS_PROXY_URL=http://localhost") + out, err := cmd.CombinedOutput() + if err != nil { + t.Fatalf("subprocess test failed: %v\n%s", err, out) + } + return + } + + stubVerifiedCPSession(t, true) + + mock := setupTestDB(t) + setupTestRedis(t) + broadcaster := newTestBroadcaster() + handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir()) + + const wsTarget = "ws-poll-canvas-target" + const wsCanvasUser = "ws-canvas-user-344a" + + // CRUCIAL: no SELECT COUNT(*) FROM workspace_auth_tokens expectation. The + // genuine-canvas-user check (verified session) must short-circuit BEFORE + // HasAnyLiveToken — that is the #1673 regression path. An identity + // workspace that already holds live tokens must NOT fall into the + // hasLive=true bearer-required branch. + + // isCanvasUser=true → CanCommunicate is skipped (no parent_id lookups). + expectBudgetCheck(mock, wsTarget) + mock.ExpectQuery("SELECT delivery_mode FROM workspaces WHERE id"). + WithArgs(wsTarget). + WillReturnRows(sqlmock.NewRows([]string{"delivery_mode"}).AddRow("poll")) + // logA2AReceiveQueued must fire synchronously and write the row. + mock.ExpectExec("INSERT INTO activity_logs"). + WillReturnResult(sqlmock.NewResult(0, 1)) + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Params = gin.Params{{Key: "id", Value: wsTarget}} + + body := `{"jsonrpc":"2.0","id":"canvas-1","method":"message/send","params":{"message":{"role":"user","parts":[{"text":"hello from canvas"}]}}}` + req := httptest.NewRequest("POST", "/workspaces/"+wsTarget+"/a2a", bytes.NewBufferString(body)) + req.Header.Set("Content-Type", "application/json") + req.Header.Set("X-Workspace-ID", wsCanvasUser) + // Verified canvas session cookie (the genuine, non-forgeable signal). + req.Header.Set("Cookie", "wos-session=valid-canvas-session-cookie") + // Same-origin headers, present as a real canvas request would send them — + // but they are NOT what authorizes the bypass here (the verified session is). + req.Host = "localhost" + req.Header.Set("Referer", "https://localhost/") + c.Request = req + + handler.ProxyA2A(c) + + time.Sleep(50 * time.Millisecond) + + if w.Code != http.StatusOK { + t.Fatalf("expected 200 (queued) for canvas-user with verified session, got %d: %s", w.Code, w.Body.String()) + } + var resp map[string]interface{} + if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil { + t.Fatalf("response is not valid JSON: %v", err) + } + if resp["status"] != "queued" { + t.Errorf("response.status = %v, want %q", resp["status"], "queued") + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet sqlmock expectations (activity_logs row must be written): %v", err) + } +} + +// TestProxyA2A_ForgedSameOrigin_CannotBypassCanCommunicate is the security +// crux of the #1673 fix and the reason PR #1944 was held. In the combined- +// tenant SaaS image (CANVAS_PROXY_URL set, CP session verification configured), +// an attacker forges a same-origin request — correct Host + a matching +// `Referer: https:///` — and supplies an arbitrary X-Workspace-ID naming +// a workspace it does not control, targeting a workspace it is NOT authorized +// to reach. It presents NO verified session cookie, NO admin token, NO org +// token. +// +// PR #1944's same-origin bypass would have classified this as a canvas user and +// skipped CanCommunicate, granting cross-workspace A2A — a privilege +// escalation. The safe fix must instead fall through to the standard +// peer-token contract and CanCommunicate, which rejects the cross-hierarchy +// call with 403. This test proves the escalation is closed. +func TestProxyA2A_ForgedSameOrigin_CannotBypassCanCommunicate(t *testing.T) { + if os.Getenv("CANVAS_PROXY_URL") == "" { + cmd := exec.Command(os.Args[0], "-test.run=^TestProxyA2A_ForgedSameOrigin_CannotBypassCanCommunicate$", "-test.v") + cmd.Env = append(os.Environ(), "CANVAS_PROXY_URL=http://localhost") + out, err := cmd.CombinedOutput() + if err != nil { + t.Fatalf("subprocess test failed: %v\n%s", err, out) + } + return + } + + // SaaS image with CP session verification configured. The stub CP rejects + // any cookie as a non-member; the attacker sends none anyway. This asserts + // that with verification configured, same-origin alone is NOT a canvas + // signal (CPSessionConfigured()==true disables the dev fallback). + stubVerifiedCPSession(t, false) + + mock := setupTestDB(t) + setupTestRedis(t) + broadcaster := newTestBroadcaster() + handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir()) + + const wsTarget = "ws-victim-target" + const wsForgedCaller = "ws-attacker-caller" + + // validateCallerToken: not a genuine canvas user (no verified session, no + // admin/org token, and the dev same-origin fallback is disabled in SaaS). + // So it consults the peer-token contract: HasAnyLiveToken for the forged + // caller. Return 0 → tokenless legacy peer → grandfathered through token + // validation (isCanvasUser stays false). The request must then still be + // gated by CanCommunicate. + mock.ExpectQuery(`SELECT COUNT\(\*\) FROM workspace_auth_tokens`). + WithArgs(wsForgedCaller). + WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(0)) + + // CanCommunicate MUST run (the escalation guard) and DENY: caller and + // target sit under different parents. + mockCanCommunicate(mock, wsForgedCaller, wsTarget, false) + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Params = gin.Params{{Key: "id", Value: wsTarget}} + + body := `{"jsonrpc":"2.0","id":"exploit-1","method":"message/send","params":{"message":{"role":"user","parts":[{"text":"cross-workspace exploit"}]}}}` + req := httptest.NewRequest("POST", "/workspaces/"+wsTarget+"/a2a", bytes.NewBufferString(body)) + req.Header.Set("Content-Type", "application/json") + // Arbitrary caller workspace the attacker does not own. + req.Header.Set("X-Workspace-ID", wsForgedCaller) + // Forged same-origin signals (the #1944 bypass vector). + req.Host = "localhost" + req.Header.Set("Referer", "https://localhost/") + req.Header.Set("Origin", "https://localhost") + // No Cookie / Authorization — no genuine canvas credential. + c.Request = req + + handler.ProxyA2A(c) + + if w.Code != http.StatusForbidden { + t.Fatalf("ESCALATION NOT CLOSED: forged same-origin + arbitrary X-Workspace-ID "+ + "reached an unauthorized target with status %d (want 403): %s", w.Code, w.Body.String()) + } + var resp map[string]interface{} + if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil { + t.Fatalf("body not JSON: %v", err) + } + if !strings.Contains(fmt.Sprint(resp["error"]), "access denied") { + t.Errorf("expected an access-denied error from CanCommunicate, got %v", resp["error"]) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet sqlmock expectations — CanCommunicate must have been consulted: %v", err) + } +} + // TestProxyA2A_PushMode_NoShortCircuit verifies the symmetric contract: // a push-mode workspace (default) is NOT affected by the new short-circuit. // It still proceeds to resolveAgentURL + dispatch. Without this guard, a diff --git a/workspace-server/internal/middleware/wsauth_middleware.go b/workspace-server/internal/middleware/wsauth_middleware.go index 8ef2356c6..8ffd74405 100644 --- a/workspace-server/internal/middleware/wsauth_middleware.go +++ b/workspace-server/internal/middleware/wsauth_middleware.go @@ -412,3 +412,40 @@ func isSameOriginCanvas(c *gin.Context) bool { origin := c.GetHeader("Origin") return origin == "https://"+host || origin == "http://"+host } + +// cpSessionConfigured reports whether this platform is wired for upstream +// session-cookie verification — i.e. it runs as a SaaS tenant image with +// both CP_UPSTREAM_URL and MOLECULE_ORG_SLUG set. When false (self-hosted / +// dev), VerifiedCPSession can never succeed, so callers that want a +// non-forgeable canvas signal in SaaS while still working in dev can use +// this to decide whether the forgeable same-origin fallback is acceptable. +func cpSessionConfigured() bool { + return os.Getenv("CP_UPSTREAM_URL") != "" && tenantSlug() != "" +} + +// CPSessionConfigured is the exported form of cpSessionConfigured for callers +// outside this package (e.g. the A2A proxy's canvas-user classification). +func CPSessionConfigured() bool { + return cpSessionConfigured() +} + +// IsVerifiedCanvasSession returns true ONLY when the request carries a WorkOS +// session cookie that the control plane confirms belongs to a member of THIS +// tenant's org (via /cp/auth/tenant-member). Unlike IsSameOriginCanvas — whose +// Host/Referer/Origin inputs are trivially forgeable by any container on the +// Docker network and which is therefore documented as cosmetic-only (see +// AdminAuth / CanvasOrBearer comments above, #623/#194) — this is a real, +// upstream-verified authentication boundary. It is the correct gate for +// non-cosmetic actions such as A2A dispatch on behalf of a canvas user. +// +// Returns false (no network call) in self-hosted / dev deployments where +// CP_UPSTREAM_URL / MOLECULE_ORG_SLUG are unset; callers should treat that as +// "no verified canvas session available" and fall back accordingly. +func IsVerifiedCanvasSession(c *gin.Context) bool { + cookie := c.GetHeader("Cookie") + if cookie == "" { + return false + } + valid, _ := VerifiedCPSession(cookie) + return valid +}