fix(a2a): canvas-user identity bypass without cross-workspace escalation (#1673) #1948

Merged
hongming merged 1 commits from fix/canvas-user-verified-session-1673 into main 2026-05-27 14:19:49 +00:00
3 changed files with 319 additions and 31 deletions
@@ -426,16 +426,34 @@ func nilIfEmpty(s string) *string {
// (their next /registry/register will mint their first token, after
// which this branch never fires again for them).
//
// Post-RFC#637 addition: when the tokenless workspace is accompanied by
// canvas or admin auth (same-origin request, admin bearer, or org-level
// token), the caller is identified as a canvas-user identity rather than
// a legacy peer agent. The returned isCanvasUser flag lets the A2A proxy
// bypass CanCommunicate for human users, who sit outside the workspace
// hierarchy.
// Post-RFC#637 addition: a request may instead be carrying a HUMAN's
// canvas-user identity (e.g. the 344a2623-… identity workspace from the
// RFC#637 rollout). That human sits OUTSIDE the workspace org hierarchy, so
// the returned isCanvasUser flag lets the A2A proxy bypass CanCommunicate for
// it. Canvas-user classification is decided by isGenuineCanvasUser using
// NON-FORGEABLE credentials only (see that function) — never by the caller's
// X-Workspace-ID alone, and never by a bare same-origin Host/Referer in a
// SaaS image (those are forgeable; see middleware.IsSameOriginCanvas).
//
// #1673: this canvas-user check is now evaluated BEFORE the HasAnyLiveToken
// peer-token contract. Previously it lived only in the !hasLive branch, so a
// canvas-user identity workspace that had acquired live tokens fell into the
// hasLive=true branch, which demands a bearer the canvas frontend never sends
// → silent 401 → the message was dropped before logA2AReceiveQueued wrote the
// activity_logs row, breaking canvas chat for poll-mode workspaces. A genuine
// canvas user is identified by the human's session/admin/org credential, which
// is independent of whether the identity workspace happens to hold peer tokens.
//
// On auth failure this writes the 401 via c and returns an error so the
// handler aborts without running the proxy.
func validateCallerToken(ctx context.Context, c *gin.Context, callerID string) (isCanvasUser bool, err error) {
// Genuine canvas-user identity? Decided independently of the caller
// workspace's token state (the #1673 fix) and using only non-forgeable
// signals (the #1944 escalation guard).
if isGenuineCanvasUser(ctx, c) {
return true, nil
}
hasLive, dbErr := wsauth.HasAnyLiveToken(ctx, db.DB, callerID)
if dbErr != nil {
// Fail-open here matches the heartbeat path — A2A caller auth is
@@ -446,22 +464,10 @@ func validateCallerToken(ctx context.Context, c *gin.Context, callerID string) (
return false, nil
}
if !hasLive {
// Tokenless workspace — could be legacy/pre-upgrade caller or
// canvas-user identity. Distinguish by request auth signals.
if middleware.IsSameOriginCanvas(c) {
return true, nil
}
tok := wsauth.BearerTokenFromHeader(c.GetHeader("Authorization"))
if tok != "" {
adminSecret := os.Getenv("ADMIN_TOKEN")
if adminSecret != "" && subtle.ConstantTimeCompare([]byte(tok), []byte(adminSecret)) == 1 {
return true, nil
}
if _, _, _, err := orgtoken.Validate(ctx, db.DB, tok); err == nil {
return true, nil
}
}
return false, nil // legacy / pre-upgrade caller
// Tokenless, non-canvas-user workspace — legacy / pre-upgrade peer.
// Grandfather it through (its next /registry/register mints its
// first token, after which it lands in the hasLive=true branch).
return false, nil
}
tok := wsauth.BearerTokenFromHeader(c.GetHeader("Authorization"))
if tok == "" {
@@ -475,6 +481,61 @@ func validateCallerToken(ctx context.Context, c *gin.Context, callerID string) (
return false, nil
}
// isGenuineCanvasUser reports whether the request is a real human acting
// through the canvas UI (RFC#637 canvas-user identity), as opposed to a peer
// workspace agent. A true result lets the A2A proxy bypass CanCommunicate, so
// it MUST only accept signals an attacker on the platform network cannot forge:
//
// - A control-plane-verified canvas session: the WorkOS session cookie is
// confirmed upstream to belong to a MEMBER of THIS tenant's org
// (middleware.IsVerifiedCanvasSession → /cp/auth/tenant-member). This is
// the production SaaS canvas path.
// - An Authorization: Bearer matching ADMIN_TOKEN (break-glass / molecli).
// - An Authorization: Bearer matching a live org_api_tokens row (user-minted
// org-scoped API token).
//
// Deliberately NOT accepted as a canvas-user signal in a SaaS image:
//
// - A bare same-origin Host/Referer/Origin (middleware.IsSameOriginCanvas).
// Those headers are trivially forgeable by any container on the Docker
// network, and the combined-tenant image (CANVAS_PROXY_URL set) is exactly
// where a forged Referer + an arbitrary X-Workspace-ID could otherwise
// bypass CanCommunicate and reach cross-workspace A2A — the PR #1944
// privilege escalation. Same-origin is only honored as a fallback when CP
// session verification is NOT configured (self-hosted / dev), a
// single-tenant topology with no cross-tenant boundary to escalate across;
// even there the org hierarchy still owns intra-org routing.
//
// Note this classification is about the human's credential, not the caller
// workspace's X-Workspace-ID — so it never trusts an attacker-supplied caller
// ID, and it is independent of whether that workspace holds peer tokens.
func isGenuineCanvasUser(ctx context.Context, c *gin.Context) bool {
// Production SaaS: control-plane-verified org-member session cookie.
if middleware.IsVerifiedCanvasSession(c) {
return true
}
if tok := wsauth.BearerTokenFromHeader(c.GetHeader("Authorization")); tok != "" {
adminSecret := os.Getenv("ADMIN_TOKEN")
if adminSecret != "" && subtle.ConstantTimeCompare([]byte(tok), []byte(adminSecret)) == 1 {
return true
}
if _, _, _, err := orgtoken.Validate(ctx, db.DB, tok); err == nil {
return true
}
}
// Self-hosted / dev fallback ONLY: when upstream session verification is
// not configured there is no verified-cookie signal to use, and the
// deployment is single-tenant, so the forgeable same-origin check is an
// acceptable canvas signal. In SaaS (CP session configured) this branch is
// skipped, closing the forged-same-origin escalation.
if !middleware.CPSessionConfigured() && middleware.IsSameOriginCanvas(c) {
return true
}
return false
}
// errInvalidCallerToken is a sentinel for validateCallerToken's "missing
// token" branch so the handler-level guard can detect it without string
// matching (the wsauth errors are typed for the invalid case).
@@ -11,6 +11,7 @@ import (
"net/http"
"net/http/httptest"
"os"
"os/exec"
"strings"
"testing"
"time"
@@ -1244,13 +1245,12 @@ func TestValidateCallerToken_WrongWorkspaceBindingRejected(t *testing.T) {
}
func TestValidateCallerToken_CanvasUser_AdminToken(t *testing.T) {
mock := setupTestDB(t)
setupTestDB(t)
setupTestRedis(t)
// Tokenless workspace
mock.ExpectQuery(`SELECT COUNT\(\*\) FROM workspace_auth_tokens`).
WithArgs("ws-canvas-admin").
WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(0))
// #1673/#1944: the genuine-canvas-user check (admin bearer here) now runs
// BEFORE HasAnyLiveToken, so no SELECT COUNT(*) is issued — the human's
// credential, not the caller workspace's token state, decides canvas-user.
t.Setenv("ADMIN_TOKEN", "admin-secret-42")
@@ -1276,10 +1276,9 @@ func TestValidateCallerToken_CanvasUser_OrgToken(t *testing.T) {
mock := setupTestDB(t)
setupTestRedis(t)
// Tokenless workspace
mock.ExpectQuery(`SELECT COUNT\(\*\) FROM workspace_auth_tokens`).
WithArgs("ws-canvas-org").
WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(0))
// #1673/#1944: the genuine-canvas-user check (org token here) now runs
// BEFORE HasAnyLiveToken, so the first DB query is orgtoken.Validate's
// lookup — there is no SELECT COUNT(*) expectation anymore.
// orgtoken.Validate lookup
mock.ExpectQuery(`SELECT id, prefix, org_id FROM org_api_tokens WHERE token_hash = .* AND revoked_at IS NULL`).
@@ -2341,6 +2340,197 @@ func TestProxyA2A_PollMode_ShortCircuits_NoSSRF_NoDispatch(t *testing.T) {
}
}
// stubVerifiedCPSession points VerifiedCPSession at a stub control-plane that
// confirms the given cookie belongs to a tenant-member, so tests can exercise
// the genuine (non-forgeable) canvas-session path end-to-end without a live CP.
// It sets CP_UPSTREAM_URL + MOLECULE_ORG_SLUG for the test's lifetime; the
// real middleware.VerifiedCPSession HTTP+cache code path runs unchanged.
func stubVerifiedCPSession(t *testing.T, member bool) {
t.Helper()
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "application/json")
if member {
fmt.Fprint(w, `{"member":true,"user_id":"user-canvas-1"}`)
} else {
w.WriteHeader(http.StatusForbidden)
fmt.Fprint(w, `{"member":false}`)
}
}))
t.Cleanup(srv.Close)
t.Setenv("CP_UPSTREAM_URL", srv.URL)
t.Setenv("MOLECULE_ORG_SLUG", "test-tenant")
}
// TestProxyA2A_PollMode_CanvasUserWithVerifiedSession is the #1673 regression
// guard. A poll-mode canvas-user identity workspace that HAS acquired live
// tokens (the exact condition that made #1673 fire) sends a canvas message
// carrying a control-plane-verified session cookie but no bearer token. The
// fix must classify it as a canvas user BEFORE the HasAnyLiveToken peer-token
// contract, so the request is queued (200) and logA2AReceiveQueued writes the
// activity_logs row — instead of the pre-fix silent 401 that dropped the
// message before any row landed (breaking canvas chat + chat-history).
//
// Runs in a subprocess with CANVAS_PROXY_URL set so middleware.canvasProxyActive
// is true at package-init time (matching the combined-tenant image), proving the
// fix does not depend on disabling same-origin detection.
func TestProxyA2A_PollMode_CanvasUserWithVerifiedSession(t *testing.T) {
if os.Getenv("CANVAS_PROXY_URL") == "" {
cmd := exec.Command(os.Args[0], "-test.run=^TestProxyA2A_PollMode_CanvasUserWithVerifiedSession$", "-test.v")
cmd.Env = append(os.Environ(), "CANVAS_PROXY_URL=http://localhost")
out, err := cmd.CombinedOutput()
if err != nil {
t.Fatalf("subprocess test failed: %v\n%s", err, out)
}
return
}
stubVerifiedCPSession(t, true)
mock := setupTestDB(t)
setupTestRedis(t)
broadcaster := newTestBroadcaster()
handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir())
const wsTarget = "ws-poll-canvas-target"
const wsCanvasUser = "ws-canvas-user-344a"
// CRUCIAL: no SELECT COUNT(*) FROM workspace_auth_tokens expectation. The
// genuine-canvas-user check (verified session) must short-circuit BEFORE
// HasAnyLiveToken — that is the #1673 regression path. An identity
// workspace that already holds live tokens must NOT fall into the
// hasLive=true bearer-required branch.
// isCanvasUser=true → CanCommunicate is skipped (no parent_id lookups).
expectBudgetCheck(mock, wsTarget)
mock.ExpectQuery("SELECT delivery_mode FROM workspaces WHERE id").
WithArgs(wsTarget).
WillReturnRows(sqlmock.NewRows([]string{"delivery_mode"}).AddRow("poll"))
// logA2AReceiveQueued must fire synchronously and write the row.
mock.ExpectExec("INSERT INTO activity_logs").
WillReturnResult(sqlmock.NewResult(0, 1))
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
c.Params = gin.Params{{Key: "id", Value: wsTarget}}
body := `{"jsonrpc":"2.0","id":"canvas-1","method":"message/send","params":{"message":{"role":"user","parts":[{"text":"hello from canvas"}]}}}`
req := httptest.NewRequest("POST", "/workspaces/"+wsTarget+"/a2a", bytes.NewBufferString(body))
req.Header.Set("Content-Type", "application/json")
req.Header.Set("X-Workspace-ID", wsCanvasUser)
// Verified canvas session cookie (the genuine, non-forgeable signal).
req.Header.Set("Cookie", "wos-session=valid-canvas-session-cookie")
// Same-origin headers, present as a real canvas request would send them —
// but they are NOT what authorizes the bypass here (the verified session is).
req.Host = "localhost"
req.Header.Set("Referer", "https://localhost/")
c.Request = req
handler.ProxyA2A(c)
time.Sleep(50 * time.Millisecond)
if w.Code != http.StatusOK {
t.Fatalf("expected 200 (queued) for canvas-user with verified session, got %d: %s", w.Code, w.Body.String())
}
var resp map[string]interface{}
if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
t.Fatalf("response is not valid JSON: %v", err)
}
if resp["status"] != "queued" {
t.Errorf("response.status = %v, want %q", resp["status"], "queued")
}
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("unmet sqlmock expectations (activity_logs row must be written): %v", err)
}
}
// TestProxyA2A_ForgedSameOrigin_CannotBypassCanCommunicate is the security
// crux of the #1673 fix and the reason PR #1944 was held. In the combined-
// tenant SaaS image (CANVAS_PROXY_URL set, CP session verification configured),
// an attacker forges a same-origin request — correct Host + a matching
// `Referer: https://<host>/` — and supplies an arbitrary X-Workspace-ID naming
// a workspace it does not control, targeting a workspace it is NOT authorized
// to reach. It presents NO verified session cookie, NO admin token, NO org
// token.
//
// PR #1944's same-origin bypass would have classified this as a canvas user and
// skipped CanCommunicate, granting cross-workspace A2A — a privilege
// escalation. The safe fix must instead fall through to the standard
// peer-token contract and CanCommunicate, which rejects the cross-hierarchy
// call with 403. This test proves the escalation is closed.
func TestProxyA2A_ForgedSameOrigin_CannotBypassCanCommunicate(t *testing.T) {
if os.Getenv("CANVAS_PROXY_URL") == "" {
cmd := exec.Command(os.Args[0], "-test.run=^TestProxyA2A_ForgedSameOrigin_CannotBypassCanCommunicate$", "-test.v")
cmd.Env = append(os.Environ(), "CANVAS_PROXY_URL=http://localhost")
out, err := cmd.CombinedOutput()
if err != nil {
t.Fatalf("subprocess test failed: %v\n%s", err, out)
}
return
}
// SaaS image with CP session verification configured. The stub CP rejects
// any cookie as a non-member; the attacker sends none anyway. This asserts
// that with verification configured, same-origin alone is NOT a canvas
// signal (CPSessionConfigured()==true disables the dev fallback).
stubVerifiedCPSession(t, false)
mock := setupTestDB(t)
setupTestRedis(t)
broadcaster := newTestBroadcaster()
handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir())
const wsTarget = "ws-victim-target"
const wsForgedCaller = "ws-attacker-caller"
// validateCallerToken: not a genuine canvas user (no verified session, no
// admin/org token, and the dev same-origin fallback is disabled in SaaS).
// So it consults the peer-token contract: HasAnyLiveToken for the forged
// caller. Return 0 → tokenless legacy peer → grandfathered through token
// validation (isCanvasUser stays false). The request must then still be
// gated by CanCommunicate.
mock.ExpectQuery(`SELECT COUNT\(\*\) FROM workspace_auth_tokens`).
WithArgs(wsForgedCaller).
WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(0))
// CanCommunicate MUST run (the escalation guard) and DENY: caller and
// target sit under different parents.
mockCanCommunicate(mock, wsForgedCaller, wsTarget, false)
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
c.Params = gin.Params{{Key: "id", Value: wsTarget}}
body := `{"jsonrpc":"2.0","id":"exploit-1","method":"message/send","params":{"message":{"role":"user","parts":[{"text":"cross-workspace exploit"}]}}}`
req := httptest.NewRequest("POST", "/workspaces/"+wsTarget+"/a2a", bytes.NewBufferString(body))
req.Header.Set("Content-Type", "application/json")
// Arbitrary caller workspace the attacker does not own.
req.Header.Set("X-Workspace-ID", wsForgedCaller)
// Forged same-origin signals (the #1944 bypass vector).
req.Host = "localhost"
req.Header.Set("Referer", "https://localhost/")
req.Header.Set("Origin", "https://localhost")
// No Cookie / Authorization — no genuine canvas credential.
c.Request = req
handler.ProxyA2A(c)
if w.Code != http.StatusForbidden {
t.Fatalf("ESCALATION NOT CLOSED: forged same-origin + arbitrary X-Workspace-ID "+
"reached an unauthorized target with status %d (want 403): %s", w.Code, w.Body.String())
}
var resp map[string]interface{}
if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
t.Fatalf("body not JSON: %v", err)
}
if !strings.Contains(fmt.Sprint(resp["error"]), "access denied") {
t.Errorf("expected an access-denied error from CanCommunicate, got %v", resp["error"])
}
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("unmet sqlmock expectations — CanCommunicate must have been consulted: %v", err)
}
}
// TestProxyA2A_PushMode_NoShortCircuit verifies the symmetric contract:
// a push-mode workspace (default) is NOT affected by the new short-circuit.
// It still proceeds to resolveAgentURL + dispatch. Without this guard, a
@@ -412,3 +412,40 @@ func isSameOriginCanvas(c *gin.Context) bool {
origin := c.GetHeader("Origin")
return origin == "https://"+host || origin == "http://"+host
}
// cpSessionConfigured reports whether this platform is wired for upstream
// session-cookie verification — i.e. it runs as a SaaS tenant image with
// both CP_UPSTREAM_URL and MOLECULE_ORG_SLUG set. When false (self-hosted /
// dev), VerifiedCPSession can never succeed, so callers that want a
// non-forgeable canvas signal in SaaS while still working in dev can use
// this to decide whether the forgeable same-origin fallback is acceptable.
func cpSessionConfigured() bool {
return os.Getenv("CP_UPSTREAM_URL") != "" && tenantSlug() != ""
}
// CPSessionConfigured is the exported form of cpSessionConfigured for callers
// outside this package (e.g. the A2A proxy's canvas-user classification).
func CPSessionConfigured() bool {
return cpSessionConfigured()
}
// IsVerifiedCanvasSession returns true ONLY when the request carries a WorkOS
// session cookie that the control plane confirms belongs to a member of THIS
// tenant's org (via /cp/auth/tenant-member). Unlike IsSameOriginCanvas — whose
// Host/Referer/Origin inputs are trivially forgeable by any container on the
// Docker network and which is therefore documented as cosmetic-only (see
// AdminAuth / CanvasOrBearer comments above, #623/#194) — this is a real,
// upstream-verified authentication boundary. It is the correct gate for
// non-cosmetic actions such as A2A dispatch on behalf of a canvas user.
//
// Returns false (no network call) in self-hosted / dev deployments where
// CP_UPSTREAM_URL / MOLECULE_ORG_SLUG are unset; callers should treat that as
// "no verified canvas session available" and fall back accordingly.
func IsVerifiedCanvasSession(c *gin.Context) bool {
cookie := c.GetHeader("Cookie")
if cookie == "" {
return false
}
valid, _ := VerifiedCPSession(cookie)
return valid
}