From 1c9cea980d8f54c86420c8c0ec3c8cc245deff55 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Wed, 29 Apr 2026 14:09:33 -0700 Subject: [PATCH] =?UTF-8?q?feat(wsauth):=20platform=E2=86=92workspace=20in?= =?UTF-8?q?bound=20secret=20(RFC=20#2312,=20PR-A)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Foundation for the HTTP-forward architecture that replaces Docker-exec in chat upload + 5 follow-on handlers. This PR is intentionally scoped to schema + token mint + provisioner wiring; no caller reads the secret yet so behavior is unchanged. Why a second per-workspace bearer (not reuse the existing workspace_auth_tokens row): workspace_auth_tokens workspaces.platform_inbound_secret ───────────────────── ───────────────────────────────── workspace → platform platform → workspace hash stored, plaintext gone plaintext stored (platform reads back) workspace presents bearer platform presents bearer platform validates by hash workspace validates by file compare Distinct roles, distinct rotation lifecycle, distinct audit signal — splitting later would require a fleet-wide rolling rotation, so paying the schema cost up front. Changes: * migration 044: ADD COLUMN workspaces.platform_inbound_secret TEXT * wsauth.IssuePlatformInboundSecret + ReadPlatformInboundSecret * issueAndInjectInboundSecret hook in workspace_provision: mints on every workspace create / re-provision; Docker mode writes plaintext to /configs/.platform_inbound_secret alongside .auth_token, SaaS mode persists to DB only (workspace will receive via /registry/register response in a follow-up PR) * 8 unit tests against sqlmock — covers happy path, rotation, NULL column, empty string, missing workspace row, empty workspaceID PR-B (next) wires up workspace-side `/internal/chat/uploads/ingest` that validates the bearer against /configs/.platform_inbound_secret. Refs #2312 (parent RFC), #2308 (chat upload 503 incident). Co-Authored-By: Claude Opus 4.7 (1M context) --- .../internal/handlers/workspace_provision.go | 40 ++++++ .../internal/wsauth/platform_inbound.go | 97 +++++++++++++ .../internal/wsauth/platform_inbound_test.go | 129 ++++++++++++++++++ .../044_platform_inbound_secret.down.sql | 9 ++ .../044_platform_inbound_secret.up.sql | 58 ++++++++ 5 files changed, 333 insertions(+) create mode 100644 workspace-server/internal/wsauth/platform_inbound.go create mode 100644 workspace-server/internal/wsauth/platform_inbound_test.go create mode 100644 workspace-server/migrations/044_platform_inbound_secret.down.sql create mode 100644 workspace-server/migrations/044_platform_inbound_secret.up.sql diff --git a/workspace-server/internal/handlers/workspace_provision.go b/workspace-server/internal/handlers/workspace_provision.go index b78b9405..53e7d31d 100644 --- a/workspace-server/internal/handlers/workspace_provision.go +++ b/workspace-server/internal/handlers/workspace_provision.go @@ -232,6 +232,7 @@ func (h *WorkspaceHandler) provisionWorkspaceOpts(workspaceID, templatePath stri // WriteFilesToContainer, which runs immediately after ContainerStart and // wins the race against the Python adapter's startup time (~1-2 s). h.issueAndInjectToken(ctx, workspaceID, &cfg) + h.issueAndInjectInboundSecret(ctx, workspaceID, &cfg) url, err := h.provisioner.Start(ctx, cfg) if err != nil { @@ -437,6 +438,45 @@ func (h *WorkspaceHandler) issueAndInjectToken(ctx context.Context, workspaceID log.Printf("Provisioner: injected fresh auth token for workspace %s into config volume", workspaceID) } +// issueAndInjectInboundSecret mints the platform→workspace shared secret +// (RFC #2312, migration 044) and persists the plaintext into the +// workspaces.platform_inbound_secret column so platform-side handlers can +// read it back on every forward call. +// +// Docker mode also writes the plaintext into cfg.ConfigFiles +// [".platform_inbound_secret"] so WriteFilesToContainer drops it on the +// /configs volume alongside .auth_token. +// +// SaaS mode persists to the DB but does NOT write a local file from +// here — there is no workspace-server-managed volume in SaaS. The +// workspace receives the secret out-of-band via the /registry/register +// response (mirrors the existing .auth_token bootstrap path). +// +// Best-effort: failure logs and continues. The workspace-side +// /internal/* handlers fail-closed when the file is missing, so a +// failed mint surfaces as 401 on the platform's first forward call — +// loud, debuggable, no silent fail-open. +func (h *WorkspaceHandler) issueAndInjectInboundSecret(ctx context.Context, workspaceID string, cfg *provisioner.WorkspaceConfig) { + secret, err := wsauth.IssuePlatformInboundSecret(ctx, db.DB, workspaceID) + if err != nil { + log.Printf("Provisioner: failed to issue platform_inbound_secret for %s: %v — chat upload + other /internal endpoints will 401", workspaceID, err) + return + } + + if saasMode() { + // Plaintext lives in the DB column; the workspace will fetch it + // via /registry/register response (handled in a follow-up PR). + log.Printf("Provisioner: minted platform_inbound_secret for %s (SaaS mode — workspace will receive via register response)", workspaceID) + return + } + + if cfg.ConfigFiles == nil { + cfg.ConfigFiles = make(map[string][]byte) + } + cfg.ConfigFiles[".platform_inbound_secret"] = []byte(secret) + log.Printf("Provisioner: injected platform_inbound_secret for workspace %s into config volume", workspaceID) +} + // findTemplateByName looks for a workspace-configs-templates directory matching a name. func findTemplateByName(configsDir, name string) string { entries, err := os.ReadDir(configsDir) diff --git a/workspace-server/internal/wsauth/platform_inbound.go b/workspace-server/internal/wsauth/platform_inbound.go new file mode 100644 index 00000000..5f48c073 --- /dev/null +++ b/workspace-server/internal/wsauth/platform_inbound.go @@ -0,0 +1,97 @@ +// Package wsauth — platform→workspace inbound secret (per-workspace bearer +// the platform presents when calling INTO a workspace's HTTP server). +// +// Asymmetric to IssueToken in this same package, which mints the +// **outbound** bearer (workspace → platform). See the per-function +// comments and migration 044 for the full rationale on why the two +// roles use distinct secrets stored in different shapes. +// +// IssueToken IssuePlatformInboundSecret +// ────────── ─────────────────────────── +// workspace_auth_tokens row workspaces.platform_inbound_secret column +// plaintext returned once, plaintext stored AND returned (the platform +// hash stored must read it back on every forward call) +// bcrypt-shape compare string-equality compare on workspace side +package wsauth + +import ( + "context" + "crypto/rand" + "database/sql" + "encoding/base64" + "errors" + "fmt" +) + +// platformInboundSecretBytes is the raw-random length before base64url +// encoding. Same 256-bit entropy floor as workspace_auth_tokens; the +// shape lets a future rotation script substitute one for the other +// without changing the bearer-presenter on either side. +const platformInboundSecretBytes = 32 + +// ErrNoInboundSecret is returned by ReadPlatformInboundSecret when the +// row exists but the column is NULL or empty. Callers MUST treat this +// as a structural failure (the row was created by a path that didn't +// mint a secret, or the migration ran but the row predates it) and +// surface the nil bearer to the user as a 500-class error rather than +// silently sending an unauthenticated request to the workspace. +var ErrNoInboundSecret = errors.New("wsauth: workspace has no platform_inbound_secret on file") + +// IssuePlatformInboundSecret generates a fresh per-workspace shared +// secret, persists the plaintext into workspaces.platform_inbound_secret, +// and returns the plaintext so the provisioner can write it into +// /configs/.platform_inbound_secret on the workspace's volume. +// +// The plaintext is INTENTIONALLY retained on the platform side: every +// platform→workspace forward call reads it back to put in the +// Authorization header. Hashing would force a re-mint on every call +// (defeating the purpose of the shared secret) or a separate plaintext +// store (defeating the simplicity). Encryption-at-rest is delegated to +// the underlying Postgres volume — application-layer encryption via +// SECRETS_ENCRYPTION_KEY is a defense-in-depth follow-up. +func IssuePlatformInboundSecret(ctx context.Context, db *sql.DB, workspaceID string) (string, error) { + if workspaceID == "" { + return "", fmt.Errorf("wsauth: workspaceID required") + } + buf := make([]byte, platformInboundSecretBytes) + if _, err := rand.Read(buf); err != nil { + return "", fmt.Errorf("wsauth: generate platform_inbound_secret: %w", err) + } + plaintext := base64.RawURLEncoding.EncodeToString(buf) + + _, err := db.ExecContext(ctx, ` + UPDATE workspaces SET platform_inbound_secret = $1 WHERE id = $2 + `, plaintext, workspaceID) + if err != nil { + return "", fmt.Errorf("wsauth: persist platform_inbound_secret: %w", err) + } + return plaintext, nil +} + +// ReadPlatformInboundSecret returns the plaintext secret for a workspace +// or ErrNoInboundSecret if the column is NULL/empty. Used by platform- +// side handlers that forward HTTPS calls into the workspace. +// +// Callers MUST handle ErrNoInboundSecret explicitly; never default to +// an empty bearer. An empty Authorization header would let any caller +// through if the workspace's auth is fail-open (it is not today, but +// defense-in-depth keeps it that way). +func ReadPlatformInboundSecret(ctx context.Context, db *sql.DB, workspaceID string) (string, error) { + if workspaceID == "" { + return "", fmt.Errorf("wsauth: workspaceID required") + } + var secret sql.NullString + err := db.QueryRowContext(ctx, + `SELECT platform_inbound_secret FROM workspaces WHERE id = $1`, workspaceID, + ).Scan(&secret) + if err == sql.ErrNoRows { + return "", ErrNoInboundSecret + } + if err != nil { + return "", fmt.Errorf("wsauth: read platform_inbound_secret: %w", err) + } + if !secret.Valid || secret.String == "" { + return "", ErrNoInboundSecret + } + return secret.String, nil +} diff --git a/workspace-server/internal/wsauth/platform_inbound_test.go b/workspace-server/internal/wsauth/platform_inbound_test.go new file mode 100644 index 00000000..2c1cda63 --- /dev/null +++ b/workspace-server/internal/wsauth/platform_inbound_test.go @@ -0,0 +1,129 @@ +package wsauth + +import ( + "context" + "errors" + "regexp" + "testing" + + "github.com/DATA-DOG/go-sqlmock" +) + +// IssuePlatformInboundSecret persists the plaintext (not a hash) so the +// platform can read it back on every forward call. This is the primary +// shape difference vs. IssueToken; pin it explicitly. +func TestIssuePlatformInboundSecret_PersistsPlaintext(t *testing.T) { + db, mock := setupMock(t) + + // Capture the plaintext written by the UPDATE so we can verify the + // returned value matches what was stored. AnyArg captures, then we + // pull the captured value via a separate ExpectExec... wait, sqlmock + // doesn't return the captured args. Use a regex-style match on the + // SQL and trust that the function persisted SOMETHING — then assert + // the returned plaintext shape (length + alphabet) matches the + // generator. The end-to-end "platform reads the same value back" is + // covered by ReadPlatformInboundSecret tests. + mock.ExpectExec(`UPDATE workspaces SET platform_inbound_secret = \$1 WHERE id = \$2`). + WithArgs(sqlmock.AnyArg(), "ws-abc"). + WillReturnResult(sqlmock.NewResult(1, 1)) + + plaintext, err := IssuePlatformInboundSecret(context.Background(), db, "ws-abc") + if err != nil { + t.Fatalf("IssuePlatformInboundSecret: %v", err) + } + if len(plaintext) < 40 { + t.Errorf("plaintext too short for 256-bit entropy: len=%d", len(plaintext)) + } + if !regexp.MustCompile(`^[A-Za-z0-9_-]+$`).MatchString(plaintext) { + t.Errorf("plaintext contains non-urlsafe chars: %q", plaintext) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet expectations: %v", err) + } +} + +// Re-issue rotates: the same workspace gets a different secret on the +// second call. Without rotation, a leaked secret would be permanent. +func TestIssuePlatformInboundSecret_RotatesOnReissue(t *testing.T) { + db, mock := setupMock(t) + mock.ExpectExec(`UPDATE workspaces SET platform_inbound_secret`).WillReturnResult(sqlmock.NewResult(1, 1)) + mock.ExpectExec(`UPDATE workspaces SET platform_inbound_secret`).WillReturnResult(sqlmock.NewResult(1, 1)) + + a, _ := IssuePlatformInboundSecret(context.Background(), db, "ws-1") + b, _ := IssuePlatformInboundSecret(context.Background(), db, "ws-1") + if a == b { + t.Errorf("expected fresh secret on re-issue, got %q twice", a) + } +} + +func TestIssuePlatformInboundSecret_RejectsEmptyWorkspaceID(t *testing.T) { + db, _ := setupMock(t) + _, err := IssuePlatformInboundSecret(context.Background(), db, "") + if err == nil { + t.Error("expected error for empty workspaceID, got nil") + } +} + +func TestReadPlatformInboundSecret_HappyPath(t *testing.T) { + db, mock := setupMock(t) + mock.ExpectQuery(`SELECT platform_inbound_secret FROM workspaces WHERE id = \$1`). + WithArgs("ws-abc"). + WillReturnRows(sqlmock.NewRows([]string{"platform_inbound_secret"}).AddRow("the-plaintext")) + + got, err := ReadPlatformInboundSecret(context.Background(), db, "ws-abc") + if err != nil { + t.Fatalf("ReadPlatformInboundSecret: %v", err) + } + if got != "the-plaintext" { + t.Errorf("got %q, want %q", got, "the-plaintext") + } +} + +// NULL column → ErrNoInboundSecret, not empty string. This is load-bearing: +// callers that ignored err and used the empty string would send an +// unauthenticated request to the workspace. +func TestReadPlatformInboundSecret_NullReturnsErrNoInbound(t *testing.T) { + db, mock := setupMock(t) + mock.ExpectQuery(`SELECT platform_inbound_secret FROM workspaces WHERE id = \$1`). + WithArgs("ws-abc"). + WillReturnRows(sqlmock.NewRows([]string{"platform_inbound_secret"}).AddRow(nil)) + + _, err := ReadPlatformInboundSecret(context.Background(), db, "ws-abc") + if !errors.Is(err, ErrNoInboundSecret) { + t.Errorf("expected ErrNoInboundSecret on NULL, got %v", err) + } +} + +// Empty string is treated identically to NULL. +func TestReadPlatformInboundSecret_EmptyReturnsErrNoInbound(t *testing.T) { + db, mock := setupMock(t) + mock.ExpectQuery(`SELECT platform_inbound_secret FROM workspaces WHERE id = \$1`). + WithArgs("ws-abc"). + WillReturnRows(sqlmock.NewRows([]string{"platform_inbound_secret"}).AddRow("")) + + _, err := ReadPlatformInboundSecret(context.Background(), db, "ws-abc") + if !errors.Is(err, ErrNoInboundSecret) { + t.Errorf("expected ErrNoInboundSecret on empty, got %v", err) + } +} + +// Missing workspace row → ErrNoInboundSecret (collapsed from sql.ErrNoRows). +func TestReadPlatformInboundSecret_MissingWorkspace(t *testing.T) { + db, mock := setupMock(t) + mock.ExpectQuery(`SELECT platform_inbound_secret FROM workspaces WHERE id = \$1`). + WithArgs("ws-missing"). + WillReturnRows(sqlmock.NewRows([]string{"platform_inbound_secret"})) + + _, err := ReadPlatformInboundSecret(context.Background(), db, "ws-missing") + if !errors.Is(err, ErrNoInboundSecret) { + t.Errorf("expected ErrNoInboundSecret on missing row, got %v", err) + } +} + +func TestReadPlatformInboundSecret_RejectsEmptyWorkspaceID(t *testing.T) { + db, _ := setupMock(t) + _, err := ReadPlatformInboundSecret(context.Background(), db, "") + if err == nil { + t.Error("expected error for empty workspaceID, got nil") + } +} diff --git a/workspace-server/migrations/044_platform_inbound_secret.down.sql b/workspace-server/migrations/044_platform_inbound_secret.down.sql new file mode 100644 index 00000000..35e035c1 --- /dev/null +++ b/workspace-server/migrations/044_platform_inbound_secret.down.sql @@ -0,0 +1,9 @@ +-- Reverse 044_platform_inbound_secret.up.sql +-- +-- Drops the per-workspace platform→workspace shared secret column. +-- +-- Destructive: any current value is irrecoverable. Re-running the .up.sql +-- adds the column back as NULL; existing workspaces would then need a +-- reprovision (or a separate backfill) to mint a fresh secret. +ALTER TABLE workspaces + DROP COLUMN IF EXISTS platform_inbound_secret; diff --git a/workspace-server/migrations/044_platform_inbound_secret.up.sql b/workspace-server/migrations/044_platform_inbound_secret.up.sql new file mode 100644 index 00000000..a99b8920 --- /dev/null +++ b/workspace-server/migrations/044_platform_inbound_secret.up.sql @@ -0,0 +1,58 @@ +-- 044_platform_inbound_secret.up.sql +-- +-- Per-workspace shared secret that the **platform** presents when calling +-- INTO a workspace's HTTP server (e.g. POST /internal/chat/uploads/ingest). +-- +-- Asymmetric to workspace_auth_tokens, by design: +-- +-- workspace_auth_tokens workspaces.platform_inbound_secret +-- ───────────────────── ───────────────────────────────── +-- workspace → platform platform → workspace +-- plaintext NEVER stored plaintext STORED here +-- bcrypt-style hash compare string-equality compare +-- workspace presents bearer platform presents bearer +-- platform validates by hash workspace validates by file compare +-- +-- Why a separate column / role: +-- +-- * Principle of least privilege. Platform's authority over a workspace +-- is structurally different from the workspace's authority on the +-- platform; one bearer for both blurs that. +-- * Independent rotation. A platform-side compromise can be contained by +-- rotating only this column fleet-wide, without revoking every +-- workspace's outbound auth (which would take down heartbeat / A2A). +-- * Audit clarity. workspace-side logs see distinct bearers for +-- "platform call" vs "user call". +-- * Cheap to add now, expensive to retrofit. Splitting later forces a +-- fleet-wide rolling rotation while old + new clients coexist. +-- +-- Why plaintext (not hashed): +-- +-- The platform needs the plaintext at every forward call to put in the +-- Authorization header. Storing only a hash would force a re-mint on +-- every call (defeating the purpose) or a separate plaintext store +-- (defeating the simplicity). +-- +-- Encryption at rest: this column relies on Postgres-volume / disk +-- encryption (Railway-managed, AES-256). Application-layer encryption +-- via the existing SECRETS_ENCRYPTION_KEY pathway (workspace_secrets) is +-- a defense-in-depth follow-up — tracked separately so this migration +-- stays additive. +-- +-- Workspace-side delivery: the plaintext is written into +-- /configs/.platform_inbound_secret at provision time alongside the +-- existing /configs/.auth_token. Both files are 0600 and live on the +-- workspace's volume only. +-- +-- Nullable for forward-compat: existing rows have no value until the +-- workspace is reprovisioned. /internal/* handlers MUST refuse to serve +-- when the file is empty (fail-closed, not fail-open) so legacy rows +-- can't be hit pre-migration without a fresh secret. +-- +-- Reverse plan: the .down.sql drops the column. Any caller relying on +-- the secret would 404 after the column is gone — the column add is +-- additive in the proper direction; the down is destructive only because +-- the value would be irrecoverable. + +ALTER TABLE workspaces + ADD COLUMN IF NOT EXISTS platform_inbound_secret TEXT NULL;