Merge pull request #2361 from Molecule-AI/staging

staging → main: auto-promote 92a29bb
This commit is contained in:
github-actions[bot] 2026-04-30 07:41:41 -07:00 committed by GitHub
commit 21313dcb07
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
48 changed files with 3725 additions and 398 deletions

View File

@@ -76,6 +76,27 @@ on:
permissions:
contents: write
pull-requests: write
# actions: write is needed by the post-merge dispatch tail step
# (#2358 / #2357) — `gh workflow run publish-workspace-server-image.yml`
# POSTs to /actions/workflows/.../dispatches which requires this scope.
# Without it the call 403s and the publish/canary/redeploy chain still
# doesn't run on staging→main promotions, undoing #2358.
actions: write
# Serialize auto-promote runs. Multiple staging gate completions can land
# in quick succession (CI + E2E + CodeQL all finish within seconds of
# each other on a green PR) — without this, two parallel runs would each:
# 1. Open / re-use the same promote PR.
# 2. Call `gh pr merge --auto` (idempotent — fine).
# 3. Poll for the same mergedAt and dispatch `gh workflow run` publish
#    → 2× redundant publish builds racing for the same `:staging-latest`
#      retag, and 2× canary-verify chains.
# cancel-in-progress: false because we don't want a brand-new run to kill
# a polling-tail that's about to dispatch — the polling tail's 30 min cap
# is the right backstop, not workflow-level cancel.
concurrency:
group: auto-promote-staging
cancel-in-progress: false
jobs:
check-all-gates-green:
@@ -271,19 +292,29 @@ jobs:
PR_NUM: ${{ steps.promote_pr.outputs.promote_pr_num }}
run: |
# Poll for merge — max 30 min (60 × 30s). The merge queue
# typically lands within 5-10 min when gates are green.
# typically lands within 5-10 min when gates are green. Break
# early if the PR is closed without merging (operator action,
# gates flipped red post-approval, branch-protection rejection)
# so we don't tie up a runner for the full 30 min on a dead PR.
MERGED=""
STATE=""
for _ in $(seq 1 60); do
MERGED=$(gh pr view "$PR_NUM" --repo "$REPO" --json mergedAt --jq '.mergedAt // ""')
VIEW=$(gh pr view "$PR_NUM" --repo "$REPO" --json mergedAt,state)
MERGED=$(echo "$VIEW" | jq -r '.mergedAt // ""')
STATE=$(echo "$VIEW" | jq -r '.state // ""')
if [ -n "$MERGED" ] && [ "$MERGED" != "null" ]; then
echo "::notice::Promote PR #${PR_NUM} merged at ${MERGED}"
break
fi
if [ "$STATE" = "CLOSED" ]; then
echo "::warning::Promote PR #${PR_NUM} was closed without merging — skipping deploy dispatch."
exit 0
fi
sleep 30
done
if [ -z "$MERGED" ] || [ "$MERGED" = "null" ]; then
echo "::warning::Promote PR #${PR_NUM} didn't merge within 30min — skipping deploy dispatch (manually run \`gh workflow run redeploy-tenants-on-main.yml\` once it lands)."
echo "::warning::Promote PR #${PR_NUM} didn't merge within 30min — skipping deploy dispatch (manually run \`gh workflow run publish-workspace-server-image.yml --ref main\` once it lands)."
exit 0
fi

View File

@@ -0,0 +1,58 @@
name: Check migration collisions
# Hard gate (#2341): fails a PR that adds a migration prefix already
# claimed by the base branch or another open PR. Caught manually 2026-04-30
# during PR #2276 rebase: 044_runtime_image_pins collided with
# 044_platform_inbound_secret from RFC #2312. This workflow makes that
# check automatic.
#
# Trigger model: pull_request only — there's no value running this on
# pushes to staging or main (those are post-merge; the gate must fire
# pre-merge to be useful). Path filter scopes to PRs that actually touch
# migrations.
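# Example failure annotation the gate emits (illustrative PR number and
# branch name, matching the script's ::error format):
#   ::error::migration prefix 044 also claimed by open PR(s): #2276 (runtime-image-pins)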
on:
pull_request:
types: [opened, synchronize, reopened]
paths:
- 'workspace-server/migrations/**'
- 'scripts/ops/check_migration_collisions.py'
- '.github/workflows/check-migration-collisions.yml'
permissions:
contents: read
# gh pr list/diff need read access to other PRs
pull-requests: read
jobs:
check:
name: Migration version collision check
runs-on: ubuntu-latest
timeout-minutes: 5
steps:
- uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
with:
# Need history to diff against base ref
fetch-depth: 0
- name: Detect collisions
env:
PR_NUMBER: ${{ github.event.pull_request.number }}
BASE_REF: origin/${{ github.event.pull_request.base.ref }}
HEAD_REF: ${{ github.event.pull_request.head.sha }}
GITHUB_REPOSITORY: ${{ github.repository }}
# gh CLI uses GH_TOKEN from env
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
# Ensure the named base ref exists locally. checkout@v4 with
# fetch-depth=0 pulls full history, but the explicit fetch is
# cheap insurance against form-of-ref differences across runs.
#
# IMPORTANT: do NOT pass --depth=1 here. The script below uses
# `git diff origin/<base>...<head>` (three-dot, merge-base form),
# which fails with "fatal: no merge base" if the base ref is
# shallow. The auto-promote staging→main PR (#2361) was blocked
# by exactly this for ~5h on 2026-04-30 — the depth=1 fetch
# overwrote checkout@v4's full-history clone with a shallow tip.
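# (Three-dot vs two-dot, for reference: `A...B` diffs B against the merge
# base of A and B, i.e. only this PR's own changes, while `A..B` for
# `git diff` compares the two tips directly and drags in everything that
# landed on base since the PR branched.)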
git fetch origin "${{ github.event.pull_request.base.ref }}" || true
python3 scripts/ops/check_migration_collisions.py

View File

@@ -0,0 +1,206 @@
#!/usr/bin/env python3
"""check_migration_collisions.py — fail-loud detector for two open PRs adding
the same migration version number.
Why this exists: two PRs targeting staging can each add a migration with the
same numeric prefix (e.g. 044_*.up.sql). Each passes CI independently. They
collide at merge time. Worst-case the second migration silently doesn't apply
and the schema drifts from what the code expects. Caught manually 2026-04-30
during PR #2276 rebase: 044_runtime_image_pins collided with
044_platform_inbound_secret from RFC #2312.
This check runs on every PR and asserts the migration prefixes added by THIS
PR don't collide with:
1. The base branch's tip (someone else already used this number)
2. Any other open PR (race-window collision: both pass CI independently)
Exit codes:
0 no collisions
1 collision detected; output names the conflicting PR(s) for the author
Designed to run from a GitHub Actions PR check. Reads PR metadata via the
GitHub CLI (gh) which is preinstalled on ubuntu-latest runners. Runs in
under 10s against a typical PR.
"""
from __future__ import annotations
import json
import os
import re
import subprocess
import sys
from pathlib import Path
MIGRATIONS_DIR = "workspace-server/migrations"
MIGRATION_FILE_RE = re.compile(r"^(\d+)_[^/]+\.(up|down)\.sql$")
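# e.g. "044_platform_inbound_secret.up.sql" -> prefix 44, direction "up";
# "20260417000000_workflow_checkpoints.up.sql" -> prefix 20260417000000;
# "readme.md" and "044_test.sql" don't match and are skipped.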
def run(cmd: list[str], check: bool = True) -> str:
"""Run a subprocess and return stdout. Raise on non-zero when check=True."""
result = subprocess.run(cmd, capture_output=True, text=True)
if check and result.returncode != 0:
sys.stderr.write(f"command failed: {' '.join(cmd)}\n{result.stderr}\n")
sys.exit(1)
return result.stdout
def migrations_in_diff(base_ref: str, head_ref: str) -> set[int]:
"""Return the set of migration prefixes added or modified between two refs.
Uses --diff-filter=AM (Added or Modified) so a deleted migration doesn't
count. Passes --no-renames so a renumbering shows up as A on the new path
and D on the old path (git's default rename detection would otherwise fold
the pair into a single R entry that --diff-filter=AM drops).
"""
out = run([
"git", "diff", "--name-only", "--no-renames", "--diff-filter=AM",
f"{base_ref}...{head_ref}", "--", MIGRATIONS_DIR,
])
prefixes: set[int] = set()
for line in out.splitlines():
path = Path(line.strip())
if not path.name:
continue
m = MIGRATION_FILE_RE.match(path.name)
if not m:
# Files without a numeric prefix (e.g. a bare workflow_checkpoints.up.sql)
# are intentional — skip without complaint.
continue
prefixes.add(int(m.group(1)))
return prefixes
def migrations_on_ref(ref: str) -> set[int]:
"""Return the set of numeric migration prefixes existing at the given git ref.
Walks the migrations dir at that ref via `git ls-tree`, not the working
tree, so it works against any branch / SHA without checking it out.
"""
out = run([
"git", "ls-tree", "-r", "--name-only", ref, "--", MIGRATIONS_DIR,
])
prefixes: set[int] = set()
for line in out.splitlines():
path = Path(line.strip())
if not path.name:
continue
m = MIGRATION_FILE_RE.match(path.name)
if not m:
continue
prefixes.add(int(m.group(1)))
return prefixes
def open_prs_with_migration_prefix(
repo: str, prefix: int, exclude_pr: int
) -> list[dict]:
"""Return open PRs (other than `exclude_pr`) that add a migration with
`prefix`. Uses `gh pr diff` per PR; we only need to walk PRs that are
actually in flight, so the cost is bounded by open-PR count.
"""
out = run([
"gh", "pr", "list", "--repo", repo, "--state", "open",
"--json", "number,headRefName", "--limit", "100",
])
prs = json.loads(out)
matches: list[dict] = []
for pr in prs:
num = pr["number"]
if num == exclude_pr:
continue
try:
files = run([
"gh", "pr", "diff", str(num), "--repo", repo, "--name-only",
], check=False)
except Exception: # noqa: BLE001
continue
for raw in files.splitlines():
path = Path(raw.strip())
if not path.name:
continue
m = MIGRATION_FILE_RE.match(path.name)
if m and int(m.group(1)) == prefix:
matches.append(pr)
break
return matches
def main() -> int:
pr_number_env = os.environ.get("PR_NUMBER", "").strip()
if not pr_number_env:
sys.stderr.write(
"PR_NUMBER not set — this script is intended to run from a PR "
"context. Set PR_NUMBER (e.g. ${{ github.event.pull_request.number }}) "
"and BASE_REF (target branch) and HEAD_REF (PR head SHA).\n"
)
return 1
pr_number = int(pr_number_env)
base_ref = os.environ.get("BASE_REF", "origin/staging")
head_ref = os.environ.get("HEAD_REF", "HEAD")
repo = os.environ.get("GITHUB_REPOSITORY", "Molecule-AI/molecule-core")
added = migrations_in_diff(base_ref, head_ref)
if not added:
print("no migrations added or modified by this PR — nothing to check")
return 0
print(f"this PR adds/modifies migrations: {sorted(added)}")
# Collision check 1: base branch already has this prefix on a different
# filename. This happens when the PR was branched off an old base and
# didn't rebase — base advanced and another PR landed the same number.
base_prefixes = migrations_on_ref(base_ref)
base_collisions = added & base_prefixes
# Filter to "different filename, same prefix" — same filename means the
# PR is updating an existing migration in place, which is fine.
real_base_collisions: set[int] = set()
for prefix in base_collisions:
# List filenames at base for this prefix
out = run([
"git", "ls-tree", "-r", "--name-only", base_ref, "--",
MIGRATIONS_DIR,
])
base_names = {
Path(line).name for line in out.splitlines()
if (m := MIGRATION_FILE_RE.match(Path(line).name)) and int(m.group(1)) == prefix
}
# And in the PR
diff_out = run([
"git", "diff", "--name-only", "--no-renames", "--diff-filter=AM",
f"{base_ref}...{head_ref}", "--", MIGRATIONS_DIR,
])
pr_names = {
Path(line).name for line in diff_out.splitlines()
if (m := MIGRATION_FILE_RE.match(Path(line).name)) and int(m.group(1)) == prefix
}
if pr_names - base_names:
real_base_collisions.add(prefix)
# Collision check 2: another open PR claims the same prefix.
open_pr_collisions: dict[int, list[dict]] = {}
for prefix in added:
peers = open_prs_with_migration_prefix(repo, prefix, pr_number)
if peers:
open_pr_collisions[prefix] = peers
if not real_base_collisions and not open_pr_collisions:
print("no migration version collisions detected")
return 0
print()
print("::error::migration version collision detected")
if real_base_collisions:
print(f"::error::these prefixes already exist on {base_ref} with different filenames: "
f"{sorted(real_base_collisions)}")
print("::error::rebase onto current base and renumber to the next available prefix")
for prefix, peers in sorted(open_pr_collisions.items()):
peer_str = ", ".join(f"#{p['number']} ({p['headRefName']})" for p in peers)
print(f"::error::migration prefix {prefix:03d} also claimed by open PR(s): {peer_str}")
print(f"::error::rebase coordination needed — only one PR can land a given prefix; "
f"renumber yours or theirs")
return 1
if __name__ == "__main__":
sys.exit(main())

View File

@@ -0,0 +1,65 @@
"""Unit tests for check_migration_collisions.py — focuses on the regex
classifier + the diff/base-set logic that runs without git.
The end-to-end git diff + gh pr list path is exercised manually (running
the workflow against test PRs). These tests pin the pure-logic surface
so a regression in migration-name parsing fails immediately at PR time.
Run locally: ``python3 -m unittest scripts/ops/test_check_migration_collisions.py -v``
"""
import importlib.util
import unittest
from pathlib import Path
# Load the script as a module without invoking main(). We import the
# regex + helpers directly so we can test them without setting up git.
SCRIPT_PATH = Path(__file__).parent / "check_migration_collisions.py"
spec = importlib.util.spec_from_file_location("ccm", SCRIPT_PATH)
ccm = importlib.util.module_from_spec(spec)
spec.loader.exec_module(ccm)
class TestMigrationFileRe(unittest.TestCase):
"""The regex classifier — the load-bearing piece of the detector."""
def test_matches_standard_three_digit_prefix(self):
m = ccm.MIGRATION_FILE_RE.match("044_platform_inbound_secret.up.sql")
assert m is not None
assert int(m.group(1)) == 44
assert m.group(2) == "up"
def test_matches_down_migration(self):
m = ccm.MIGRATION_FILE_RE.match("044_platform_inbound_secret.down.sql")
assert m is not None
assert int(m.group(1)) == 44
assert m.group(2) == "down"
def test_matches_date_shaped_prefix(self):
# Real example from the repo: 20260417000000_workflow_checkpoints
m = ccm.MIGRATION_FILE_RE.match("20260417000000_workflow_checkpoints.up.sql")
assert m is not None
assert int(m.group(1)) == 20260417000000
def test_matches_long_compound_name(self):
m = ccm.MIGRATION_FILE_RE.match("042_a2a_queue.up.sql")
assert m is not None
assert int(m.group(1)) == 42
def test_rejects_no_prefix(self):
assert ccm.MIGRATION_FILE_RE.match("readme.md") is None
def test_rejects_alpha_prefix(self):
assert ccm.MIGRATION_FILE_RE.match("abc_migration.up.sql") is None
def test_rejects_wrong_extension(self):
assert ccm.MIGRATION_FILE_RE.match("044_test.sql") is None
assert ccm.MIGRATION_FILE_RE.match("044_test.up.txt") is None
def test_rejects_path_separator(self):
# Filename only — paths come pre-split via Path(line).name
assert ccm.MIGRATION_FILE_RE.match("044/test.up.sql") is None
def test_rejects_no_underscore(self):
# Naming convention requires <digits>_<name>
assert ccm.MIGRATION_FILE_RE.match("044.up.sql") is None

View File

@@ -0,0 +1,318 @@
package handlers
// a2a_corpus_test.go — backward-compat replay gate for the A2A
// JSON-RPC protocol surface. Every PR that touches
// normalizeA2APayload OR bumps the a-2-a-sdk version pin runs
// every shape in testdata/a2a_corpus/ through the current code
// and asserts:
//
// valid/ — every shape MUST parse without error and produce a
// canonical v0.3 payload (params.message.parts list).
//
// invalid/ — every shape MUST be rejected with the documented
// status code and error substring. Pins the
// rejection contract so a future PR doesn't silently
// start accepting malformed payloads.
//
// Closes the gap that allowed the 2026-04-29 v0.2 → v0.3 silent-
// drop bug (PR #2349). That bug shipped because the SDK bump PR
// didn't replay v0.2-shaped inputs against the new code; the
// shape-mismatch surfaced only in production when the receiver's
// Pydantic validator silently rejected inbound messages.
//
// Adding to the corpus: see testdata/a2a_corpus/README.md.
// Removing from valid/: breaking change, requires explicit
// approval per the README.
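// Illustrative corpus entry shape (hypothetical filename
// valid/v0_2_string_content.json; field values are examples, not a real
// entry committed to the repo):
//   {
//     "_comment": "v0.2 sender using a string content field",
//     "_added": "2026-04-30",
//     "_source": "PR #2349 regression",
//     "jsonrpc": "2.0",
//     "method": "message/send",
//     "params": {"message": {"role": "user", "content": "hi"}}
//   }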
import (
"encoding/json"
"fmt"
"net/http"
"os"
"path/filepath"
"strings"
"testing"
)
const (
corpusValidDir = "testdata/a2a_corpus/valid"
corpusInvalidDir = "testdata/a2a_corpus/invalid"
)
// metadataFields are the documentation-only keys the corpus loader
// strips before passing the payload to normalizeA2APayload. They
// are required for every corpus entry per the README policy.
var metadataFields = []string{
"_comment",
"_added",
"_source",
"_expect_error",
"_expect_status",
}
// loadCorpusEntry reads one JSON file, parses it as a generic map,
// extracts the metadata fields (including expected error/status for
// invalid entries), strips them from the payload, and returns the
// stripped JSON bytes ready for normalizeA2APayload.
func loadCorpusEntry(t *testing.T, path string) (payload []byte, expectErr string, expectStatus int) {
t.Helper()
raw, err := os.ReadFile(path)
if err != nil {
t.Fatalf("read %s: %v", path, err)
}
var doc map[string]interface{}
if err := json.Unmarshal(raw, &doc); err != nil {
t.Fatalf("parse %s as JSON: %v", path, err)
}
// Pull metadata before strip.
if v, ok := doc["_expect_error"].(string); ok {
expectErr = v
}
if v, ok := doc["_expect_status"].(float64); ok {
expectStatus = int(v)
}
for _, f := range metadataFields {
delete(doc, f)
}
payload, err = json.Marshal(doc)
if err != nil {
t.Fatalf("re-marshal %s after strip: %v", path, err)
}
return payload, expectErr, expectStatus
}
// listCorpus enumerates every .json file under dir and returns
// (filename → full path). Sorted for stable test ordering.
func listCorpus(t *testing.T, dir string) map[string]string {
t.Helper()
out := map[string]string{}
entries, err := os.ReadDir(dir)
if err != nil {
t.Fatalf("read %s: %v", dir, err)
}
for _, e := range entries {
if e.IsDir() || !strings.HasSuffix(e.Name(), ".json") {
continue
}
out[e.Name()] = filepath.Join(dir, e.Name())
}
if len(out) == 0 {
t.Fatalf("corpus dir %s is empty — at least one entry is required", dir)
}
return out
}
// TestA2ACorpus_ValidShapesParse replays every entry in valid/
// through normalizeA2APayload and asserts:
// 1. No error returned.
// 2. The output's params.message.parts is a non-empty list
// (v0.3 canonical shape — the compat shim must have converted
// any v0.2 content field into parts).
// 3. The output's params.message.messageId is non-empty (the
// normalizer auto-fills if the sender omitted it).
// 4. The output's method matches the input's method (the
// normalizer is method-agnostic).
//
// One subtest per corpus entry — failures point directly at the
// offending shape file.
func TestA2ACorpus_ValidShapesParse(t *testing.T) {
t.Parallel()
for name, path := range listCorpus(t, corpusValidDir) {
t.Run(name, func(t *testing.T) {
payload, _, _ := loadCorpusEntry(t, path)
normalized, method, perr := normalizeA2APayload(payload)
if perr != nil {
t.Fatalf("valid/%s: normalizeA2APayload returned error %d: %v",
name, perr.Status, perr.Response)
}
// Read back the normalized payload to verify shape invariants.
var parsed map[string]interface{}
if err := json.Unmarshal(normalized, &parsed); err != nil {
t.Fatalf("valid/%s: normalized output not valid JSON: %v", name, err)
}
// Method-agnostic check — input method survives normalization.
if input := mustGetString(t, parsed, "method"); input != method {
t.Errorf("valid/%s: method mismatch — got %q, want %q",
name, method, input)
}
// Canonical v0.3 shape invariants: params.message.parts is a
// non-empty list, messageId is non-empty.
params := mustGetMap(t, parsed, "params")
msg := mustGetMap(t, params, "message")
parts, ok := msg["parts"].([]interface{})
if !ok {
t.Errorf("valid/%s: params.message.parts is not a list (got %T)",
name, msg["parts"])
return
}
if len(parts) == 0 {
t.Errorf("valid/%s: params.message.parts is empty — compat shim should have converted content", name)
}
if id := mustGetString(t, msg, "messageId"); id == "" {
t.Errorf("valid/%s: params.message.messageId is empty after normalization", name)
}
// content must NOT survive into the output — the shim
// deletes it after converting to parts. If the shim left
// content in place, downstream pydantic v0.3 would still
// reject because it doesn't know that field.
if _, hasContent := msg["content"]; hasContent {
t.Errorf("valid/%s: params.message.content survived normalization (compat shim should delete it)", name)
}
})
}
}
// TestA2ACorpus_InvalidShapesRejected replays every entry in
// invalid/ through normalizeA2APayload and asserts the rejection
// matches the documented contract — same status code AND error
// substring as recorded in the corpus entry's metadata.
//
// Catches the regression class "future PR adds permissive defaults
// that silently accept what we used to reject loud."
func TestA2ACorpus_InvalidShapesRejected(t *testing.T) {
t.Parallel()
for name, path := range listCorpus(t, corpusInvalidDir) {
t.Run(name, func(t *testing.T) {
payload, expectErr, expectStatus := loadCorpusEntry(t, path)
if expectErr == "" {
t.Fatalf("invalid/%s: missing _expect_error metadata", name)
}
if expectStatus == 0 {
t.Fatalf("invalid/%s: missing _expect_status metadata", name)
}
_, _, perr := normalizeA2APayload(payload)
if perr == nil {
t.Fatalf("invalid/%s: normalizeA2APayload returned no error — should have rejected", name)
}
if perr.Status != expectStatus {
t.Errorf("invalid/%s: status = %d, want %d", name, perr.Status, expectStatus)
}
body, _ := json.Marshal(perr.Response)
if !strings.Contains(string(body), expectErr) {
t.Errorf("invalid/%s: error response %q does not contain expected substring %q",
name, string(body), expectErr)
}
})
}
}
// TestA2ACorpus_MalformedJSONRejected covers the case where the
// body isn't valid JSON at all. The corpus is JSON-only so this
// can't be expressed as a corpus entry; pin the contract inline.
func TestA2ACorpus_MalformedJSONRejected(t *testing.T) {
t.Parallel()
cases := []struct {
name string
payload []byte
}{
{"truncated_object", []byte(`{"jsonrpc":"2.0","method":"message/send"`)},
{"not_json_at_all", []byte(`this is not json`)},
{"empty_body", []byte(``)},
{"only_whitespace", []byte(` `)},
}
for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
_, _, perr := normalizeA2APayload(tc.payload)
if perr == nil {
t.Fatalf("expected error for %s, got none", tc.name)
}
if perr.Status != http.StatusBadRequest {
t.Errorf("status = %d, want %d", perr.Status, http.StatusBadRequest)
}
body, _ := json.Marshal(perr.Response)
if !strings.Contains(string(body), "invalid JSON") {
t.Errorf("expected 'invalid JSON' in response, got %q", string(body))
}
})
}
}
// TestA2ACorpus_HasMinimumCoverage pins the corpus's
// representativeness. The corpus must have at least one v0.2
// entry (string content) and at least one v0.3 entry (parts list)
// — losing either side of the schema bridge would silently drop
// the most important coverage.
func TestA2ACorpus_HasMinimumCoverage(t *testing.T) {
t.Parallel()
files := listCorpus(t, corpusValidDir)
hasV02 := false
hasV03 := false
for name := range files {
if strings.Contains(name, "v0_2_") {
hasV02 = true
}
if strings.Contains(name, "v0_3_") {
hasV03 = true
}
}
if !hasV02 {
t.Error("corpus has no v0_2_*.json entries — backward-compat coverage missing")
}
if !hasV03 {
t.Error("corpus has no v0_3_*.json entries — forward (canonical) coverage missing")
}
}
// TestA2ACorpus_EveryEntryHasMetadata pins the README policy:
// every corpus entry MUST have _comment, _added, _source. Catches
// the bad commit shape "added entry without explanation" before
// review.
func TestA2ACorpus_EveryEntryHasMetadata(t *testing.T) {
t.Parallel()
for _, dir := range []string{corpusValidDir, corpusInvalidDir} {
for name, path := range listCorpus(t, dir) {
t.Run(filepath.Base(dir)+"/"+name, func(t *testing.T) {
raw, err := os.ReadFile(path)
if err != nil {
t.Fatalf("read %s: %v", path, err)
}
var doc map[string]interface{}
if err := json.Unmarshal(raw, &doc); err != nil {
t.Fatalf("parse %s: %v", path, err)
}
required := []string{"_comment", "_added", "_source"}
if dir == corpusInvalidDir {
required = append(required, "_expect_error", "_expect_status")
}
for _, key := range required {
if _, ok := doc[key]; !ok {
t.Errorf("missing required metadata field %q", key)
}
}
})
}
}
}
func mustGetMap(t *testing.T, m map[string]interface{}, key string) map[string]interface{} {
t.Helper()
v, ok := m[key].(map[string]interface{})
if !ok {
t.Fatalf("expected %q to be a map, got %T", key, m[key])
}
return v
}
func mustGetString(t *testing.T, m map[string]interface{}, key string) string {
t.Helper()
v, ok := m[key].(string)
if !ok {
t.Fatalf("expected %q to be a string, got %T", key, m[key])
}
return v
}
// _ keeps the fmt import compiling: nothing in this file calls fmt
// directly yet (the t.Errorf/t.Fatalf formatting above goes through
// testing, not fmt), so the blank use avoids an unused-import error
// until a future helper needs it.
var _ = fmt.Sprintf

View File

@@ -13,6 +13,7 @@ import (
"errors"
"io"
"log"
"net"
"net/http"
"os"
"strconv"
@@ -92,13 +93,47 @@ func isSystemCaller(callerID string) bool {
const maxProxyResponseBody = 10 << 20
// a2aClient is a shared HTTP client for proxying A2A requests to workspace agents.
// No client-level timeout — timeouts are enforced per-request via context
// deadlines: canvas = 5 min (Rule 3), agent-to-agent = 30 min (DoS cap). Do NOT
// set a Client.Timeout here: it is enforced independently of ctx deadlines and
// would pre-empt legitimate slow cold-start flows (e.g. Claude Code first-token
// over OAuth can take 30-60s on boot). Callers that want a safety net should
// build a context.WithTimeout themselves.
var a2aClient = &http.Client{}
//
// Timeout model — three independent budgets, none of which gets in each other's way:
//
// 1. Client.Timeout — DELIBERATELY UNSET. Client.Timeout is a hard wall on
// the entire request including streamed body reads, and would pre-empt
// legitimate slow cold-start flows (Claude Code first-token over OAuth
// can take 30-60s on boot; long-running agent synthesis can stream
// tokens for minutes). Total-request budget is enforced per-request
// via context deadline (canvas = idle-only, agent-to-agent = 30 min ceiling).
//
// 2. Transport.DialContext — 10s connect timeout. When a workspace's EC2
// black-holes TCP connects (instance terminated mid-flight, security group
// flipped, NACL bug), the OS default is 75s on Linux / 21s on macOS — long
// enough that Cloudflare's ~100s edge timeout can fire first and surface
// a generic 502 page to canvas. 10s is well above realistic intra-region
// latencies and well below CF's edge timeout.
//
// 3. Transport.ResponseHeaderTimeout — 60s. From request-body-end to
// response-headers-start. Covers cold-start first-byte (the 30-60s OAuth
// flow above), with margin. Body streaming after headers is governed by
// the per-request context deadline, NOT this timeout — so multi-minute
// agent responses still work fine.
//
// The point of (2) and (3) is to surface a *structured* 503 from
// handleA2ADispatchError when the workspace agent is unreachable, so canvas
// gets `{"error":"workspace agent unreachable","restarting":true}` instead
// of Cloudflare's opaque 502 error page. Without these, dead workspaces hang
// long enough that CF gives up first and shows its own page.
var a2aClient = &http.Client{
Transport: &http.Transport{
DialContext: (&net.Dialer{
Timeout: 10 * time.Second,
KeepAlive: 30 * time.Second,
}).DialContext,
ResponseHeaderTimeout: 60 * time.Second,
TLSHandshakeTimeout: 10 * time.Second,
// MaxIdleConns / IdleConnTimeout: stdlib defaults are fine; agent
// fan-in is bounded by the platform's broadcaster fan-out, not by
// connection-pool sizing.
},
}
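// Illustrative caller-side budget (values are examples; the real per-route
// deadlines live with the handlers, not here):
//   ctx, cancel := context.WithTimeout(c.Request.Context(), 30*time.Minute)
//   defer cancel()
//   req, _ := http.NewRequestWithContext(ctx, http.MethodPost, agentURL, bytes.NewReader(body))
//   resp, err := a2aClient.Do(req)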
type proxyA2AError struct {
Status int
@@ -146,6 +181,35 @@ func isUpstreamBusyError(err error) bool {
strings.Contains(msg, "connection reset")
}
// isUpstreamDeadStatus returns true when the upstream HTTP status indicates
// the workspace agent is unreachable / unresponsive at the network layer
// (vs an agent-authored 5xx with a real body). Used by the proxy to gate
// reactive container-dead detection + auto-restart.
//
// - 502 Bad Gateway, 503 Service Unavailable, 504 Gateway Timeout: standard
// proxy-layer "upstream is broken" codes (Cloudflare, ELB, agent tunnel).
// - 521 Web Server Is Down: Cloudflare can't open TCP to origin (most
// direct dead-EC2 signal).
// - 522 Connection Timed Out: Cloudflare opened TCP but no response within
// ~15s — typical of SG/NACL flap or agent process hung.
// - 523 Origin Is Unreachable: Cloudflare can't route to origin (DNS or
// network-path failure).
// - 524 A Timeout Occurred: TCP succeeded, but origin didn't return
// headers within ~100s — agent process alive but wedged.
//
// We always probe IsRunning before acting, so a transient false positive
// from this set just costs one CP API call.
func isUpstreamDeadStatus(status int) bool {
switch status {
case http.StatusBadGateway, // 502
http.StatusServiceUnavailable, // 503
http.StatusGatewayTimeout, // 504
521, 522, 523, 524: // CF dead-origin family
return true
}
return false
}
func (e *proxyA2AError) Error() string {
if e == nil || e.Response == nil {
return "proxy a2a error"
@@ -422,6 +486,43 @@ func (h *WorkspaceHandler) proxyA2ARequest(ctx context.Context, workspaceID stri
if errMsg == "" {
errMsg = http.StatusText(resp.StatusCode)
}
// Upstream returned 502/503/504 (gateway/proxy failure). This is
// the "agent process is dead but the tunnel between us and the
// workspace is still up" signal — handleA2ADispatchError's
// network-error path doesn't run because Do() succeeded at the
// HTTP layer. Without this branch, the dead-agent failure mode
// surfaces to canvas as a generic 502 (and CF in front of the
// platform masks it with its own error page, hiding any
// structured response we might write).
//
// Treatment matches handleA2ADispatchError's container-dead path:
// 1. Probe IsRunning via maybeMarkContainerDead. If the
// container truly is dead, mark workspace offline + kick
// a restart goroutine.
// 2. Return a structured 503 with restarting=true + Retry-After
// so canvas shows a useful "agent is restarting" message
// (and CF doesn't intercept the 503 the way it does 502).
// If IsRunning reports the container is alive, we leave the
// upstream status untouched — the agent legitimately returned
// 502/503/504 (e.g. it's returning its own Bad-Gateway from
// some downstream call) and we shouldn't mistakenly recycle it.
//
// Empty body is the strong signal here — a CF-tunnel "no-origin"
// 502 has 0 bytes; an agent-authored 502 typically has a JSON
// error body. We probe IsRunning regardless (it's the
// authoritative check) but the empty-body case is what makes
// this fix necessary.
if isUpstreamDeadStatus(resp.StatusCode) {
if h.maybeMarkContainerDead(ctx, workspaceID) {
return 0, nil, &proxyA2AError{
Status: http.StatusServiceUnavailable,
Headers: map[string]string{"Retry-After": "15"},
Response: gin.H{"error": "workspace agent unreachable — container restart triggered", "restarting": true, "retry_after": 15},
}
}
}
return resp.StatusCode, respBody, &proxyA2AError{
Status: resp.StatusCode,
Response: gin.H{"error": errMsg},

View File

@@ -141,22 +141,46 @@ func (h *WorkspaceHandler) handleA2ADispatchError(ctx context.Context, workspace
}
// maybeMarkContainerDead runs the reactive health check after a forward error.
// If the workspace's Docker container is no longer running (and the workspace
// isn't external), it marks the workspace offline, clears Redis state,
// broadcasts WORKSPACE_OFFLINE, and triggers an async restart. Returns true
// when the container was found dead.
// If the workspace's compute (Docker container OR EC2 instance) is no longer
// running (and the workspace isn't external), it marks the workspace offline,
// clears Redis state, broadcasts WORKSPACE_OFFLINE, and triggers an async
// restart. Returns true when the compute was found dead.
//
// Provisioner selection (mutually exclusive in production):
// - h.provisioner != nil → local Docker deployment; IsRunning does docker inspect.
// - h.cpProv != nil → SaaS / EC2 deployment; IsRunning calls CP's
// /cp/workspaces/:id/status to read the EC2 state.
//
// Pre-fix this function ONLY consulted h.provisioner — for SaaS tenants
// (h.provisioner=nil, h.cpProv=set) it short-circuited to false on every
// call, so a dead EC2 agent would propagate upstream 502/503/504 to canvas
// with no auto-recovery and Cloudflare in front would mask the response with
// its own error page. The 2026-04-30 hongmingwang.moleculesai.app
// canvas-chat-to-dead-workspace incident traces to exactly this gap.
func (h *WorkspaceHandler) maybeMarkContainerDead(ctx context.Context, workspaceID string) bool {
var wsRuntime string
db.DB.QueryRowContext(ctx, `SELECT COALESCE(runtime, 'langgraph') FROM workspaces WHERE id = $1`, workspaceID).Scan(&wsRuntime)
if h.provisioner == nil || wsRuntime == "external" {
if wsRuntime == "external" {
return false
}
running, inspectErr := h.provisioner.IsRunning(ctx, workspaceID)
if h.provisioner == nil && h.cpProv == nil {
return false
}
var running bool
var inspectErr error
if h.provisioner != nil {
running, inspectErr = h.provisioner.IsRunning(ctx, workspaceID)
} else {
// SaaS path: ask the CP about the EC2 state. Same (true, err) on
// transport errors contract — keeps the caller on the alive path
// instead of triggering a restart cascade on a flaky CP call.
running, inspectErr = h.cpProv.IsRunning(ctx, workspaceID)
}
if inspectErr != nil {
// Transient Docker-daemon error (timeout, socket EOF, etc.). Post-
// #386, IsRunning returns (true, err) in this case — caller stays
// on the alive path and does not trigger a restart cascade. Log
// so the defect is visible without being destructive.
// Transient backend error (Docker daemon EOF, CP HTTP 5xx, etc.).
// IsRunning's contract returns (true, err) in this case so we stay
// on the alive path without triggering a restart cascade.
log.Printf("ProxyA2A: IsRunning for %s returned transient error (assuming alive): %v", workspaceID, inspectErr)
}
if running {

View File

@@ -16,6 +16,7 @@ import (
"time"
"github.com/DATA-DOG/go-sqlmock"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/provisioner"
"github.com/gin-gonic/gin"
)
@@ -244,6 +245,117 @@ func TestProxyA2A_AgentReturnsError(t *testing.T) {
}
}
// TestProxyA2A_Upstream502_TriggersContainerDeadCheck — when the agent
// tunnel returns 502 (the "tunnel up but no origin" failure mode that
// surfaces a Cloudflare error page to canvas), proxyA2A must consult
// IsRunning on cpProv. If the EC2 instance truly is dead, the response
// becomes a structured 503 with restarting=true (not the upstream 502
// which CF would mask), and the workspace flips to status='offline' so
// the next reactive poll sees the right state. This is the
// 2026-04-30 hongmingwang.moleculesai.app canvas-chat-to-dead-workspace
// regression: upstream 502 was previously propagated as-is, CF masked
// it, and no auto-restart fired.
func TestProxyA2A_Upstream502_TriggersContainerDeadCheck(t *testing.T) {
mock := setupTestDB(t)
mr := setupTestRedis(t)
allowLoopbackForTest(t)
broadcaster := newTestBroadcaster()
handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir())
cp := &fakeCPProv{running: false}
handler.SetCPProvisioner(cp)
// Agent tunnel returns 502 with empty body — the CF "no-origin" shape.
agentServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusBadGateway)
}))
defer agentServer.Close()
mr.Set(fmt.Sprintf("ws:%s:url", "ws-tunnel-dead"), agentServer.URL)
expectBudgetCheck(mock, "ws-tunnel-dead")
// Activity log fires (delivery_confirmed is true on Do() success regardless
// of upstream status — handler's existing logA2ASuccess path runs first
// and logs as success because the dispatch did get a response).
mock.ExpectExec("INSERT INTO activity_logs").
WillReturnResult(sqlmock.NewResult(0, 1))
// maybeMarkContainerDead's runtime lookup, then the offline-flip UPDATE.
mock.ExpectQuery(`SELECT COALESCE\(runtime, 'langgraph'\) FROM workspaces WHERE id =`).
WithArgs("ws-tunnel-dead").
WillReturnRows(sqlmock.NewRows([]string{"runtime"}).AddRow("hermes"))
mock.ExpectExec(`UPDATE workspaces SET status = 'offline'`).
WithArgs("ws-tunnel-dead").
WillReturnResult(sqlmock.NewResult(0, 1))
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
c.Params = gin.Params{{Key: "id", Value: "ws-tunnel-dead"}}
body := `{"method":"message/send","params":{"message":{"role":"user","parts":[{"text":"hi"}]}}}`
c.Request = httptest.NewRequest("POST", "/workspaces/ws-tunnel-dead/a2a", bytes.NewBufferString(body))
c.Request.Header.Set("Content-Type", "application/json")
handler.ProxyA2A(c)
time.Sleep(80 * time.Millisecond)
// Caller sees a structured 503 (NOT the upstream 502 which CF would mask).
if w.Code != http.StatusServiceUnavailable {
t.Fatalf("upstream 502 should translate to 503 once cpProv reports dead; got %d: %s", w.Code, w.Body.String())
}
if !strings.Contains(w.Body.String(), "restarting") {
t.Errorf("response body should mention restart trigger; got %s", w.Body.String())
}
if w.Header().Get("Retry-After") != "15" {
t.Errorf("Retry-After header should be 15 to throttle canvas-side retry loop; got %q", w.Header().Get("Retry-After"))
}
if cp.calls != 1 {
t.Errorf("cpProv.IsRunning must be consulted exactly once; got %d calls", cp.calls)
}
}
// TestProxyA2A_Upstream502_AliveAgent_PropagatesAsIs — the safety check:
// if cpProv reports the EC2 IS running, the upstream 502 is propagated
// as-is. Don't recycle a healthy agent on a transient hiccup — the agent
// might have legitimately returned 502 (e.g. a downstream service it
// called returned 502 and it forwarded). Net behavior matches pre-fix
// for the alive-agent case.
func TestProxyA2A_Upstream502_AliveAgent_PropagatesAsIs(t *testing.T) {
mock := setupTestDB(t)
mr := setupTestRedis(t)
allowLoopbackForTest(t)
broadcaster := newTestBroadcaster()
handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir())
cp := &fakeCPProv{running: true}
handler.SetCPProvisioner(cp)
agentServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(http.StatusBadGateway)
fmt.Fprint(w, `{"error":"downstream service returned 502"}`)
}))
defer agentServer.Close()
mr.Set(fmt.Sprintf("ws:%s:url", "ws-alive-502"), agentServer.URL)
expectBudgetCheck(mock, "ws-alive-502")
mock.ExpectExec("INSERT INTO activity_logs").WillReturnResult(sqlmock.NewResult(0, 1))
// IsRunning runtime lookup runs but no UPDATE follows (running=true).
mock.ExpectQuery(`SELECT COALESCE\(runtime, 'langgraph'\) FROM workspaces WHERE id =`).
WithArgs("ws-alive-502").
WillReturnRows(sqlmock.NewRows([]string{"runtime"}).AddRow("hermes"))
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
c.Params = gin.Params{{Key: "id", Value: "ws-alive-502"}}
body := `{"method":"message/send","params":{"message":{"role":"user","parts":[{"text":"hi"}]}}}`
c.Request = httptest.NewRequest("POST", "/workspaces/ws-alive-502/a2a", bytes.NewBufferString(body))
c.Request.Header.Set("Content-Type", "application/json")
handler.ProxyA2A(c)
time.Sleep(50 * time.Millisecond)
if w.Code != http.StatusBadGateway {
t.Fatalf("alive agent 502 should propagate as 502; got %d: %s", w.Code, w.Body.String())
}
}
// ==================== ProxyA2A — messageId injection ====================
func TestProxyA2A_MessageIDInjected(t *testing.T) {
@@ -527,8 +639,8 @@ func TestProxyA2A_CallerIDDerivedFromBearer(t *testing.T) {
mr.Set(fmt.Sprintf("ws:%s:url", "ws-target"), agentServer.URL)
// 1. Bearer-derive lookup → returns ws-caller
mock.ExpectQuery(`SELECT t\.workspace_id\s+FROM workspace_auth_tokens t.*JOIN workspaces`).
WillReturnRows(sqlmock.NewRows([]string{"workspace_id"}).AddRow("ws-caller"))
mock.ExpectQuery(`SELECT t\.id, t\.workspace_id.*FROM workspace_auth_tokens t.*JOIN workspaces`).
WillReturnRows(sqlmock.NewRows([]string{"id", "workspace_id"}).AddRow("tok-1", "ws-caller"))
// 2. validateCallerToken's HasAnyLiveToken / ValidateToken queries fall
// through to fail-open (no expectations set) — same pattern as
@@ -654,7 +766,7 @@ func TestProxyA2A_BearerDeriveFailureFallsThrough(t *testing.T) {
// Bearer-derive lookup fails (no live row) — collapses to ErrInvalidToken
// inside WorkspaceFromToken; ProxyA2A swallows the error and proceeds with
// callerID="".
mock.ExpectQuery(`SELECT t\.workspace_id\s+FROM workspace_auth_tokens t.*JOIN workspaces`).
mock.ExpectQuery(`SELECT t\.id, t\.workspace_id.*FROM workspace_auth_tokens t.*JOIN workspaces`).
WillReturnError(sql.ErrNoRows)
expectBudgetCheck(mock, "ws-target")
@@ -807,6 +919,46 @@ func TestIsUpstreamBusyError(t *testing.T) {
}
}
// TestIsUpstreamDeadStatus locks in the status-code matrix that gates
// reactive container-dead detection. Order matters: the helper exists so
// the proxy + any future caller (e.g. a sweeper) classify CF dead-origin
// codes the same way. Drift here would re-introduce the SaaS-blind bug
// for whichever code we forgot.
func TestIsUpstreamDeadStatus(t *testing.T) {
cases := []struct {
name string
status int
want bool
}{
// Standard proxy-layer dead-upstream codes
{"502 BadGateway", 502, true},
{"503 ServiceUnavailable", 503, true},
{"504 GatewayTimeout", 504, true},
// Cloudflare dead-origin family
{"521 WebServerDown", 521, true},
{"522 ConnectionTimedOut", 522, true},
{"523 OriginUnreachable", 523, true},
{"524 OriginTimedOut", 524, true},
// Negative cases — must NOT trigger restart
{"200 OK", 200, false},
{"400 BadRequest (agent rejected payload)", 400, false},
{"401 Unauthorized", 401, false},
{"404 NotFound (no such session)", 404, false},
{"408 RequestTimeout (client-side)", 408, false},
{"429 TooManyRequests (rate limited, agent alive)", 429, false},
{"500 InternalServerError (agent crashed mid-request)", 500, false},
{"501 NotImplemented", 501, false},
{"505 HTTPVersionNotSupported", 505, false},
{"520 WebServerReturnedUnknown (agent returned malformed)", 520, false},
{"525 SSLHandshakeFailed (TLS misconfig, not dead origin)", 525, false},
}
for _, tc := range cases {
if got := isUpstreamDeadStatus(tc.status); got != tc.want {
t.Errorf("%s: isUpstreamDeadStatus(%d) = %v, want %v", tc.name, tc.status, got, tc.want)
}
}
}
// ==================== ProxyA2A — upstream timeout returns 503 busy + Retry-After ====================
// Verifies the full error-shaping contract for the 503-busy path:
@@ -1640,6 +1792,143 @@ func TestMaybeMarkContainerDead_NilProvisioner(t *testing.T) {
}
}
// SaaS path: h.provisioner=nil but h.cpProv is wired and reports the EC2
// instance is NOT running. maybeMarkContainerDead must consult cpProv,
// flip the workspace to status='offline', clear keys, broadcast OFFLINE,
// and return true so the caller surfaces the structured 503. Pre-fix
// (#NNN) it returned false unconditionally on h.provisioner==nil, so
// dead EC2 agents leaked upstream 502 to canvas with no recovery.
func TestMaybeMarkContainerDead_CPOnly_NotRunning(t *testing.T) {
mock := setupTestDB(t)
setupTestRedis(t)
handler := NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir())
cp := &fakeCPProv{running: false}
handler.SetCPProvisioner(cp)
mock.ExpectQuery(`SELECT COALESCE\(runtime, 'langgraph'\) FROM workspaces WHERE id =`).
WithArgs("ws-saas-dead").
WillReturnRows(sqlmock.NewRows([]string{"runtime"}).AddRow("hermes"))
mock.ExpectExec(`UPDATE workspaces SET status = 'offline'`).
WithArgs("ws-saas-dead").
WillReturnResult(sqlmock.NewResult(0, 1))
got := handler.maybeMarkContainerDead(context.Background(), "ws-saas-dead")
if !got {
t.Fatal("expected true (cpProv reports not running) — without cpProv consultation, SaaS dead-agent recovery is impossible")
}
if cp.calls != 1 {
t.Errorf("expected exactly 1 IsRunning call on cpProv; got %d", cp.calls)
}
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("unmet sqlmock expectations: %v", err)
}
}
// SaaS path: h.cpProv reports running=true → maybeMarkContainerDead must
// return false (don't restart a healthy agent on a transient upstream
// hiccup). This is the safety check that prevents over-eager recycling.
func TestMaybeMarkContainerDead_CPOnly_Running(t *testing.T) {
mock := setupTestDB(t)
setupTestRedis(t)
handler := NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir())
cp := &fakeCPProv{running: true}
handler.SetCPProvisioner(cp)
mock.ExpectQuery(`SELECT COALESCE\(runtime, 'langgraph'\) FROM workspaces WHERE id =`).
WithArgs("ws-saas-alive").
WillReturnRows(sqlmock.NewRows([]string{"runtime"}).AddRow("hermes"))
if got := handler.maybeMarkContainerDead(context.Background(), "ws-saas-alive"); got {
t.Error("expected false when cpProv reports running — must not recycle a healthy agent")
}
if cp.calls != 1 {
t.Errorf("expected exactly 1 IsRunning call on cpProv; got %d", cp.calls)
}
}
// SaaS-path auto-restart dispatch: when h.provisioner is nil and h.cpProv is
// set, the restart cycle MUST call cpProv.Stop (not Docker provisioner.Stop).
// Pre-fix this dispatched only to h.provisioner.Stop, NPE'd on nil, was
// silently swallowed by coalesceRestart's recover-without-re-raise, and
// left the workspace stuck in status='provisioning' forever — making
// reactive auto-restart on SaaS effectively dead code. The independent
// review of PR #2362 caught this gap.
//
// stopForRestart is the dispatch helper extracted from runRestartCycle so the
// branch logic can be tested without spawning the async sendRestartContext
// goroutine (or fighting RestartByID/coalesceRestart's timing) in a unit
// test. The full restart chain (provisionWorkspaceCP) needs its own mocked
// DB rows that would explode the surface area of this test; what we care
// about here is the dispatch decision, which is observable on cpProv.stopCalls.
func TestStopForRestart_SaaSPath_DispatchesViaCPProv(t *testing.T) {
setupTestRedis(t)
handler := NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir())
cp := &fakeCPProv{}
handler.SetCPProvisioner(cp)
handler.stopForRestart(context.Background(), "ws-saas-restart")
if cp.stopCalls != 1 {
t.Fatalf("expected cpProv.Stop to be called once on SaaS auto-restart; got %d", cp.stopCalls)
}
if cp.startCalls != 0 {
t.Fatalf("expected cpProv.Start NOT to be called by stopForRestart; got %d", cp.startCalls)
}
}
// Both nil → no-op, no panic, no DB / broadcast side effects. Guards the
// dispatcher against being invoked on a misconfigured handler. Important
// because runRestartCycle's surrounding flow (status='provisioning' UPDATE
// + broadcast) MUST happen even when both provisioners are nil — but
// stopForRestart itself is a pure dispatcher and shouldn't touch state.
func TestStopForRestart_NoProvisioner_NoOp(t *testing.T) {
mock := setupTestDB(t)
setupTestRedis(t)
handler := NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir())
// no provisioner, no cpProv, no DB expectations set on mock — any
// unexpected query/exec will produce a sqlmock error.
handler.stopForRestart(context.Background(), "ws-orphan")
if err := mock.ExpectationsWereMet(); err != nil {
t.Fatalf("stopForRestart no-provisioner path should not touch DB: %v", err)
}
}
// fakeCPProv satisfies provisioner.CPProvisionerAPI for tests that exercise
// the SaaS / EC2-backed reactive-health path.
//
// Methods all record calls. Start/Stop/GetConsoleOutput return nil/empty by
// default — the maybeMarkContainerDead happy path triggers an async
// `go h.RestartByID(...)` which calls Stop, so the previous "panic on
// unexpected call" pattern was unsafe (the panic fires on a goroutine,
// after the assertions ran). Tests that want to ASSERT a method is unused
// can check `calls == 0` after a sync barrier.
type fakeCPProv struct {
running bool
calls int
stopCalls int
startCalls int
}
func (f *fakeCPProv) Start(_ context.Context, _ provisioner.WorkspaceConfig) (string, error) {
f.startCalls++
return "", nil
}
func (f *fakeCPProv) Stop(_ context.Context, _ string) error {
f.stopCalls++
return nil
}
func (f *fakeCPProv) GetConsoleOutput(_ context.Context, _ string) (string, error) {
return "", nil
}
func (f *fakeCPProv) IsRunning(_ context.Context, _ string) (bool, error) {
f.calls++
return f.running, nil
}
// external runtime → false regardless of provisioner.
func TestMaybeMarkContainerDead_ExternalRuntime(t *testing.T) {
mock := setupTestDB(t)

View File

@@ -129,3 +129,97 @@ func TestAdminTestToken_HappyPath_TokenValidates(t *testing.T) {
}
func sqlErrNoRows() error { return sql.ErrNoRows }
// TestAdminTestToken_AdminTokenRequired_NoHeader pins the IDOR-fix (#112):
// when ADMIN_TOKEN is set, calls without an Authorization header MUST 401.
// Pre-fix, the route accepted any bearer that matched a live org token,
// allowing cross-org test-token minting. The current code uses
// subtle.ConstantTimeCompare against ADMIN_TOKEN explicitly. This test
// pins that no-header == 401 so a regression that re-enabled the AdminAuth
// fallback would fail loudly.
func TestAdminTestToken_AdminTokenRequired_NoHeader(t *testing.T) {
setupTestDB(t)
t.Setenv("MOLECULE_ENV", "development")
t.Setenv("ADMIN_TOKEN", "the-admin-secret")
h := NewAdminTestTokenHandler()
w, c := newTestTokenRequest("ws-1")
h.GetTestToken(c)
if w.Code != http.StatusUnauthorized {
t.Fatalf("expected 401 with ADMIN_TOKEN set + no Authorization, got %d: %s", w.Code, w.Body.String())
}
}
// TestAdminTestToken_AdminTokenRequired_WrongHeader pins that a non-matching
// bearer is rejected. Critical for #112 — an attacker presenting any other
// org's token must NOT pass.
func TestAdminTestToken_AdminTokenRequired_WrongHeader(t *testing.T) {
setupTestDB(t)
t.Setenv("MOLECULE_ENV", "development")
t.Setenv("ADMIN_TOKEN", "the-admin-secret")
h := NewAdminTestTokenHandler()
w, c := newTestTokenRequest("ws-1")
c.Request.Header.Set("Authorization", "Bearer wrong-token")
h.GetTestToken(c)
if w.Code != http.StatusUnauthorized {
t.Fatalf("expected 401 with wrong Authorization, got %d: %s", w.Code, w.Body.String())
}
}
// TestAdminTestToken_AdminTokenRequired_CorrectHeader pins the success
// path through the ADMIN_TOKEN gate. Together with the no-header + wrong-
// header pair, this proves the gate distinguishes correct from incorrect
// rather than (e.g.) erroring on every request.
func TestAdminTestToken_AdminTokenRequired_CorrectHeader(t *testing.T) {
mock := setupTestDB(t)
t.Setenv("MOLECULE_ENV", "development")
t.Setenv("ADMIN_TOKEN", "the-admin-secret")
mock.ExpectQuery("SELECT id FROM workspaces WHERE id =").
WithArgs("ws-1").
WillReturnRows(sqlmock.NewRows([]string{"id"}).AddRow("ws-1"))
mock.ExpectExec("INSERT INTO workspace_auth_tokens").
WillReturnResult(sqlmock.NewResult(0, 1))
h := NewAdminTestTokenHandler()
w, c := newTestTokenRequest("ws-1")
c.Request.Header.Set("Authorization", "Bearer the-admin-secret")
h.GetTestToken(c)
if w.Code != http.StatusOK {
t.Fatalf("expected 200 with correct ADMIN_TOKEN, got %d: %s", w.Code, w.Body.String())
}
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("sqlmock expectations not met — INSERT into workspace_auth_tokens did not run, suggesting the gate short-circuited the success path: %v", err)
}
}
// TestAdminTestToken_AdminTokenEmpty_GateBypassedSafely pins that when
// ADMIN_TOKEN is unset (typical local-dev setup), the explicit gate is
// bypassed and the route works without an Authorization header. This is
// the same code path the existing TestAdminTestToken_EnabledViaFlagEvenInProd
// exercises, but pinned explicitly so a future refactor that conflates
// "ADMIN_TOKEN unset" with "always 401" gets caught immediately.
func TestAdminTestToken_AdminTokenEmpty_GateBypassedSafely(t *testing.T) {
mock := setupTestDB(t)
t.Setenv("MOLECULE_ENV", "development")
t.Setenv("ADMIN_TOKEN", "")
mock.ExpectQuery("SELECT id FROM workspaces WHERE id =").
WithArgs("ws-1").
WillReturnRows(sqlmock.NewRows([]string{"id"}).AddRow("ws-1"))
mock.ExpectExec("INSERT INTO workspace_auth_tokens").
WillReturnResult(sqlmock.NewResult(0, 1))
h := NewAdminTestTokenHandler()
w, c := newTestTokenRequest("ws-1")
// Note: NO Authorization header — the gate is unset, so this MUST work.
h.GetTestToken(c)
if w.Code != http.StatusOK {
t.Fatalf("expected 200 with ADMIN_TOKEN empty + no Authorization, got %d: %s", w.Code, w.Body.String())
}
}

View File

@@ -29,7 +29,7 @@ package handlers
// conversation payloads.
import (
"errors"
"context"
"fmt"
"io"
"log"
@@ -40,7 +40,6 @@ import (
"time"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/wsauth"
"github.com/gin-gonic/gin"
)
@@ -83,6 +82,69 @@ const chatUploadMaxBytes = 50 * 1024 * 1024
// reference.
const chatUploadDir = "/workspace/.molecule/chat-uploads"
// resolveWorkspaceForwardCreds resolves the workspace's URL +
// platform_inbound_secret for an /internal/* forward, applying
// lazy-heal on a missing inbound secret (RFC #2312 backfill — the
// 2026-04-30 fix that closes the existing-workspace gap left by the
// shared-mint refactor).
//
// On any failure path the function HAS ALREADY written the appropriate
// status + JSON body to c (404 / 503 / 500) and returns ok=false.
// On success returns the URL + secret + ok=true.
//
// op is the human-readable feature label ("upload"/"download") used
// in log messages and the 503 RFC-#2312 detail copy so operators can
// distinguish which feature ran.
//
// Centralized here (rather than inline in Upload + Download) so the
// next forward-time condition we add — secret rotation, audit, etc. —
// goes in ONE place. Drift between the two handlers is the same class
// of bug as the original SaaS provision drift fixed in #2366; this
// extraction prevents that class on the consumer side.
func resolveWorkspaceForwardCreds(c *gin.Context, ctx context.Context, workspaceID, op string) (wsURL, secret string, ok bool) {
if err := db.DB.QueryRowContext(ctx,
`SELECT COALESCE(url, '') FROM workspaces WHERE id = $1`, workspaceID,
).Scan(&wsURL); err != nil {
log.Printf("chat_files %s: workspace lookup failed for %s: %v", op, workspaceID, err)
c.JSON(http.StatusNotFound, gin.H{"error": "workspace not found"})
return "", "", false
}
if wsURL == "" {
c.JSON(http.StatusServiceUnavailable, gin.H{"error": "workspace url not registered yet"})
return "", "", false
}
// Trust note: workspaces.url passes validateAgentURL at /registry/
// register write time, blocking SSRF-shaped URLs. We rely on that
// upstream gate rather than re-validating here. Tracked at #2316
// for follow-up: forward-time re-validation as defense-in-depth.
secret, healed, err := readOrLazyHealInboundSecret(ctx, workspaceID, "chat_files "+op)
if err != nil {
// Either a non-NoInboundSecret read error (DB hiccup) or a mint
// failure during lazy-heal. The chat_files contract is to surface
// 503 with the RFC-#2312 reprovision hint in both cases — the user
// can't proceed and needs ops attention.
c.JSON(http.StatusServiceUnavailable, gin.H{
"error": "workspace not yet enrolled in v2 " + op + " (RFC #2312)",
"detail": "Failed to mint inbound secret. Reprovision the workspace if this persists.",
})
return "", "", false
}
if healed {
// The platform now has the secret but the workspace's
// /configs/.platform_inbound_secret is still empty until the next
// /registry/register response propagates it. User retries after
// the workspace's next heartbeat picks up the new secret (~30s).
c.JSON(http.StatusServiceUnavailable, gin.H{
"error": "workspace re-registering — please retry in 30 seconds",
"detail": "Inbound secret was just minted. Workspace will pick it up on its next heartbeat.",
"retry_after_seconds": 30,
})
return "", "", false
}
return wsURL, secret, true
}
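// Call shape at both consumers (Upload passes op="upload", Download passes
// op="download"):
//   wsURL, secret, ok := resolveWorkspaceForwardCreds(c, ctx, workspaceID, "upload")
//   if !ok {
//       return // error response already written by the helper
//   }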
// urlPathEscape percent-encodes every byte outside the RFC 3986
// unreserved set — stricter than net/url.PathEscape (which leaves
// "/" unescaped because it's legal in URL paths). Filenames must
@@ -168,42 +230,8 @@ func (h *ChatFilesHandler) Upload(c *gin.Context) {
ctx := c.Request.Context()
// Resolve workspace URL + inbound secret. Both must be present;
// either one missing means the workspace was provisioned before
// migration 044 or the row got into a bad state. Surface as 503
// rather than silently failing — operators should notice.
var wsURL string
if err := db.DB.QueryRowContext(ctx,
`SELECT COALESCE(url, '') FROM workspaces WHERE id = $1`, workspaceID,
).Scan(&wsURL); err != nil {
log.Printf("chat_files Upload: workspace lookup failed for %s: %v", workspaceID, err)
c.JSON(http.StatusNotFound, gin.H{"error": "workspace not found"})
return
}
if wsURL == "" {
c.JSON(http.StatusServiceUnavailable, gin.H{"error": "workspace url not registered yet"})
return
}
// Trust note: workspaces.url passes validateAgentURL at /registry/
// register write time, blocking SSRF-shaped URLs. We rely on that
// upstream gate rather than re-validating here. Tracked at #2316
// for follow-up: forward-time re-validation as defense-in-depth.
secret, err := wsauth.ReadPlatformInboundSecret(ctx, db.DB, workspaceID)
if err != nil {
if errors.Is(err, wsauth.ErrNoInboundSecret) {
// Workspace predates migration 044 OR the provisioner's
// secret-mint hop failed. Both are operational issues,
// not user errors. Log loudly so ops can backfill.
log.Printf("chat_files Upload: no platform_inbound_secret for %s — workspace needs reprovision (#2312)", workspaceID)
c.JSON(http.StatusServiceUnavailable, gin.H{
"error": "workspace not yet enrolled in v2 upload (RFC #2312)",
"detail": "Reprovisioning the workspace will mint the platform_inbound_secret it's missing.",
})
return
}
log.Printf("chat_files Upload: read platform_inbound_secret failed for %s: %v", workspaceID, err)
c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to read workspace secret"})
wsURL, secret, ok := resolveWorkspaceForwardCreds(c, ctx, workspaceID, "upload")
if !ok {
return
}
@ -305,34 +333,8 @@ func (h *ChatFilesHandler) Download(c *gin.Context) {
ctx := c.Request.Context()
// Resolve workspace URL + inbound secret. Same shape as Upload —
// see chat_files.go::Upload for the rationale on why each missing-
// piece path surfaces as 404 / 503.
var wsURL string
if err := db.DB.QueryRowContext(ctx,
`SELECT COALESCE(url, '') FROM workspaces WHERE id = $1`, workspaceID,
).Scan(&wsURL); err != nil {
log.Printf("chat_files Download: workspace lookup failed for %s: %v", workspaceID, err)
c.JSON(http.StatusNotFound, gin.H{"error": "workspace not found"})
return
}
if wsURL == "" {
c.JSON(http.StatusServiceUnavailable, gin.H{"error": "workspace url not registered yet"})
return
}
secret, err := wsauth.ReadPlatformInboundSecret(ctx, db.DB, workspaceID)
if err != nil {
if errors.Is(err, wsauth.ErrNoInboundSecret) {
log.Printf("chat_files Download: no platform_inbound_secret for %s — workspace needs reprovision (#2312)", workspaceID)
c.JSON(http.StatusServiceUnavailable, gin.H{
"error": "workspace not yet enrolled in v2 download (RFC #2312)",
"detail": "Reprovisioning the workspace will mint the platform_inbound_secret it's missing.",
})
return
}
log.Printf("chat_files Download: read platform_inbound_secret failed for %s: %v", workspaceID, err)
c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to read workspace secret"})
wsURL, secret, ok := resolveWorkspaceForwardCreds(c, ctx, workspaceID, "download")
if !ok {
return
}

View File

@ -112,28 +112,88 @@ func TestChatUpload_WorkspaceNotInDB(t *testing.T) {
}
}
func TestChatUpload_NoInboundSecret(t *testing.T) {
// TestChatUpload_NoInboundSecret_LazyHeal pins the lazy-heal flow
// added 2026-04-30 alongside the SaaS shared-prepare refactor:
//
// 1. Reading the workspace's platform_inbound_secret returns NULL
// (legacy row from before RFC #2312).
// 2. The handler MUST call wsauth.IssuePlatformInboundSecret (an UPDATE
// on the workspaces row) to backfill the secret, so the first upload
// after the workspace's next heartbeat succeeds without operator
// action.
// 3. Response is 503 with retry_after_seconds=30: the workspace's
// local /configs/.platform_inbound_secret is still empty (only the
// DB column was backfilled), so forwarding this request would still
// fail. The user retries once the next register response delivers
// the new secret.
//
// Pre-fix (before the lazy-heal): handlers returned 503 with
// "Reprovision the workspace" — accurate, but every legacy workspace
// would 503 forever until ops manually triggered a reprovision.
func TestChatUpload_NoInboundSecret_LazyHeal(t *testing.T) {
mock := setupTestDB(t)
setupTestRedis(t)
// Legacy row: URL set but platform_inbound_secret is NULL.
wsID := "00000000-0000-0000-0000-000000000041"
expectURL(mock, wsID, "http://127.0.0.1:1")
expectInboundSecret(mock, wsID, nil) // NULL
expectInboundSecret(mock, wsID, nil) // NULL — triggers lazy-heal
// Lazy-heal mint MUST land. If this expectation isn't matched,
// the upload handler skipped the backfill and ops would have to
// manually reprovision every legacy workspace.
mock.ExpectExec(`UPDATE workspaces SET platform_inbound_secret = \$1 WHERE id = \$2`).
WithArgs(sqlmock.AnyArg(), wsID).
WillReturnResult(sqlmock.NewResult(0, 1))
h := NewChatFilesHandler(NewTemplatesHandler(t.TempDir(), nil))
body, ct := uploadFixture(t)
c, w := makeUploadRequest(t, wsID, body, ct)
h.Upload(c)
// 503 with a retry hint: the secret was just minted platform-side
// and the workspace picks it up on its next heartbeat. NOT 200, NOT
// a silent no-bearer forward (which would land as a 401 that the
// user can't action).
if w.Code != http.StatusServiceUnavailable {
t.Errorf("expected 503 when platform_inbound_secret missing, got %d: %s", w.Code, w.Body.String())
}
// Lazy-heal-success body steers the user to retry; the failure
// body steers them to reprovision. Distinguishing them pins which
// branch ran.
if !strings.Contains(w.Body.String(), "retry") {
t.Errorf("expected lazy-heal success response (retry hint), got: %s", w.Body.String())
}
if !strings.Contains(w.Body.String(), "30") {
t.Errorf("expected retry_after_seconds=30 in body, got: %s", w.Body.String())
}
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("sqlmock expectations not met — lazy-heal mint did NOT run, regression of #2312 backfill: %v", err)
}
}
// TestChatUpload_NoInboundSecret_LazyHealFailure pins the alternate
// branch: the platform_inbound_secret is NULL AND the lazy-heal mint
// itself fails (e.g. DB unreachable). Handler must surface the
// reprovision-steering error rather than silently swallowing.
func TestChatUpload_NoInboundSecret_LazyHealFailure(t *testing.T) {
mock := setupTestDB(t)
setupTestRedis(t)
wsID := "00000000-0000-0000-0000-000000000042"
expectURL(mock, wsID, "http://127.0.0.1:1")
expectInboundSecret(mock, wsID, nil) // NULL — triggers lazy-heal
mock.ExpectExec(`UPDATE workspaces SET platform_inbound_secret = \$1 WHERE id = \$2`).
WithArgs(sqlmock.AnyArg(), wsID).
WillReturnError(sql.ErrConnDone) // mint fails
h := NewChatFilesHandler(NewTemplatesHandler(t.TempDir(), nil))
body, ct := uploadFixture(t)
c, w := makeUploadRequest(t, wsID, body, ct)
h.Upload(c)
if w.Code != http.StatusServiceUnavailable {
t.Errorf("expected 503 when lazy-heal fails, got %d: %s", w.Code, w.Body.String())
}
if !strings.Contains(w.Body.String(), "RFC #2312") {
t.Errorf("expected detail to reference RFC #2312, got: %s", w.Body.String())
t.Errorf("expected detail to reference RFC #2312 on lazy-heal failure, got: %s", w.Body.String())
}
if !strings.Contains(w.Body.String(), "Reprovision") {
t.Errorf("expected reprovision hint on mint failure, got: %s", w.Body.String())
}
}
@ -372,13 +432,22 @@ func TestChatDownload_WorkspaceNotInDB(t *testing.T) {
}
}
func TestChatDownload_NoInboundSecret(t *testing.T) {
// TestChatDownload_NoInboundSecret_LazyHeal — same lazy-heal flow
// as TestChatUpload_NoInboundSecret_LazyHeal but on the Download
// handler. Pinned separately because Upload + Download have
// independent code paths into ReadPlatformInboundSecret; a partial
// regression that healed Upload but skipped Download is the kind of
// drift we want to fail the test, not ship.
func TestChatDownload_NoInboundSecret_LazyHeal(t *testing.T) {
mock := setupTestDB(t)
setupTestRedis(t)
wsID := "00000000-0000-0000-0000-000000000051"
expectURL(mock, wsID, "http://127.0.0.1:1")
expectInboundSecret(mock, wsID, nil)
mock.ExpectExec(`UPDATE workspaces SET platform_inbound_secret = \$1 WHERE id = \$2`).
WithArgs(sqlmock.AnyArg(), wsID).
WillReturnResult(sqlmock.NewResult(0, 1))
h := NewChatFilesHandler(NewTemplatesHandler(t.TempDir(), nil))
c, w := makeDownloadRequest(t, wsID, "/workspace/foo.txt")
@ -387,8 +456,34 @@ func TestChatDownload_NoInboundSecret(t *testing.T) {
if w.Code != http.StatusServiceUnavailable {
t.Errorf("expected 503 when platform_inbound_secret missing, got %d: %s", w.Code, w.Body.String())
}
if !strings.Contains(w.Body.String(), "retry") {
t.Errorf("expected lazy-heal success response (retry hint), got: %s", w.Body.String())
}
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("sqlmock expectations not met — Download lazy-heal mint did NOT run: %v", err)
}
}
func TestChatDownload_NoInboundSecret_LazyHealFailure(t *testing.T) {
mock := setupTestDB(t)
setupTestRedis(t)
wsID := "00000000-0000-0000-0000-000000000052"
expectURL(mock, wsID, "http://127.0.0.1:1")
expectInboundSecret(mock, wsID, nil)
mock.ExpectExec(`UPDATE workspaces SET platform_inbound_secret = \$1 WHERE id = \$2`).
WithArgs(sqlmock.AnyArg(), wsID).
WillReturnError(sql.ErrConnDone)
h := NewChatFilesHandler(NewTemplatesHandler(t.TempDir(), nil))
c, w := makeDownloadRequest(t, wsID, "/workspace/foo.txt")
h.Download(c)
if w.Code != http.StatusServiceUnavailable {
t.Errorf("expected 503 when lazy-heal fails, got %d: %s", w.Code, w.Body.String())
}
if !strings.Contains(w.Body.String(), "RFC #2312") {
t.Errorf("expected detail to reference RFC #2312, got: %s", w.Body.String())
t.Errorf("expected detail to reference RFC #2312 on lazy-heal failure, got: %s", w.Body.String())
}
}

View File

@ -449,14 +449,21 @@ func (h *RegistryHandler) Register(c *gin.Context) {
// outbound auth_token's "issue once" lifecycle. Returning it here is
// the only delivery path for SaaS, where the platform's CP provisioner
// has no volume to write into.
if secret, secretErr := wsauth.ReadPlatformInboundSecret(ctx, db.DB, payload.ID); secretErr == nil {
//
// Lazy-heal (2026-04-30): if the column is NULL (legacy workspace
// provisioned before the shared-mint refactor), mint inline and
// include in the response. Without this, legacy workspaces would
// need two round-trips before chat upload works — chat_files
// lazy-heals platform-side on first attempt, then the workspace
// must heartbeat to receive the freshly-minted secret.
// Heal-on-register collapses that to one round-trip.
if secret, _, healErr := readOrLazyHealInboundSecret(ctx, payload.ID, "Registry"); healErr == nil {
response["platform_inbound_secret"] = secret
} else if !errors.Is(secretErr, wsauth.ErrNoInboundSecret) {
// ErrNoInboundSecret is the expected case for legacy workspaces
// that predate migration 044 — quiet. Other errors (DB hiccup, etc.)
// log loud so ops notices.
log.Printf("Registry: read platform_inbound_secret for %s failed: %v", payload.ID, secretErr)
}
// Errors are non-fatal here — the workspace is online and can serve
// non-/internal traffic. The lazy-heal helper has already logged
// whichever sub-step failed (read or mint). If the secret never lands,
// chat upload surfaces the issue loudly with the RFC-#2312 hint.
c.JSON(http.StatusOK, response)
}

View File

@ -991,7 +991,19 @@ func TestRegister_ReturnsPlatformInboundSecret_RFC2312_PRF(t *testing.T) {
// that predate migration 044 (NULL platform_inbound_secret column) still
// get a successful registration — the field is just omitted from the
// response. The Register handler logs the absence quietly.
func TestRegister_NoInboundSecret_OmitsField(t *testing.T) {
// TestRegister_NoInboundSecret_LazyHeals — legacy workspace path:
// when ReadPlatformInboundSecret returns ErrNoInboundSecret (NULL
// column), Register MUST mint inline and include the freshly-minted
// secret in the response. Without this, legacy workspaces would need
// two round-trips before chat upload works (chat_files heals
// platform-side → workspace must heartbeat → next chat upload).
//
// Pre-fix, this test asserted the field was ABSENT; that was correct
// for the pre-heal behavior, but it kept passing even with the
// register-time lazy-heal change, because sqlmock's unmatched UPDATE
// made the mint fail and fall back to omitting the field. Splitting
// into success + failure tests pins both branches.
func TestRegister_NoInboundSecret_LazyHeals(t *testing.T) {
mock := setupTestDB(t)
setupTestRedis(t)
broadcaster := newTestBroadcaster()
@ -1002,7 +1014,6 @@ func TestRegister_NoInboundSecret_OmitsField(t *testing.T) {
mock.ExpectQuery("SELECT COUNT\\(\\*\\) FROM workspace_auth_tokens").
WithArgs(wsID).
WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(0))
// resolveDeliveryMode — no row yet, default push (#2339).
mock.ExpectQuery("SELECT delivery_mode FROM workspaces WHERE id").
WithArgs(wsID).
WillReturnError(sql.ErrNoRows)
@ -1019,6 +1030,12 @@ func TestRegister_NoInboundSecret_OmitsField(t *testing.T) {
mock.ExpectQuery(`SELECT platform_inbound_secret FROM workspaces WHERE id = \$1`).
WithArgs(wsID).
WillReturnRows(sqlmock.NewRows([]string{"platform_inbound_secret"}).AddRow(nil))
// Lazy-heal mint MUST land. If this expectation isn't matched, the
// register handler skipped backfill and legacy workspaces would
// need 2 round-trips before chat upload works.
mock.ExpectExec(`UPDATE workspaces SET platform_inbound_secret = \$1 WHERE id = \$2`).
WithArgs(sqlmock.AnyArg(), wsID).
WillReturnResult(sqlmock.NewResult(0, 1))
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
@ -1029,12 +1046,72 @@ func TestRegister_NoInboundSecret_OmitsField(t *testing.T) {
handler.Register(c)
if w.Code != http.StatusOK {
t.Fatalf("expected 200 even without inbound secret, got %d", w.Code)
t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String())
}
var resp map[string]interface{}
_ = json.Unmarshal(w.Body.Bytes(), &resp)
secret, present := resp["platform_inbound_secret"]
if !present {
t.Errorf("expected platform_inbound_secret to be PRESENT (lazy-healed), got response: %v", resp)
}
if s, ok := secret.(string); !ok || s == "" {
t.Errorf("expected non-empty platform_inbound_secret string, got %T %v", secret, secret)
}
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("sqlmock expectations not met — register-time lazy-heal mint did NOT run, regression of #2312 backfill: %v", err)
}
}
// TestRegister_NoInboundSecret_LazyHealMintFailureOmitsField pins the
// alternate branch: if the lazy-heal mint itself fails (DB hiccup),
// Register MUST still respond 200 (workspace is online) but omit the
// field. The next register call will retry the heal.
func TestRegister_NoInboundSecret_LazyHealMintFailureOmitsField(t *testing.T) {
mock := setupTestDB(t)
setupTestRedis(t)
broadcaster := newTestBroadcaster()
handler := NewRegistryHandler(broadcaster)
const wsID = "00000000-0000-0000-0000-000000002313"
mock.ExpectQuery("SELECT COUNT\\(\\*\\) FROM workspace_auth_tokens").
WithArgs(wsID).
WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(0))
mock.ExpectQuery("SELECT delivery_mode FROM workspaces WHERE id").
WithArgs(wsID).
WillReturnError(sql.ErrNoRows)
mock.ExpectExec("INSERT INTO workspaces").WillReturnResult(sqlmock.NewResult(0, 1))
mock.ExpectQuery("SELECT url FROM workspaces WHERE id").
WithArgs(wsID).
WillReturnRows(sqlmock.NewRows([]string{"url"}).AddRow("http://localhost:9100"))
mock.ExpectExec("INSERT INTO structure_events").WillReturnResult(sqlmock.NewResult(0, 1))
mock.ExpectQuery("SELECT COUNT\\(\\*\\) FROM workspace_auth_tokens").
WithArgs(wsID).
WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(0))
mock.ExpectExec("INSERT INTO workspace_auth_tokens").WillReturnResult(sqlmock.NewResult(1, 1))
mock.ExpectQuery(`SELECT platform_inbound_secret FROM workspaces WHERE id = \$1`).
WithArgs(wsID).
WillReturnRows(sqlmock.NewRows([]string{"platform_inbound_secret"}).AddRow(nil))
// Mint fails — handler must NOT 500; just omit field + log.
mock.ExpectExec(`UPDATE workspaces SET platform_inbound_secret = \$1 WHERE id = \$2`).
WithArgs(sqlmock.AnyArg(), wsID).
WillReturnError(sql.ErrConnDone)
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
c.Request = httptest.NewRequest("POST", "/registry/register",
bytes.NewBufferString(`{"id":"`+wsID+`","url":"http://localhost:9100","agent_card":{"name":"x"}}`))
c.Request.Header.Set("Content-Type", "application/json")
handler.Register(c)
if w.Code != http.StatusOK {
t.Fatalf("expected 200 even when lazy-heal fails (workspace is online), got %d: %s", w.Code, w.Body.String())
}
var resp map[string]interface{}
_ = json.Unmarshal(w.Body.Bytes(), &resp)
if _, present := resp["platform_inbound_secret"]; present {
t.Errorf("expected platform_inbound_secret to be ABSENT for legacy workspace, got: %v", resp["platform_inbound_secret"])
t.Errorf("expected platform_inbound_secret to be ABSENT when mint failed, got: %v", resp["platform_inbound_secret"])
}
}

View File

@ -1,7 +1,6 @@
package handlers
import (
"context"
"database/sql"
"encoding/json"
"log"
@ -11,23 +10,32 @@ import (
"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/events"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/models"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/provisioner"
"github.com/gin-gonic/gin"
"github.com/google/uuid"
"gopkg.in/yaml.v3"
)
// TeamHandler delegates child-workspace provisioning to wh so child
// workspaces go through the same prepare/mint/preflight pipeline that
// every other provision path uses. Pre-fix (issue #2367): Expand
// directly invoked h.provisioner.Start which skipped mintWorkspaceSecrets,
// leaving every team-expanded child with NULL platform_inbound_secret +
// NULL auth_token — same drift class as the SaaS bug fixed in #2366.
type TeamHandler struct {
broadcaster *events.Broadcaster
provisioner *provisioner.Provisioner
wh *WorkspaceHandler
platformURL string
configsDir string
}
func NewTeamHandler(b *events.Broadcaster, p *provisioner.Provisioner, platformURL, configsDir string) *TeamHandler {
func NewTeamHandler(b *events.Broadcaster, p *provisioner.Provisioner, wh *WorkspaceHandler, platformURL, configsDir string) *TeamHandler {
return &TeamHandler{
broadcaster: b,
provisioner: p,
wh: wh,
platformURL: platformURL,
configsDir: configsDir,
}
@ -116,26 +124,25 @@ func (h *TeamHandler) Expand(c *gin.Context) {
"parent_id": parentID,
})
// Provision if template exists
if h.provisioner != nil && sub.Config != "" {
// Delegate child-workspace provisioning to the shared
// provision pipeline. Issue #2367: previously Expand called
// h.provisioner.Start directly, bypassing mintWorkspaceSecrets
// and every other preflight (secrets, env mutators, identity
// injection, missing-env). That left every child with NULL
// platform_inbound_secret and never-issued auth_token. Now
// children go through the same provisionWorkspace path as
// Create/Restart, so adding a future provision-time step
// automatically covers Expand too.
if h.wh != nil && sub.Config != "" {
templatePath := filepath.Join(h.configsDir, sub.Config)
if _, err := os.Stat(templatePath); err == nil {
pluginsPath, _ := filepath.Abs(filepath.Join(h.configsDir, "..", "plugins"))
go func(wID, tPath, pPath string, t int) {
provCtx, cancel := context.WithTimeout(context.Background(), provisioner.ProvisionTimeout)
defer cancel()
cfg := provisioner.WorkspaceConfig{
WorkspaceID: wID,
TemplatePath: tPath,
PluginsPath: pPath,
Tier: t,
EnvVars: map[string]string{"PARENT_ID": parentID},
PlatformURL: h.platformURL,
}
if _, err := h.provisioner.Start(provCtx, cfg); err != nil {
log.Printf("Expand: provision failed for %s: %v", wID, err)
}
}(childID, templatePath, pluginsPath, tier)
parent := parentID // copy for closure
go h.wh.provisionWorkspace(childID, templatePath, nil, models.CreateWorkspacePayload{
Name: childName,
Role: sub.Role,
Tier: tier,
ParentID: &parent,
})
}
}

View File

@ -34,7 +34,7 @@ func TestTeamCollapse_NoChildren(t *testing.T) {
mock := setupTestDB(t)
setupTestRedis(t)
broadcaster := newTestBroadcaster()
handler := NewTeamHandler(broadcaster, nil, "http://localhost:8080", "/tmp/configs")
handler := NewTeamHandler(broadcaster, nil, nil, "http://localhost:8080", "/tmp/configs")
// No children
mock.ExpectQuery("SELECT id, name FROM workspaces WHERE parent_id").
@ -66,7 +66,7 @@ func TestTeamCollapse_WithChildren(t *testing.T) {
mock := setupTestDB(t)
setupTestRedis(t)
broadcaster := newTestBroadcaster()
handler := NewTeamHandler(broadcaster, nil, "http://localhost:8080", "/tmp/configs")
handler := NewTeamHandler(broadcaster, nil, nil, "http://localhost:8080", "/tmp/configs")
// Two children
mock.ExpectQuery("SELECT id, name FROM workspaces WHERE parent_id").
@ -122,7 +122,7 @@ func TestTeamCollapse_WithChildren(t *testing.T) {
func TestTeamExpand_WorkspaceNotFound(t *testing.T) {
mock := setupTestDB(t)
setupTestRedis(t)
handler := NewTeamHandler(newTestBroadcaster(), nil, "http://localhost:8080", "/tmp/configs")
handler := NewTeamHandler(newTestBroadcaster(), nil, nil, "http://localhost:8080", "/tmp/configs")
mock.ExpectQuery("SELECT name, tier, status FROM workspaces WHERE id").
WithArgs("ws-missing").
@ -143,7 +143,7 @@ func TestTeamExpand_WorkspaceNotFound(t *testing.T) {
func TestTeamExpand_NoConfigFound(t *testing.T) {
mock := setupTestDB(t)
setupTestRedis(t)
handler := NewTeamHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir())
handler := NewTeamHandler(newTestBroadcaster(), nil, nil, "http://localhost:8080", t.TempDir())
mock.ExpectQuery("SELECT name, tier, status FROM workspaces WHERE id").
WithArgs("ws-1").
@ -167,7 +167,7 @@ func TestTeamExpand_EmptySubWorkspaces(t *testing.T) {
setupTestRedis(t)
configDir := makeTeamConfigDir(t, "myagent", "name: MyAgent\nsub_workspaces: []\n")
handler := NewTeamHandler(newTestBroadcaster(), nil, "http://localhost:8080", configDir)
handler := NewTeamHandler(newTestBroadcaster(), nil, nil, "http://localhost:8080", configDir)
mock.ExpectQuery("SELECT name, tier, status FROM workspaces WHERE id").
WithArgs("ws-1").
@ -199,7 +199,7 @@ sub_workspaces:
role: code-reviewer
`
configDir := makeTeamConfigDir(t, "teamlead", yaml)
handler := NewTeamHandler(broadcaster, nil, "http://localhost:8080", configDir)
handler := NewTeamHandler(broadcaster, nil, nil, "http://localhost:8080", configDir)
mock.ExpectQuery("SELECT name, tier, status FROM workspaces WHERE id").
WithArgs("ws-lead").

View File

@ -0,0 +1,81 @@
# A2A protocol replay corpus
Captures every A2A JSON-RPC message shape the platform has ever
accepted, so a future PR that bumps a-2-a-sdk or modifies
`normalizeA2APayload` can be tested against historical inputs
before merging.
This is the gate that would have caught the 2026-04-29 v0.2 → v0.3
silent-drop bug (PR #2349). The bug shipped because the SDK bump
PR didn't replay v0.2-shaped inputs against the new code; the
shape-mismatch surfaced only in production when the receiver's
Pydantic validator silently rejected inbound messages.
## Layout
- `valid/` — every shape that has ever been accepted. Each PR that
changes the protocol code OR bumps the SDK pin runs every entry
through `normalizeA2APayload` and asserts it parses without
error. Removing an entry from this directory is a breaking
change and requires explicit operator approval.
- `invalid/` — shapes that MUST be rejected with the right error
type. These pin the rejection contract — a future PR that
silently accepts a malformed shape (because, say, it added a
permissive default) breaks the gate.
## When to add a corpus entry
- A new SDK version is released and adds a shape we want to support.
Capture a representative payload (PII-scrubbed) before bumping
the SDK pin. The same PR that bumps the SDK adds the corpus
entry AND any required compat shim in `normalizeA2APayload`.
- Production logs surface a shape we hadn't seen before. Capture
it after PII-scrubbing. Even if it is currently rejected, decide
whether to support it (add it to `valid/`) or to keep rejecting it
loudly (add it to `invalid/`).
## When NOT to add an entry
- Test scaffolding. The corpus is the SHAPE record, not unit-test
coverage. Use the regular handler tests for functional coverage.
- Hypothetical future shapes. Add only what we have evidence for
— either a real payload or an SDK release note.
## Removal policy
Removing an entry from `valid/` is a breaking change for any sender
emitting that shape. Requires:
1. A migration plan (deprecation window, sender notification).
2. A separate PR with a single-line removal + a comment explaining
the deprecation timeline.
3. Approval from someone outside the PR author's team.
Removing an entry from `invalid/` widens what we accept. Lower bar:
just verify the new behavior is intentional and the corpus entry
moves to `valid/`.
## Anatomy of a corpus entry
```json
{
"_comment": "v0.2 string content — basic text message via message/send",
"_added": "2026-04-30",
"_source": "PR #2349 incident, real payload from sender workspace",
"jsonrpc": "2.0",
"method": "message/send",
"id": "test-id",
"params": {
"message": {
"content": "hello"
}
}
}
```
`_comment`, `_added`, `_source` are documentation that the test
loader strips before passing the payload to the parser. They are
required for every entry.
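For orientation, a minimal replay harness might look like the sketch below. It assumes the corpus lives under `testdata/a2a_corpus/{valid,invalid}` and that `normalizeA2APayload` takes raw JSON bytes and returns an error on rejection; the path, package, and exact signature are illustrative, not the shipped gate, so adapt them to the real code.
```go
package handlers // placeholder: this sketch belongs next to normalizeA2APayload

import (
	"encoding/json"
	"os"
	"path/filepath"
	"strings"
	"testing"
)

// stripCorpusMeta drops the documentation-only keys (_comment, _added,
// _source, _expect_*) before the payload reaches the parser.
func stripCorpusMeta(entry map[string]interface{}) map[string]interface{} {
	for k := range entry {
		if strings.HasPrefix(k, "_") {
			delete(entry, k)
		}
	}
	return entry
}

func replayCorpusDir(t *testing.T, dir string, wantReject bool) {
	t.Helper()
	files, err := filepath.Glob(filepath.Join(dir, "*.json"))
	if err != nil || len(files) == 0 {
		t.Fatalf("no corpus entries under %s: %v", dir, err)
	}
	for _, f := range files {
		raw, readErr := os.ReadFile(f)
		if readErr != nil {
			t.Fatalf("read %s: %v", f, readErr)
		}
		var entry map[string]interface{}
		if jsonErr := json.Unmarshal(raw, &entry); jsonErr != nil {
			t.Fatalf("corpus entry %s is not valid JSON: %v", f, jsonErr)
		}
		payload, _ := json.Marshal(stripCorpusMeta(entry))
		_, normErr := normalizeA2APayload(payload) // assumed signature: raw JSON in, error on rejection
		if wantReject && normErr == nil {
			t.Errorf("%s: expected rejection, got acceptance", f)
		}
		if !wantReject && normErr != nil {
			t.Errorf("%s: expected acceptance, got: %v", f, normErr)
		}
	}
}

func TestA2ACorpusReplay(t *testing.T) {
	replayCorpusDir(t, "testdata/a2a_corpus/valid", false)
	replayCorpusDir(t, "testdata/a2a_corpus/invalid", true)
}
```
Entries in `invalid/` could additionally assert that the returned error message matches `_expect_error`, so a failing run names the exact rejection that regressed.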

View File

@ -0,0 +1,17 @@
{
"_comment": "content as bool is not a valid type. Same rejection class as integer — pinned separately so the test failure message identifies WHICH wrong type is the regression.",
"_added": "2026-04-30",
"_source": "Synthetic — type-rejection branch coverage",
"_expect_error": "invalid params.message.content type",
"_expect_status": 400,
"jsonrpc": "2.0",
"method": "message/send",
"id": "msg-test-invalid-bool",
"params": {
"message": {
"messageId": "msg-bad-3",
"role": "user",
"content": true
}
}
}

View File

@ -0,0 +1,17 @@
{
"_comment": "content as integer is not a valid type. The compat shim accepts string (v0.2 plain text) or list (v0.2 Part list); anything else is rejected with HTTP 400 and a hint.",
"_added": "2026-04-30",
"_source": "Synthetic — pins the type-rejection branch",
"_expect_error": "invalid params.message.content type",
"_expect_status": 400,
"jsonrpc": "2.0",
"method": "message/send",
"id": "msg-test-invalid-int",
"params": {
"message": {
"messageId": "msg-bad-2",
"role": "user",
"content": 42
}
}
}

View File

@ -0,0 +1,16 @@
{
"_comment": "params.message has neither content nor parts. Pre-PR-2349 the SDK silently rejected this post-handler-dispatch and the rejection was invisible to the sender. Now normalizeA2APayload returns a loud HTTP 400.",
"_added": "2026-04-30",
"_source": "PR #2349 incident edge case",
"_expect_error": "params.message must contain either 'parts' (v0.3) or 'content' (v0.2 compat)",
"_expect_status": 400,
"jsonrpc": "2.0",
"method": "message/send",
"id": "msg-test-invalid-empty",
"params": {
"message": {
"messageId": "msg-bad-1",
"role": "user"
}
}
}

View File

@ -0,0 +1,15 @@
{
"_comment": "Body without an outer jsonrpc envelope. The normalizer wraps it in {jsonrpc:2.0, id:<uuid>, method, params}. Some legacy senders post the bare params + method without the envelope.",
"_added": "2026-04-30",
"_source": "Pre-RFC sender behavior",
"method": "message/send",
"params": {
"message": {
"messageId": "msg-8",
"role": "user",
"parts": [
{"kind": "text", "text": "no envelope here"}
]
}
}
}

View File

@ -0,0 +1,18 @@
{
"_comment": "v0.2 with content already a list (e.g. multimodal Part shape). Some pre-v0.3 senders constructed the list form even before the v0.3 rename — usually because they were copying from working v0.3 examples but still pinned to a-2-a-sdk v0.2 by mistake. The compat shim accepts both list and string for content.",
"_added": "2026-04-30",
"_source": "Synthetic — covers normalizeA2APayload's []interface{} switch case",
"jsonrpc": "2.0",
"method": "message/send",
"id": "msg-test-v02-list",
"params": {
"message": {
"messageId": "msg-2",
"role": "user",
"content": [
{"kind": "text", "text": "hello"},
{"kind": "text", "text": "world"}
]
}
}
}

View File

@ -0,0 +1,15 @@
{
"_comment": "v0.2 string content — the most common pre-rename shape. Caused the 2026-04-29 silent-drop bug: a-2-a-sdk v0.3 renamed params.message.content (string) -> params.message.parts (Part list). Shipped PR #2349 to compat-shim this in normalizeA2APayload.",
"_added": "2026-04-30",
"_source": "PR #2349 incident reproduction",
"jsonrpc": "2.0",
"method": "message/send",
"id": "msg-test-v02-string",
"params": {
"message": {
"messageId": "msg-1",
"role": "user",
"content": "hello world"
}
}
}

View File

@ -0,0 +1,14 @@
{
"_comment": "v0.2 without messageId — normalizeA2APayload must add a generated UUID. Some early senders omitted messageId entirely and relied on the platform to generate one.",
"_added": "2026-04-30",
"_source": "Synthetic, covers the messageId-injection branch",
"jsonrpc": "2.0",
"method": "message/send",
"id": "msg-test-v02-no-id",
"params": {
"message": {
"role": "user",
"content": "no message id supplied"
}
}
}

View File

@ -0,0 +1,17 @@
{
"_comment": "10KB text part. Pins that the normalizer doesn't choke on large payloads (no buffer assumption). Real briefs from the Brief-Pipeline experiment were occasionally this size.",
"_added": "2026-04-30",
"_source": "Synthetic — large-payload smoke",
"jsonrpc": "2.0",
"method": "message/send",
"id": "msg-test-long",
"params": {
"message": {
"messageId": "msg-10",
"role": "user",
"parts": [
{"kind": "text", "text": "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum."}
]
}
}
}

View File

@ -0,0 +1,18 @@
{
"_comment": "v0.3 with multiple text parts. Used by senders that interleave system + user text or that segment a long input.",
"_added": "2026-04-30",
"_source": "Synthetic — pins multi-Part list handling",
"jsonrpc": "2.0",
"method": "message/send",
"id": "msg-test-v03-multi",
"params": {
"message": {
"messageId": "msg-4",
"role": "user",
"parts": [
{"kind": "text", "text": "Context: you are a helpful assistant."},
{"kind": "text", "text": "Question: what is 2+2?"}
]
}
}
}

View File

@ -0,0 +1,17 @@
{
"_comment": "v0.3 canonical: params.message.parts is a list of Part objects. This is what the platform's normalizer produces internally and what the a-2-a-sdk v0.3 Pydantic validator expects.",
"_added": "2026-04-30",
"_source": "a-2-a-sdk v0.3 release notes example",
"jsonrpc": "2.0",
"method": "message/send",
"id": "msg-test-v03-parts",
"params": {
"message": {
"messageId": "msg-3",
"role": "user",
"parts": [
{"kind": "text", "text": "hello world"}
]
}
}
}

View File

@ -0,0 +1,18 @@
{
"_comment": "v0.3 with contextId — multi-turn conversation thread identifier. The a-2-a-sdk groups messages with the same contextId into a single conversation history.",
"_added": "2026-04-30",
"_source": "Conversation-thread feature path",
"jsonrpc": "2.0",
"method": "message/send",
"id": "msg-test-v03-context",
"params": {
"message": {
"messageId": "msg-6",
"contextId": "ctx-conversation-7",
"role": "user",
"parts": [
{"kind": "text", "text": "Continuing from before — what did you find?"}
]
}
}
}

View File

@ -0,0 +1,25 @@
{
"_comment": "v0.3 multimodal: text Part + file Part. Used by chat-attachment flow (PR #114).",
"_added": "2026-04-30",
"_source": "Real shape from agent-to-agent file attachment path",
"jsonrpc": "2.0",
"method": "message/send",
"id": "msg-test-v03-file",
"params": {
"message": {
"messageId": "msg-5",
"role": "user",
"parts": [
{"kind": "text", "text": "Please review the attached log."},
{
"kind": "file",
"file": {
"name": "incident.log",
"mimeType": "text/plain",
"uri": "https://staging-api.moleculesai.app/files/abc123"
}
}
]
}
}
}

View File

@ -0,0 +1,17 @@
{
"_comment": "v0.3 with method=message/stream (vs message/send). The streaming path is otherwise identical at the request shape level — it differs in the response (event stream vs single response). Pins that the normalizer is method-agnostic.",
"_added": "2026-04-30",
"_source": "Streaming dispatch path",
"jsonrpc": "2.0",
"method": "message/stream",
"id": "msg-test-v03-stream",
"params": {
"message": {
"messageId": "msg-7",
"role": "user",
"parts": [
{"kind": "text", "text": "Stream me a long answer please."}
]
}
}
}

View File

@ -0,0 +1,17 @@
{
"_comment": "Unicode + emoji content. Pins that JSON encoding round-trips correctly through normalizeA2APayload's Marshal/Unmarshal pair. The 2026-03 emoji-truncation bug was at a different layer (display) but the underlying JSON path is the same.",
"_added": "2026-04-30",
"_source": "Synthetic — covers unicode round-trip",
"jsonrpc": "2.0",
"method": "message/send",
"id": "msg-test-unicode",
"params": {
"message": {
"messageId": "msg-9",
"role": "user",
"parts": [
{"kind": "text", "text": "Hello 你好 こんにちは 🚀 — ünicode test"}
]
}
}
}

View File

@ -28,130 +28,19 @@ func (h *WorkspaceHandler) provisionWorkspaceOpts(workspaceID, templatePath stri
ctx, cancel := context.WithTimeout(context.Background(), provisioner.ProvisionTimeout)
defer cancel()
// Load global secrets first, then workspace-specific secrets (which override globals).
envVars := map[string]string{}
// 1. Global secrets (platform-wide defaults). Uses DecryptVersioned
// so plaintext rows written before encryption was enabled (#85)
// keep working. A decrypt failure aborts provisioning — silent skip
// used to manifest as opaque "missing OAuth token" preflight crashes.
globalRows, globalErr := db.DB.QueryContext(ctx,
`SELECT key, encrypted_value, encryption_version FROM global_secrets`)
if globalErr == nil {
defer globalRows.Close()
for globalRows.Next() {
var k string
var v []byte
var ver int
if globalRows.Scan(&k, &v, &ver) == nil {
decrypted, decErr := crypto.DecryptVersioned(v, ver)
if decErr != nil {
log.Printf("Provisioner: FATAL — failed to decrypt global secret %s (version=%d): %v — aborting provision of workspace %s", k, ver, decErr, workspaceID)
h.broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_PROVISION_FAILED", workspaceID, map[string]interface{}{
"error": "failed to decrypt global secret",
})
db.DB.ExecContext(ctx, `UPDATE workspaces SET status = 'failed', updated_at = now() WHERE id = $1`, workspaceID)
return
}
envVars[k] = string(decrypted)
}
}
}
// 2. Workspace-specific secrets (override globals with same key)
rows, err := db.DB.QueryContext(ctx,
`SELECT key, encrypted_value, encryption_version FROM workspace_secrets WHERE workspace_id = $1`, workspaceID)
if err == nil {
defer rows.Close()
for rows.Next() {
var k string
var v []byte
var ver int
if rows.Scan(&k, &v, &ver) == nil {
decrypted, decErr := crypto.DecryptVersioned(v, ver)
if decErr != nil {
log.Printf("Provisioner: FATAL — failed to decrypt workspace secret %s (version=%d) for %s: %v — aborting provision", k, ver, workspaceID, decErr)
h.broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_PROVISION_FAILED", workspaceID, map[string]interface{}{
"error": "failed to decrypt workspace secret",
})
db.DB.ExecContext(ctx, `UPDATE workspaces SET status = 'failed', updated_at = now() WHERE id = $1`, workspaceID)
return
}
envVars[k] = string(decrypted)
}
}
}
pluginsPath, _ := filepath.Abs(filepath.Join(h.configsDir, "..", "plugins"))
awarenessNamespace := h.loadAwarenessNamespace(ctx, workspaceID)
// Per-agent git identity (Option 3 of agent-separation rollout).
// Sets GIT_AUTHOR_* / GIT_COMMITTER_* so commits from each workspace
// carry a distinct author in `git log` / `git blame` — instead of
// every agent appearing as whoever the shared PAT belongs to. PR +
// issue authorship is still tied to GITHUB_TOKEN (shared PAT); that
// gets solved by the GitHub App migration (Option 1, follow-up PR).
// Runs after secret loads so an operator can still override via a
// workspace_secret named GIT_AUTHOR_NAME if they want custom identity.
applyAgentGitIdentity(envVars, payload.Name)
applyRuntimeModelEnv(envVars, payload.Runtime, payload.Model)
// Propagate the workspace's role into env so role-aware plugins
// (gh-identity — molecule-core#1957) can read it without the
// plugin interface having to carry the full payload. Role is
// cosmetic metadata — no auth weight on it — safe to surface as env.
if payload.Role != "" {
envVars["MOLECULE_AGENT_ROLE"] = payload.Role
}
// Plugin extension point: run any registered EnvMutators (e.g.
// github-app-auth, vault-secrets) AFTER built-in identity injection so
// plugins can override or augment GIT_AUTHOR_*, GITHUB_TOKEN, etc.
// A failure here aborts provisioning — a missing GitHub App token
// would manifest later as opaque "git push 401" loops, and the agent
// never recovers. Failing fast here surfaces the cause to the operator.
if err := h.envMutators.Run(ctx, workspaceID, envVars); err != nil {
log.Printf("Provisioner: env mutator chain failed for %s: %v", workspaceID, err)
// F1086 / #1206: broadcast and db last_sample_error use generic messages —
// env mutator errors (missing tokens, vault paths, etc.) can include
// internal credential URIs and file paths that must not reach the caller.
h.broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_PROVISION_FAILED", workspaceID, map[string]interface{}{
"error": "plugin env mutator chain failed",
})
if _, dbErr := db.DB.ExecContext(ctx,
`UPDATE workspaces SET status = 'failed', last_sample_error = $2, updated_at = now() WHERE id = $1`,
workspaceID, "plugin env mutator chain failed"); dbErr != nil {
log.Printf("Provisioner: failed to mark workspace %s as failed after mutator error: %v", workspaceID, dbErr)
}
prepared, abort := h.prepareProvisionContext(ctx, workspaceID, templatePath, configFiles, payload, resetClaudeSession)
if prepared == nil {
log.Printf("Provisioner: prepare failed for %s: %s", workspaceID, abort.Msg)
h.markProvisionFailed(ctx, workspaceID, abort.Msg, abort.Extra)
return
}
// Preflight: refuse to launch when config.yaml declares required env vars
// that are not set. Without this, a missing CLAUDE_CODE_OAUTH_TOKEN (or
// similar) crashes the in-container preflight, the container never calls
// /registry/register, and the workspace sits in `provisioning` until a
// sweeper flips it or the user retries. Failing fast here gives the user
// an immediate, actionable error in the Events tab.
if missing := missingRequiredEnv(configFiles, envVars); len(missing) > 0 {
msg := formatMissingEnvError(missing)
log.Printf("Provisioner: %s (workspace=%s)", msg, workspaceID)
if _, dbErr := db.DB.ExecContext(ctx,
`UPDATE workspaces SET status = 'failed', last_sample_error = $2, updated_at = now() WHERE id = $1`,
workspaceID, msg); dbErr != nil {
log.Printf("Provisioner: failed to mark workspace %s as failed: %v", workspaceID, dbErr)
}
h.broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_PROVISION_FAILED", workspaceID, map[string]interface{}{
"error": msg,
"missing": missing,
})
return
}
cfg := h.buildProvisionerConfig(workspaceID, templatePath, configFiles, payload, envVars, pluginsPath, awarenessNamespace)
cfg.ResetClaudeSession = resetClaudeSession // #12
cfg := prepared.Config
// Preflight #17: detect + auto-recover the "empty config volume" crashloop.
// Docker-specific — SaaS mode delegates volume management to the
// control-plane EC2 launcher and never has a local Docker volume to
// probe. Runs AFTER prepareProvisionContext because the recovered
// template still needs the same env-and-cfg surface the prepare built.
//
// When the caller supplies neither a template dir nor in-memory configFiles
// (the auto-restart path), probe the existing Docker named volume. If the
@ -194,7 +83,7 @@ func (h *WorkspaceHandler) provisionWorkspaceOpts(workspaceID, templatePath stri
workspaceID, filepath.Base(runtimeTemplate))
templatePath = runtimeTemplate
// Rebuild cfg with the recovered template path so Start() sees it.
cfg = h.buildProvisionerConfig(workspaceID, templatePath, configFiles, payload, envVars, pluginsPath, awarenessNamespace)
cfg = h.buildProvisionerConfig(workspaceID, templatePath, configFiles, payload, prepared.EnvVars, prepared.PluginsPath, prepared.AwarenessNamespace)
cfg.ResetClaudeSession = resetClaudeSession
recovered = true
break
@ -209,46 +98,27 @@ func (h *WorkspaceHandler) provisionWorkspaceOpts(workspaceID, templatePath stri
if !recovered {
msg := fmt.Sprintf("cannot start workspace %s: no config.yaml source and config volume is empty — delete the workspace or provide a template", workspaceID)
log.Printf("Provisioner: %s", msg)
if _, dbErr := db.DB.ExecContext(ctx,
`UPDATE workspaces SET status = 'failed', last_sample_error = $2, updated_at = now() WHERE id = $1`,
workspaceID, msg); dbErr != nil {
log.Printf("Provisioner: failed to mark workspace %s as failed: %v", workspaceID, dbErr)
}
h.broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_PROVISION_FAILED", workspaceID, map[string]interface{}{
"error": msg,
})
h.markProvisionFailed(ctx, workspaceID, msg, nil)
return
}
}
}
// Issue/rotate the workspace auth token and inject the plaintext into the
// config volume so the workspace always has a valid bearer credential on
// disk, even after a container rebuild wiped the volume (issue #418).
//
// We must rotate (revoke-then-issue) rather than reuse because the DB only
// stores sha256(plaintext) — we cannot reconstruct the original token to
// write it back. The new plaintext is written into /configs/.auth_token via
// WriteFilesToContainer, which runs immediately after ContainerStart and
// wins the race against the Python adapter's startup time (~1-2 s).
h.issueAndInjectToken(ctx, workspaceID, &cfg)
h.issueAndInjectInboundSecret(ctx, workspaceID, &cfg)
// Issue/rotate the workspace auth token + platform→workspace inbound secret
// and inject both into the config volume. See mintWorkspaceSecrets doc for
// the shared invariant — every provision path MUST mint here. Plaintext
// is written into /configs/.auth_token + /configs/.platform_inbound_secret
// via WriteFilesToContainer, which runs immediately after ContainerStart
// and wins the race against the Python adapter's startup time (~1-2 s).
h.mintWorkspaceSecrets(ctx, workspaceID, &cfg)
url, err := h.provisioner.Start(ctx, cfg)
if err != nil {
// F1086 / #1206: persist a generic message so the canvas and
// GET /workspaces/:id expose something actionable without leaking
// docker/error internals (image pull messages, volume paths, etc.).
errMsg := "workspace start failed"
log.Printf("Provisioner: %s for %s: %v", errMsg, workspaceID, err)
if _, dbErr := db.DB.ExecContext(ctx,
`UPDATE workspaces SET status = 'failed', last_sample_error = $2, updated_at = now() WHERE id = $1`,
workspaceID, "workspace start failed"); dbErr != nil {
log.Printf("Provisioner: failed to mark workspace %s as failed: %v", workspaceID, dbErr)
}
h.broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_PROVISION_FAILED", workspaceID, map[string]interface{}{
"error": "workspace start failed",
})
log.Printf("Provisioner: workspace start failed for %s: %v", workspaceID, err)
h.markProvisionFailed(ctx, workspaceID, "workspace start failed", nil)
} else if url != "" {
// Pre-store the host-accessible URL (http://127.0.0.1:<port>) so the A2A proxy can reach the container.
// The registry's ON CONFLICT preserves URLs starting with http://127.0.0.1 when the agent self-registers.
@ -706,6 +576,13 @@ func applyRuntimeModelEnv(envVars map[string]string, runtime, model string) {
// loadWorkspaceSecrets loads global + workspace-specific secrets into a map.
// Returns nil map + error string on decrypt failure. Shared by both Docker
// and control plane provisioning paths to avoid duplication.
//
// F1086 / #1206: the returned error string is the SAFE-CANNED message that
// gets persisted to workspaces.last_sample_error AND broadcast as the
// WORKSPACE_PROVISION_FAILED payload. Internal detail (the secret key name,
// the encryption version, the decrypt-error text) is logged here, never
// returned to the caller, so it can't leak via the canvas event stream
// (cf. TestProvisionWorkspace_NoInternalErrorsInBroadcast).
func loadWorkspaceSecrets(ctx context.Context, workspaceID string) (map[string]string, string) {
envVars := map[string]string{}
globalRows, globalErr := db.DB.QueryContext(ctx,
@ -719,7 +596,8 @@ func loadWorkspaceSecrets(ctx context.Context, workspaceID string) (map[string]s
if globalRows.Scan(&k, &v, &ver) == nil {
decrypted, decErr := crypto.DecryptVersioned(v, ver)
if decErr != nil {
return nil, fmt.Sprintf("cannot decrypt global secret %s: %v", k, decErr)
log.Printf("Provisioner: FATAL — failed to decrypt global secret %s (version=%d): %v — aborting provision of workspace %s", k, ver, decErr, workspaceID)
return nil, "failed to decrypt global secret"
}
envVars[k] = string(decrypted)
}
@ -736,7 +614,8 @@ func loadWorkspaceSecrets(ctx context.Context, workspaceID string) (map[string]s
if wsRows.Scan(&k, &v, &ver) == nil {
decrypted, decErr := crypto.DecryptVersioned(v, ver)
if decErr != nil {
return nil, fmt.Sprintf("cannot decrypt workspace secret %s: %v", k, decErr)
log.Printf("Provisioner: FATAL — failed to decrypt workspace secret %s (version=%d) for %s: %v — aborting provision", k, ver, workspaceID, decErr)
return nil, "failed to decrypt workspace secret"
}
envVars[k] = string(decrypted)
}
@ -746,53 +625,43 @@ func loadWorkspaceSecrets(ctx context.Context, workspaceID string) (map[string]s
}
// provisionWorkspaceCP provisions a workspace via the control plane API.
//
// Mode-specific work this function owns: cpProv.Start (delegates EC2
// launch to control plane) + persist instance_id in DB. The shared
// setup (secrets, env mutators, mint of auth_token + inbound_secret)
// lives in prepareProvisionContext + mintWorkspaceSecrets and is
// called by both this function and the Docker-mode counterpart.
//
// Pre-2026-04-30: this function did NOT call mintWorkspaceSecrets.
// That left every prod workspace with a NULL platform_inbound_secret
// column → 503 on every chat upload (RFC #2312). The bug shipped
// because the Docker and SaaS provision paths had drifted: Docker
// got the mint when #2312 landed; SaaS was missed. Refactored to
// share so the next mint added can't be silently forgotten on one
// side.
func (h *WorkspaceHandler) provisionWorkspaceCP(workspaceID, templatePath string, configFiles map[string][]byte, payload models.CreateWorkspacePayload) {
ctx, cancel := context.WithTimeout(context.Background(), provisioner.ProvisionTimeout)
defer cancel()
envVars, decryptErr := loadWorkspaceSecrets(ctx, workspaceID)
if decryptErr != "" {
log.Printf("CPProvisioner: %s for %s", decryptErr, workspaceID)
db.DB.ExecContext(ctx, `UPDATE workspaces SET status = 'failed', last_sample_error = $2, updated_at = now() WHERE id = $1`,
workspaceID, decryptErr)
prepared, abort := h.prepareProvisionContext(ctx, workspaceID, templatePath, configFiles, payload, false)
if prepared == nil {
log.Printf("CPProvisioner: prepare failed for %s: %s", workspaceID, abort.Msg)
h.markProvisionFailed(ctx, workspaceID, abort.Msg, abort.Extra)
return
}
applyAgentGitIdentity(envVars, payload.Name)
applyRuntimeModelEnv(envVars, payload.Runtime, payload.Model)
// Propagate role for role-aware plugins (#1957). See provisionWorkspace
// above for rationale.
if payload.Role != "" {
envVars["MOLECULE_AGENT_ROLE"] = payload.Role
}
if err := h.envMutators.Run(ctx, workspaceID, envVars); err != nil {
log.Printf("CPProvisioner: env mutator failed for %s: %v", workspaceID, err)
// F1086 / #1206: env mutator errors (missing tokens, vault paths) must not
// leak into last_sample_error — use generic message.
db.DB.ExecContext(ctx, `UPDATE workspaces SET status = 'failed', last_sample_error = $2, updated_at = now() WHERE id = $1`,
workspaceID, "plugin env mutator chain failed")
return
}
// Mint the workspace's auth_token + platform_inbound_secret now,
// before cpProv.Start. Both modes write to the DB column; the
// workspace receives the plaintext via /registry/register response
// (registry.go:344-362) on its first heartbeat after boot.
h.mintWorkspaceSecrets(ctx, workspaceID, &prepared.Config)
cfg := provisioner.WorkspaceConfig{
WorkspaceID: workspaceID,
Tier: payload.Tier,
Runtime: payload.Runtime,
EnvVars: envVars,
PlatformURL: h.platformURL,
}
machineID, err := h.cpProv.Start(ctx, cfg)
machineID, err := h.cpProv.Start(ctx, prepared.Config)
if err != nil {
// F1086 / #1206: CP errors can include machine type, AMI IDs, VPC
// paths — use generic message for broadcast and last_sample_error.
errMsg := "workspace start failed"
log.Printf("CPProvisioner: %s for %s: %v", errMsg, workspaceID, err)
h.broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_PROVISION_FAILED", workspaceID, map[string]interface{}{
"error": "provisioning failed",
})
db.DB.ExecContext(ctx, `UPDATE workspaces SET status = 'failed', last_sample_error = $2, updated_at = now() WHERE id = $1`,
workspaceID, "provisioning failed")
log.Printf("CPProvisioner: workspace start failed for %s: %v", workspaceID, err)
h.markProvisionFailed(ctx, workspaceID, "provisioning failed", nil)
return
}
@ -809,13 +678,4 @@ func (h *WorkspaceHandler) provisionWorkspaceCP(workspaceID, templatePath string
}
log.Printf("CPProvisioner: workspace %s started as machine %s via control plane", workspaceID, machineID)
// Token issuance is deliberately deferred to the workspace's first
// /registry/register call. Minting here without also delivering the
// plaintext to the workspace (via user-data or a follow-up callback)
// would leave a live token in DB that the workspace has no copy of —
// RegistryHandler.requireWorkspaceToken would then 401 every
// /registry/register attempt because the workspace is no longer in the
// "no live tokens → bootstrap-allowed" state. The register handler
// already mints a token on first successful register and returns it in
// the response body for the workspace to persist.
}

View File

@ -0,0 +1,224 @@
package handlers
// workspace_provision_shared.go — mode-agnostic provision-time
// orchestration that BOTH provisionWorkspaceOpts (Docker) and
// provisionWorkspaceCP (SaaS) call. Extracted because the original
// per-mode functions had drifted: the SaaS path forgot to call
// issueAndInjectInboundSecret, which produced silent-503 chat upload
// errors for every prod workspace (RFC #2312 — discovered 2026-04-30).
//
// The drift class this module closes: when a new provision-time setup
// step is added, it goes in ONE place and both modes pick it up. New
// steps that legitimately differ per mode stay in the per-mode
// functions and are explicitly documented there.
//
// What's shared (this file):
// - Loading secrets (global + workspace) from Postgres
// - Applying git identity, runtime model env, role propagation
// - Running env mutators
// - Preflight: missing required env
// - Building provisioner.WorkspaceConfig
// - Minting auth_token + platform_inbound_secret (#2312)
//
// What's mode-specific (kept in the per-mode functions):
// - Docker: empty-config-volume preflight + auto-recover via
// runtime template (#1858), then provisioner.Start with local
// Docker daemon, then WriteFilesToContainer for config volume.
// - SaaS: cpProv.Start which delegates EC2 launch to control plane,
// then persist returned instance_id, then defer .auth_token /
// .platform_inbound_secret delivery to the workspace's first
// /registry/register response (registry.go:344-362).
//
// Architectural test (workspace_provision_shared_test.go) asserts
// every code path that takes a workspaceID and starts a provision
// MUST call mintWorkspaceSecrets — same shape as the
// audit-coverage AST gate from #335.
import (
"context"
"errors"
"log"
"path/filepath"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/models"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/provisioner"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/wsauth"
)
// readOrLazyHealInboundSecret reads the workspace's
// platform_inbound_secret. On ErrNoInboundSecret it mints inline
// (lazy-heal — RFC #2312 backfill for workspaces provisioned before
// the shared-mint refactor closed the gap).
//
// Returns:
// - (secret, false, nil) — secret was already present
// - (secret, true, nil) — secret was missing; just minted
// - ("", false, err) — read failed (non-NoInboundSecret) OR mint failed
//
// opLabel prefixes log lines so operators can tell which feature
// triggered the heal (e.g. "Registry", "chat_files Upload"). Centralized
// here so the next "what to do when the secret is missing" decision —
// rotation, audit, alerting — goes in ONE place. Same drift-prevention
// rationale as resolveWorkspaceForwardCreds and mintWorkspaceSecrets.
func readOrLazyHealInboundSecret(ctx context.Context, workspaceID, opLabel string) (secret string, healed bool, err error) {
s, readErr := wsauth.ReadPlatformInboundSecret(ctx, db.DB, workspaceID)
if readErr == nil {
return s, false, nil
}
if !errors.Is(readErr, wsauth.ErrNoInboundSecret) {
log.Printf("%s: read platform_inbound_secret failed for %s: %v", opLabel, workspaceID, readErr)
return "", false, readErr
}
minted, mintErr := wsauth.IssuePlatformInboundSecret(ctx, db.DB, workspaceID)
if mintErr != nil {
log.Printf("%s: lazy-heal mint of platform_inbound_secret failed for %s: %v", opLabel, workspaceID, mintErr)
return "", false, mintErr
}
log.Printf("%s: lazy-healed platform_inbound_secret for %s (#2312 backfill)", opLabel, workspaceID)
return minted, true, nil
}
// preparedProvisionContext carries the computed env + cfg through
// the per-mode provisioner-Start step. Returned from
// prepareProvisionContext when the caller proceeds; nil + non-empty
// abort message when the caller must mark the workspace failed.
type preparedProvisionContext struct {
EnvVars map[string]string
PluginsPath string
AwarenessNamespace string
Config provisioner.WorkspaceConfig
}
// provisionAbort describes why prepareProvisionContext refused to
// produce a context. Msg is the caller-safe summary that gets
// persisted to workspaces.last_sample_error AND broadcast. Extra
// is the structured payload to surface alongside (e.g. the missing
// env list for the missing-env failure class). Both fields are
// zero-valued on the success return.
type provisionAbort struct {
Msg string
Extra map[string]interface{}
}
// prepareProvisionContext does the mode-agnostic setup work that
// both Docker and SaaS provisioners need before their respective
// Start() call. Returns the prepared cfg or an abort message that
// the caller MUST broadcast as WORKSPACE_PROVISION_FAILED and
// persist as workspaces.last_sample_error.
//
// The function does NOT broadcast / DB-write the failure itself —
// failure surface is mode-aware (Docker logs differ from SaaS logs
// in observed format, and the failure-class taxonomy is shared
// between modes but the broadcast key may evolve per mode in the
// future).
func (h *WorkspaceHandler) prepareProvisionContext(
ctx context.Context,
workspaceID, templatePath string,
configFiles map[string][]byte,
payload models.CreateWorkspacePayload,
resetClaudeSession bool,
) (*preparedProvisionContext, *provisionAbort) {
envVars, decryptErr := loadWorkspaceSecrets(ctx, workspaceID)
if decryptErr != "" {
return nil, &provisionAbort{Msg: decryptErr}
}
pluginsPath, _ := filepath.Abs(filepath.Join(h.configsDir, "..", "plugins"))
awarenessNamespace := h.loadAwarenessNamespace(ctx, workspaceID)
// Per-agent git identity (#1957) — must run after secret loads so
// a workspace_secret named GIT_AUTHOR_NAME can override.
applyAgentGitIdentity(envVars, payload.Name)
applyRuntimeModelEnv(envVars, payload.Runtime, payload.Model)
if payload.Role != "" {
envVars["MOLECULE_AGENT_ROLE"] = payload.Role
}
// PARENT_ID is consumed by workspace/coordinator.py to track the
// parent-child relationship at runtime. Sourced from payload so
// every provision path that knows about a parent (currently:
// TeamHandler.Expand) injects it without having to thread env
// through provisioner.WorkspaceConfig manually.
if payload.ParentID != nil && *payload.ParentID != "" {
envVars["PARENT_ID"] = *payload.ParentID
}
// Plugin extension point: env mutators run AFTER built-in identity
// injection so plugins can override or augment identity vars.
if err := h.envMutators.Run(ctx, workspaceID, envVars); err != nil {
return nil, &provisionAbort{Msg: "plugin env mutator chain failed"}
}
// Preflight #5: refuse to launch when config.yaml declares required
// env vars that are not set. Skipped in SaaS mode when configFiles
// is nil (CP-mode's cfg is built without local config bytes — the
// CP-side launches the workspace with envVars but doesn't inspect
// the config.yaml the same way Docker does). Future: lift the
// preflight to a pure-data check that doesn't need configFiles
// bytes, so SaaS mode can run it too.
if configFiles != nil {
if missing := missingRequiredEnv(configFiles, envVars); len(missing) > 0 {
msg := formatMissingEnvError(missing)
return nil, &provisionAbort{
Msg: msg,
Extra: map[string]interface{}{"error": msg, "missing": missing},
}
}
}
cfg := h.buildProvisionerConfig(workspaceID, templatePath, configFiles, payload, envVars, pluginsPath, awarenessNamespace)
cfg.ResetClaudeSession = resetClaudeSession
return &preparedProvisionContext{
EnvVars: envVars,
PluginsPath: pluginsPath,
AwarenessNamespace: awarenessNamespace,
Config: cfg,
}, nil
}
// mintWorkspaceSecrets issues + persists the workspace auth token
// AND the platform→workspace inbound secret (#2312). Both modes MUST
// call this — Docker mints + writes to local config volume; SaaS
// mints and persists only to the DB column (the workspace fetches
// via /registry/register response).
//
// Pre-2026-04-30: the SaaS path (provisionWorkspaceCP) was missing
// the inbound-secret mint, leaving every prod workspace with a NULL
// platform_inbound_secret column and 503-ing every chat upload. This
// extracted helper is the structural fix — both paths share one
// function so adding a new mint can't be silently forgotten on one
// side.
//
// Failure model: best-effort. Each underlying issue function logs +
// returns silently on its own failure (token mint failure → workspace
// 401s on first /internal call; secret mint failure → first chat
// upload 503s with the same message we used to surface for old
// workspaces). The shared helper does NOT abort the provision because
// the workspace can still come up and serve non-internal traffic; the
// 401/503 surfaces the missing secret loudly when the user actually
// tries to use the affected feature.
func (h *WorkspaceHandler) mintWorkspaceSecrets(ctx context.Context, workspaceID string, cfg *provisioner.WorkspaceConfig) {
h.issueAndInjectToken(ctx, workspaceID, cfg)
h.issueAndInjectInboundSecret(ctx, workspaceID, cfg)
}
// markProvisionFailed is the standard "abort with message" path used
// by both provision modes. Wraps the broadcast + DB update in one
// call so the failure shape stays consistent across modes.
func (h *WorkspaceHandler) markProvisionFailed(ctx context.Context, workspaceID, msg string, extra map[string]interface{}) {
if extra == nil {
extra = map[string]interface{}{"error": msg}
} else if _, hasErr := extra["error"]; !hasErr {
extra["error"] = msg
}
h.broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_PROVISION_FAILED", workspaceID, extra)
if _, dbErr := db.DB.ExecContext(ctx,
`UPDATE workspaces SET status = 'failed', last_sample_error = $2, updated_at = now() WHERE id = $1`,
workspaceID, msg); dbErr != nil {
// Non-fatal: the broadcast already fired, the operator sees the
// failure event in the canvas. The DB row stays at whatever
// status it had — provisioning event log is the source of truth.
log.Printf("markProvisionFailed: db update failed for %s: %v", workspaceID, dbErr)
}
}
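// Usage sketch (illustrative only, not part of this change): a hypothetical
// provision path is expected to wire the helpers above as follows: prepare,
// surface any abort via markProvisionFailed, mint secrets, then hand the
// prepared config to whichever provisioner is wired. The SaaS branch below
// uses the CPProvisionerAPI Start signature shown in this PR; the Docker
// branch is elided because its Start signature is not shown in this diff.
//
//	func (h *WorkspaceHandler) provisionSketch(workspaceID, templatePath string,
//		configFiles map[string][]byte, payload models.CreateWorkspacePayload) {
//		ctx := context.Background()
//		prepared, abort := h.prepareProvisionContext(ctx, workspaceID, templatePath, configFiles, payload, false)
//		if abort != nil {
//			// The caller owns the failure surface: broadcast + persist.
//			h.markProvisionFailed(ctx, workspaceID, abort.Msg, abort.Extra)
//			return
//		}
//		// Every provision path MUST mint; the AST gate in the test file enforces this.
//		h.mintWorkspaceSecrets(ctx, workspaceID, &prepared.Config)
//		if h.cpProv != nil {
//			if _, err := h.cpProv.Start(ctx, prepared.Config); err != nil {
//				h.markProvisionFailed(ctx, workspaceID, "workspace failed to start", nil)
//			}
//			return
//		}
//		// Docker mode: delegate to the local provisioner's Start here.
//	}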

View File

@ -0,0 +1,395 @@
package handlers
// workspace_provision_shared_test.go — architectural test that pins
// the invariant the shared-prepare refactor relies on: every code
// path that provisions a workspace MUST call mintWorkspaceSecrets.
//
// Closes the drift class that produced the 2026-04-30 RFC #2312
// silent-503 bug. Pre-fix: provisionWorkspaceCP forgot to mint
// platform_inbound_secret because the SaaS path was implemented
// after the Docker path and the original mint call wasn't carried
// forward. Both modes now share mintWorkspaceSecrets via this
// extracted helper; this test ensures it stays that way.
//
// Same shape as the audit-coverage gate from #335 (#2343 PR-5).
// If this test fails: either add mintWorkspaceSecrets to the new
// provision function, OR (if the function legitimately should NOT
// mint) add it to provisionExemptFunctions with a one-line
// justification.
import (
"context"
"database/sql"
"go/ast"
"go/parser"
"go/token"
"os"
"path/filepath"
"strings"
"testing"
"github.com/DATA-DOG/go-sqlmock"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/models"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/provisioner"
)
// provisionExemptFunctions are functions that call a provision-start
// method but legitimately do NOT need to mint (e.g. the wrapper
// `provisionWorkspace` which delegates — the delegate mints; the
// re-spawn loops inside Restart that re-enter provisionWorkspaceOpts).
// Add an entry only with a one-line justification.
var provisionExemptFunctions = map[string]string{
"provisionWorkspace": "thin wrapper that delegates to provisionWorkspaceOpts; the delegate mints",
}
// TestProvisionFunctions_AllCallMintWorkspaceSecrets asserts every
// function in this package that triggers a workspace provision (i.e.
// calls h.provisioner.Start or h.cpProv.Start) ALSO calls
// mintWorkspaceSecrets at least once in the same body.
//
// Behavior-based — drift-resistant. A future provision function with
// any name still trips this gate as long as it calls one of the
// provisioner Start methods. This replaces an earlier name-list
// version (PR #2366) that missed TeamHandler.Expand (issue #2367) —
// the bug that motivated the upgrade.
//
// Same shape as the audit-coverage gate from #335 (#2343 PR-5).
//
// If this test fails: either add mintWorkspaceSecrets to the
// offending function (preferred — usually you should delegate to
// provisionWorkspace via h.wh), OR add it to provisionExemptFunctions
// with a one-line justification.
func TestProvisionFunctions_AllCallMintWorkspaceSecrets(t *testing.T) {
t.Parallel()
fset := token.NewFileSet()
entries, err := os.ReadDir(".")
if err != nil {
t.Fatalf("read dir: %v", err)
}
type missing struct {
file string
line int
fn string
}
var violations []missing
for _, entry := range entries {
name := entry.Name()
if entry.IsDir() || !strings.HasSuffix(name, ".go") || strings.HasSuffix(name, "_test.go") {
continue
}
f, err := parser.ParseFile(fset, filepath.Join(".", name), nil, 0)
if err != nil {
t.Fatalf("parse %s: %v", name, err)
}
for _, decl := range f.Decls {
fn, ok := decl.(*ast.FuncDecl)
if !ok || fn.Body == nil {
continue
}
if _, exempt := provisionExemptFunctions[fn.Name.Name]; exempt {
continue
}
if !callsProvisionStart(fn.Body) {
continue
}
if !callsMintWorkspaceSecrets(fn.Body) {
violations = append(violations, missing{
file: name,
line: fset.Position(fn.Pos()).Line,
fn: fn.Name.Name,
})
}
}
}
for _, v := range violations {
t.Errorf(
"%s:%d %s calls a provisioner Start (h.provisioner.Start or h.cpProv.Start) but does not call mintWorkspaceSecrets — every provision path MUST mint auth_token + platform_inbound_secret. Prefer delegating to h.wh.provisionWorkspace; only add %q to provisionExemptFunctions with a one-line justification if mint is genuinely inappropriate.",
v.file, v.line, v.fn, v.fn,
)
}
}
// callsProvisionStart reports whether the function body invokes a
// provisioner-start method. Matches `<x>.provisioner.Start(...)` and
// `<x>.cpProv.Start(...)` — both look like
// `<recv>.<provField>.Start(...)` in the AST. Filtering on the
// provisioner-field name (`provisioner` or `cpProv`) keeps the gate
// from tripping on unrelated `.Start()` calls (e.g. http.Server.Start
// in the same package).
func callsProvisionStart(body *ast.BlockStmt) bool {
found := false
ast.Inspect(body, func(n ast.Node) bool {
if found {
return false
}
call, ok := n.(*ast.CallExpr)
if !ok {
return true
}
sel, ok := call.Fun.(*ast.SelectorExpr)
if !ok {
return true
}
if sel.Sel.Name != "Start" {
return true
}
inner, ok := sel.X.(*ast.SelectorExpr)
if !ok {
return true
}
switch inner.Sel.Name {
case "provisioner", "cpProv":
found = true
return false
}
return true
})
return found
}
// callsMintWorkspaceSecrets walks the function body and reports
// whether mintWorkspaceSecrets is called anywhere in it. The check is
// shallow: only calls that appear directly in this body count; a mint
// buried inside a helper the body invokes is not followed. The
// shared-prepare refactor centralizes mint in mintWorkspaceSecrets
// itself, so a direct call at the top level is the expected pattern.
func callsMintWorkspaceSecrets(body *ast.BlockStmt) bool {
found := false
ast.Inspect(body, func(n ast.Node) bool {
if found {
return false
}
call, ok := n.(*ast.CallExpr)
if !ok {
return true
}
sel, ok := call.Fun.(*ast.SelectorExpr)
if !ok {
return true
}
if sel.Sel.Name == "mintWorkspaceSecrets" {
found = true
return false
}
return true
})
return found
}
// TestMintWorkspaceSecrets_PersistsInboundSecretInSaaSMode is the
// behavioral counterpart to the AST gate. It pins the structural
// fix for the 2026-04-30 silent-503 chat upload bug (RFC #2312):
// even in SaaS mode (where Docker file injection is skipped),
// mintWorkspaceSecrets MUST persist platform_inbound_secret to the
// workspaces row so platform-side handlers can read it back.
//
// Pre-fix: provisionWorkspaceCP never called the mint helper, so
// every prod workspace had NULL platform_inbound_secret →
// chat_files Upload returned 503 with "workspace not yet enrolled
// in v2 upload" on every attempt.
func TestMintWorkspaceSecrets_PersistsInboundSecretInSaaSMode(t *testing.T) {
t.Setenv("MOLECULE_DEPLOY_MODE", "saas")
mock := setupTestDB(t)
// First underlying call: revoke any existing live tokens. SaaS
// mode early-returns from issueAndInjectToken right after this,
// so IssueToken is NOT expected.
mock.ExpectExec(`UPDATE workspace_auth_tokens SET revoked_at = now\(\) WHERE workspace_id = \$1 AND revoked_at IS NULL`).
WithArgs("ws-saas-mint").
WillReturnResult(sqlmock.NewResult(0, 0))
// Second underlying call: persist the platform_inbound_secret.
// The structural fix — without this UPDATE, the bug recurs.
mock.ExpectExec(`UPDATE workspaces SET platform_inbound_secret = \$1 WHERE id = \$2`).
WithArgs(sqlmock.AnyArg(), "ws-saas-mint").
WillReturnResult(sqlmock.NewResult(0, 1))
handler := NewWorkspaceHandler(&captureBroadcaster{}, nil, "http://localhost:8080", t.TempDir())
cfg := provisioner.WorkspaceConfig{WorkspaceID: "ws-saas-mint"}
handler.mintWorkspaceSecrets(context.Background(), "ws-saas-mint", &cfg)
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("sqlmock expectations not met — mintWorkspaceSecrets did not persist platform_inbound_secret in SaaS mode (this is the prod bug recurrence): %v", err)
}
// Sanity: SaaS mode must NOT have written .platform_inbound_secret
// into cfg.ConfigFiles — there's no Docker volume to deliver it.
if _, present := cfg.ConfigFiles[".platform_inbound_secret"]; present {
t.Errorf("SaaS mode should not inject .platform_inbound_secret into cfg.ConfigFiles (no Docker volume) — got entry")
}
if _, present := cfg.ConfigFiles[".auth_token"]; present {
t.Errorf("SaaS mode should not inject .auth_token into cfg.ConfigFiles (no Docker volume) — got entry")
}
}
// TestPrepareProvisionContext_ParentIDInjection pins the PARENT_ID env
// contract added in #2367: when payload.ParentID is set (currently only
// TeamHandler.Expand populates it), prepareProvisionContext MUST
// surface it as envVars["PARENT_ID"] so workspace/coordinator.py can
// read it on startup. Pre-fix #2367 the env was set inline in
// TeamHandler.Expand on cfg.EnvVars; the refactor moved it into the
// shared prepare so any future provision path with a parent_id
// inherits it automatically.
func TestPrepareProvisionContext_ParentIDInjection(t *testing.T) {
cases := []struct {
name string
parentID *string
expectKey bool
expectVal string
}{
{
name: "parentID nil → no PARENT_ID env",
parentID: nil,
expectKey: false,
},
{
name: "parentID empty string → no PARENT_ID env",
parentID: ptrStr(""),
expectKey: false,
},
{
name: "parentID set → PARENT_ID env populated",
parentID: ptrStr("ws-parent-123"),
expectKey: true,
expectVal: "ws-parent-123",
},
}
for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
mock := setupTestDB(t)
// loadWorkspaceSecrets queries: empty rows + empty rows = clean prep.
mock.ExpectQuery(`SELECT key, encrypted_value, encryption_version FROM global_secrets`).
WillReturnRows(sqlmock.NewRows([]string{"key", "encrypted_value", "encryption_version"}))
mock.ExpectQuery(`SELECT key, encrypted_value, encryption_version FROM workspace_secrets`).
WithArgs("ws-child").
WillReturnRows(sqlmock.NewRows([]string{"key", "encrypted_value", "encryption_version"}))
handler := NewWorkspaceHandler(&captureBroadcaster{}, nil, "http://localhost:8080", t.TempDir())
payload := models.CreateWorkspacePayload{
Name: "child",
Tier: 1,
ParentID: tc.parentID,
}
prepared, abort := handler.prepareProvisionContext(context.Background(), "ws-child", "/nonexistent", nil, payload, false)
if abort != nil {
t.Fatalf("unexpected abort: %s", abort.Msg)
}
val, present := prepared.EnvVars["PARENT_ID"]
if present != tc.expectKey {
t.Errorf("PARENT_ID present=%v, want %v (env=%v)", present, tc.expectKey, prepared.EnvVars)
}
if tc.expectKey && val != tc.expectVal {
t.Errorf("PARENT_ID=%q, want %q", val, tc.expectVal)
}
})
}
}
func ptrStr(s string) *string { return &s }
// TestReadOrLazyHealInboundSecret pins the four branches of the
// shared lazy-heal helper directly. Each call site (chat_files,
// registry) has its own integration test, but those go through the
// public handlers and conflate the helper's behavior with the
// caller's response shape. This direct test pins the (secret, healed,
// err) contract on its own so a future refactor that breaks the
// helper signal — e.g., returning healed=true on a read-success path,
// or swallowing a mint error — fails immediately.
//
// The four branches:
//
// 1. Secret already present → (s, false, nil)
// 2. Secret missing, mint succeeds → (minted, true, nil)
// 3. Secret missing, mint fails → ("", false, mint-err)
// 4. Read fails (non-NoInboundSecret) → ("", false, read-err)
func TestReadOrLazyHealInboundSecret(t *testing.T) {
t.Run("secret already present → no heal, no error", func(t *testing.T) {
mock := setupTestDB(t)
mock.ExpectQuery(`SELECT platform_inbound_secret FROM workspaces WHERE id = \$1`).
WithArgs("ws-1").
WillReturnRows(sqlmock.NewRows([]string{"platform_inbound_secret"}).AddRow("present-secret"))
secret, healed, err := readOrLazyHealInboundSecret(context.Background(), "ws-1", "TestOp")
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if secret != "present-secret" {
t.Errorf("secret: got %q, want %q", secret, "present-secret")
}
if healed {
t.Errorf("healed should be false when secret was already present")
}
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("unexpected sqlmock state — read happened but mint should NOT have: %v", err)
}
})
t.Run("secret missing → mint succeeds → returns healed=true", func(t *testing.T) {
mock := setupTestDB(t)
mock.ExpectQuery(`SELECT platform_inbound_secret FROM workspaces WHERE id = \$1`).
WithArgs("ws-2").
WillReturnRows(sqlmock.NewRows([]string{"platform_inbound_secret"}).AddRow(nil))
mock.ExpectExec(`UPDATE workspaces SET platform_inbound_secret = \$1 WHERE id = \$2`).
WithArgs(sqlmock.AnyArg(), "ws-2").
WillReturnResult(sqlmock.NewResult(0, 1))
secret, healed, err := readOrLazyHealInboundSecret(context.Background(), "ws-2", "TestOp")
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if secret == "" {
t.Error("expected a freshly-minted secret string, got empty")
}
if !healed {
t.Error("healed should be true after lazy-heal mint succeeded")
}
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("sqlmock expectations not met — mint did NOT run: %v", err)
}
})
t.Run("secret missing → mint fails → returns err and not healed", func(t *testing.T) {
mock := setupTestDB(t)
mock.ExpectQuery(`SELECT platform_inbound_secret FROM workspaces WHERE id = \$1`).
WithArgs("ws-3").
WillReturnRows(sqlmock.NewRows([]string{"platform_inbound_secret"}).AddRow(nil))
mock.ExpectExec(`UPDATE workspaces SET platform_inbound_secret = \$1 WHERE id = \$2`).
WithArgs(sqlmock.AnyArg(), "ws-3").
WillReturnError(sql.ErrConnDone)
secret, healed, err := readOrLazyHealInboundSecret(context.Background(), "ws-3", "TestOp")
if err == nil {
t.Fatal("expected mint error, got nil")
}
if secret != "" {
t.Errorf("expected empty secret on mint failure, got %q", secret)
}
if healed {
t.Error("healed must be false when mint failed")
}
})
t.Run("read fails (non-NoInboundSecret) → returns err and not healed", func(t *testing.T) {
mock := setupTestDB(t)
mock.ExpectQuery(`SELECT platform_inbound_secret FROM workspaces WHERE id = \$1`).
WithArgs("ws-4").
WillReturnError(sql.ErrConnDone)
secret, healed, err := readOrLazyHealInboundSecret(context.Background(), "ws-4", "TestOp")
if err == nil {
t.Fatal("expected read error, got nil")
}
if secret != "" {
t.Errorf("expected empty secret on read failure, got %q", secret)
}
if healed {
t.Error("healed must be false when read failed")
}
})
}

View File

@ -1137,10 +1137,14 @@ func TestProvisionWorkspace_NoInternalErrorsInBroadcast(t *testing.T) {
WillReturnRows(sqlmock.NewRows([]string{"key", "encrypted_value", "encryption_version"}).
AddRow("FAKE_KEY", []byte("any-bytes"), 99))
// On decrypt failure provisionWorkspace also marks the workspace as
// failed via UPDATE workspaces. Match-anything on the args so the
// test isn't coupled to the exact UPDATE column order.
// failed via UPDATE workspaces. Two args: workspace_id + the
// last_sample_error message ("failed to decrypt global secret").
// Pre-refactor (workspace_provision_shared.go) the decrypt-fail
// path skipped last_sample_error; the shared helper now always
// persists it so users see the failure in the UI without having
// to grep server logs.
mock.ExpectExec(`UPDATE workspaces SET status = 'failed'`).
WithArgs(sqlmock.AnyArg()).
WithArgs(sqlmock.AnyArg(), sqlmock.AnyArg()).
WillReturnResult(sqlmock.NewResult(0, 1))
cap := &captureBroadcaster{}
@ -1206,6 +1210,10 @@ func (s *stubFailingCPProv) GetConsoleOutput(_ context.Context, _ string) (strin
panic("stubFailingCPProv.GetConsoleOutput not expected on the provisionWorkspaceCP failure path")
}
func (s *stubFailingCPProv) IsRunning(_ context.Context, _ string) (bool, error) {
panic("stubFailingCPProv.IsRunning not expected on the provisionWorkspaceCP failure path")
}
// TestProvisionWorkspaceCP_NoInternalErrorsInBroadcast asserts that
// provisionWorkspaceCP never leaks err.Error() in
// WORKSPACE_PROVISION_FAILED broadcasts. Regression test for #1206.

View File

@ -326,7 +326,12 @@ func (h *WorkspaceHandler) HibernateWorkspace(ctx context.Context, workspaceID s
// in-flight runner picks up the pending request after its current cycle
// completes, so writes that committed mid-restart are guaranteed to land.
func (h *WorkspaceHandler) RestartByID(workspaceID string) {
if h.provisioner == nil {
// At least one of the two provisioners must be wired. Pre-fix this
// short-circuited on h.provisioner==nil alone, which silently disabled
// reactive auto-restart on every SaaS tenant (where the local Docker
// provisioner is intentionally nil). The runRestartCycle below now
// branches on which one is set for the Stop call.
if h.provisioner == nil && h.cpProv == nil {
return
}
coalesceRestart(workspaceID, func() { h.runRestartCycle(workspaceID) })
@ -391,6 +396,25 @@ func coalesceRestart(workspaceID string, cycle func()) {
}
}
// stopForRestart dispatches Stop to whichever provisioner is wired (Docker or
// CP/EC2 — mutually exclusive in production). Docker provisioner.Stop kills
// the local container; CP provisioner.Stop calls DELETE /cp/workspaces/:id
// which terminates the EC2 instance. Pre-fix runRestartCycle only called the
// Docker path, so on SaaS (h.provisioner=nil) the auto-restart cycle silently
// NPE'd before reaching the reprovision step — which is why every SaaS dead-
// agent incident pre-this-fix required manual restart from canvas.
func (h *WorkspaceHandler) stopForRestart(ctx context.Context, workspaceID string) {
if h.provisioner != nil {
h.provisioner.Stop(ctx, workspaceID)
return
}
if h.cpProv != nil {
if err := h.cpProv.Stop(ctx, workspaceID); err != nil {
log.Printf("Auto-restart: cpProv.Stop(%s) failed: %v (continuing to reprovision)", workspaceID, err)
}
}
}
// runRestartCycle does the actual stop+provision work for one restart
// iteration. Synchronous (waits for provisionWorkspace to complete) so the
// outer pending-flag loop in RestartByID can correctly coalesce — if this
@ -426,7 +450,7 @@ func (h *WorkspaceHandler) runRestartCycle(workspaceID string) {
log.Printf("Auto-restart: restarting %s (%s) runtime=%q (was: %s)", wsName, workspaceID, dbRuntime, status)
h.provisioner.Stop(ctx, workspaceID)
h.stopForRestart(ctx, workspaceID)
db.DB.ExecContext(ctx,
`UPDATE workspaces SET status = 'provisioning', url = '', updated_at = now() WHERE id = $1`, workspaceID)
@ -445,7 +469,21 @@ func (h *WorkspaceHandler) runRestartCycle(workspaceID string) {
// SYNCHRONOUS provisionWorkspace: returns when the new container is up
// (or has failed). The outer loop relies on this to know when it's safe
// to start another restart cycle without racing this one's Stop call.
h.provisionWorkspace(workspaceID, "", nil, payload)
//
// Branch on which provisioner is wired — same dispatch as the other call
// sites in this package (workspace.go:431-433, workspace_restart.go:197+596).
// Pre-fix this only called the Docker variant, so on SaaS the auto-restart
// cycle would hit a nil-pointer panic inside provisionWorkspace's
// `h.provisioner.VolumeHasFile` call, get swallowed by coalesceRestart's
// recover()-without-re-raise (a
// platform-stability safeguard), and leave the workspace permanently
// stuck in status='provisioning' (the UPDATE above already ran). User-
// observable result before this fix on SaaS: dead workspace → manual
// canvas restart was the only recovery path.
if h.cpProv != nil {
h.provisionWorkspaceCP(workspaceID, "", nil, payload)
} else {
h.provisionWorkspace(workspaceID, "", nil, payload)
}
// sendRestartContext is a one-way notification to the new container; safe
// to fire async — the next restart cycle won't depend on it completing.
go h.sendRestartContext(workspaceID, restartData)

View File

@ -39,7 +39,7 @@ func TestCanvasOrBearer_ValidBearer_Passes(t *testing.T) {
WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(1))
mock.ExpectQuery(validateAnyTokenSelectQuery).
WithArgs(hash[:]).
WillReturnRows(sqlmock.NewRows([]string{"id"}).AddRow("tok-1"))
WillReturnRows(sqlmock.NewRows([]string{"id", "workspace_id"}).AddRow("tok-1", "ws-x"))
mock.ExpectExec(validateTokenUpdateQuery).
WithArgs("tok-1").
WillReturnResult(sqlmock.NewResult(0, 1))

View File

@ -30,7 +30,9 @@ const validateTokenSelectQuery = "SELECT t\\.id, t\\.workspace_id.*FROM workspac
// validateAnyTokenQuery is matched for ValidateAnyToken (SELECT).
// The JOIN on workspaces filters removed-workspace tokens (#682 defense-in-depth).
const validateAnyTokenSelectQuery = "SELECT t\\.id.*FROM workspace_auth_tokens t.*JOIN workspaces"
// Identical to validateTokenSelectQuery because both go through the
// shared lookupTokenByHash helper (projects (id, workspace_id)).
const validateAnyTokenSelectQuery = "SELECT t\\.id, t\\.workspace_id.*FROM workspace_auth_tokens t.*JOIN workspaces"
// validateTokenUpdateQuery is matched for the best-effort last_used_at UPDATE.
const validateTokenUpdateQuery = "UPDATE workspace_auth_tokens SET last_used_at"
@ -399,7 +401,7 @@ func TestAdminAuth_ValidBearer_Passes(t *testing.T) {
// ValidateAnyToken SELECT — token matches a live row.
mock.ExpectQuery(validateAnyTokenSelectQuery).
WithArgs(tokenHash[:]).
WillReturnRows(sqlmock.NewRows([]string{"id"}).AddRow("tok-admin-1"))
WillReturnRows(sqlmock.NewRows([]string{"id", "workspace_id"}).AddRow("tok-admin-1", "ws-admin"))
// Best-effort last_used_at UPDATE.
mock.ExpectExec(validateTokenUpdateQuery).
@ -1276,7 +1278,7 @@ func TestAdminAuth_623_ValidBearer_WithOrigin_Passes(t *testing.T) {
WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(1))
mock.ExpectQuery(validateAnyTokenSelectQuery).
WithArgs(tokenHash[:]).
WillReturnRows(sqlmock.NewRows([]string{"id"}).AddRow("tok-1"))
WillReturnRows(sqlmock.NewRows([]string{"id", "workspace_id"}).AddRow("tok-1", "ws-x"))
mock.ExpectExec(validateTokenUpdateQuery).
WithArgs("tok-1").
WillReturnResult(sqlmock.NewResult(0, 1))
@ -1472,7 +1474,7 @@ func TestAdminAuth_684_AdminTokenNotSet_FallsBackToWorkspaceToken(t *testing.T)
mock.ExpectQuery(validateAnyTokenSelectQuery).
WithArgs(tokenHash[:]).
WillReturnRows(sqlmock.NewRows([]string{"id"}).AddRow("tok-ws-1"))
WillReturnRows(sqlmock.NewRows([]string{"id", "workspace_id"}).AddRow("tok-ws-1", "ws-x"))
mock.ExpectExec(validateTokenUpdateQuery).
WithArgs("tok-ws-1").

View File

@ -29,6 +29,16 @@ type CPProvisionerAPI interface {
Start(ctx context.Context, cfg WorkspaceConfig) (string, error)
Stop(ctx context.Context, workspaceID string) error
GetConsoleOutput(ctx context.Context, workspaceID string) (string, error)
// IsRunning reports whether the workspace's compute (EC2 instance) is
// currently in the running state. Surfaced on the interface (rather than
// only on *CPProvisioner) so the a2a-proxy reactive-health path can
// detect dead EC2 agents the same way it detects dead Docker containers.
// Pre-#NNN, maybeMarkContainerDead only consulted the local Docker
// provisioner — for SaaS tenants (h.provisioner=nil) the check was a
// no-op, so a dead EC2 agent would leak 502/503 to canvas with no
// auto-recovery. (true, err) on transport errors keeps callers on the
// alive path; (false, nil) is the only definitive "dead" signal.
IsRunning(ctx context.Context, workspaceID string) (bool, error)
}
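// Caller sketch (illustrative only, not part of this interface change): the
// reactive-health path described above is expected to treat only (false, nil)
// as a definitive "dead" signal; transport errors keep the workspace on the
// alive path so a flaky CP API call never triggers a spurious restart.
//
//	running, err := cpProv.IsRunning(ctx, workspaceID)
//	if err != nil {
//		// Transport / CP-API hiccup: assume alive, try again next cycle.
//		return
//	}
//	if !running {
//		// Definitive dead signal: hypothetical restart hook, named for illustration.
//		markContainerDeadAndRestart(ctx, workspaceID)
//	}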
// Compile-time assertion: *CPProvisioner satisfies CPProvisionerAPI.

View File

@ -213,7 +213,7 @@ func Setup(hub *ws.Hub, broadcaster *events.Broadcaster, prov *provisioner.Provi
r.GET("/approvals/pending", middleware.AdminAuth(db.DB), apph.ListAll)
// Team Expansion
teamh := handlers.NewTeamHandler(broadcaster, prov, platformURL, configsDir)
teamh := handlers.NewTeamHandler(broadcaster, prov, wh, platformURL, configsDir)
wsAuth.POST("/expand", teamh.Expand)
wsAuth.POST("/collapse", teamh.Collapse)

View File

@ -65,6 +65,41 @@ func IssueToken(ctx context.Context, db *sql.DB, workspaceID string) (string, er
return plaintext, nil
}
// lookupTokenByHash is the single source of truth for "find a live
// workspace token by its sha256 hash, scoped to a non-removed workspace"
// — the auth predicate every public token-validating function needs.
//
// Returns ErrInvalidToken on any miss (no row, removed workspace, DB
// error). All three failure modes collapse to the same public error so
// callers can't accidentally distinguish "bad token" vs. "wrong
// workspace" vs. "DB hiccup" — that distinction is a side-channel
// callers must not expose.
//
// Defense-in-depth (#682, #696, #697): the JOIN on workspaces filters
// tokens belonging to removed workspaces. Future safety changes (e.g.
// "also exclude paused workspaces from auth") go in ONE place; without
// this helper, the same WHERE/JOIN was duplicated across ValidateToken,
// WorkspaceFromToken, and ValidateAnyToken — same drift class as the
// 2026-04-30 SaaS provision-mint bug fixed in #2366.
//
// SELECT projects both columns even when only one is needed by the
// caller. The trivial perf cost is worth the single-source-of-truth
// guarantee for the auth predicate.
func lookupTokenByHash(ctx context.Context, db *sql.DB, hash []byte) (tokenID, workspaceID string, err error) {
err = db.QueryRowContext(ctx, `
SELECT t.id, t.workspace_id
FROM workspace_auth_tokens t
JOIN workspaces w ON w.id = t.workspace_id
WHERE t.token_hash = $1
AND t.revoked_at IS NULL
AND w.status != 'removed'
`, hash).Scan(&tokenID, &workspaceID)
if err != nil {
return "", "", ErrInvalidToken
}
return tokenID, workspaceID, nil
}
// ValidateToken confirms the presented plaintext matches a live row whose
// workspace_id equals expectedWorkspaceID. On success it refreshes
// last_used_at (best-effort — failure to update is logged by the caller,
@ -73,31 +108,15 @@ func IssueToken(ctx context.Context, db *sql.DB, workspaceID string) (string, er
// The expectedWorkspaceID binding is required because a token is only
// valid for the workspace it was issued to. A compromised token from
// workspace A must never authenticate workspace B.
//
// Defense-in-depth (#697): the JOIN on workspaces filters out tokens that
// belong to removed workspaces so that a deleted workspace's tokens cannot
// be replayed against its former sub-routes even before the token row is
// explicitly revoked. Mirrors the same guard added to ValidateAnyToken (#696).
func ValidateToken(ctx context.Context, db *sql.DB, expectedWorkspaceID, plaintext string) error {
if plaintext == "" || expectedWorkspaceID == "" {
return ErrInvalidToken
}
hash := sha256.Sum256([]byte(plaintext))
var tokenID, workspaceID string
err := db.QueryRowContext(ctx, `
SELECT t.id, t.workspace_id
FROM workspace_auth_tokens t
JOIN workspaces w ON w.id = t.workspace_id
WHERE t.token_hash = $1
AND t.revoked_at IS NULL
AND w.status != 'removed'
`, hash[:]).Scan(&tokenID, &workspaceID)
tokenID, workspaceID, err := lookupTokenByHash(ctx, db, hash[:])
if err != nil {
// Includes sql.ErrNoRows — collapse to a single public-facing error
// so the handler can't accidentally leak which half of the check
// failed (bad token vs. wrong workspace).
return ErrInvalidToken
return err
}
if workspaceID != expectedWorkspaceID {
return ErrInvalidToken
@ -121,10 +140,6 @@ func ValidateToken(ctx context.Context, db *sql.DB, expectedWorkspaceID, plainte
// error so handlers can't accidentally distinguish "no token" vs "wrong
// workspace" — both should result in the same caller-facing response.
//
// Defense-in-depth (mirrors ValidateToken / ValidateAnyToken): the JOIN on
// workspaces filters out tokens that belong to removed workspaces so a
// deleted workspace's tokens cannot derive a callerID for activity logging.
//
// Does NOT update last_used_at — the calling handler chain typically also
// runs the bearer through ValidateToken or ValidateAnyToken, which already
// performs that update.
@ -134,17 +149,9 @@ func WorkspaceFromToken(ctx context.Context, db *sql.DB, plaintext string) (stri
}
hash := sha256.Sum256([]byte(plaintext))
var workspaceID string
err := db.QueryRowContext(ctx, `
SELECT t.workspace_id
FROM workspace_auth_tokens t
JOIN workspaces w ON w.id = t.workspace_id
WHERE t.token_hash = $1
AND t.revoked_at IS NULL
AND w.status != 'removed'
`, hash[:]).Scan(&workspaceID)
_, workspaceID, err := lookupTokenByHash(ctx, db, hash[:])
if err != nil {
return "", ErrInvalidToken
return "", err
}
return workspaceID, nil
}
@ -231,27 +238,15 @@ func HasAnyLiveTokenGlobal(ctx context.Context, db *sql.DB) (bool, error) {
// token (not scoped to a specific workspace). Used for admin/global routes
// where workspace-scoped auth is not applicable — any authenticated agent may
// access platform-wide settings.
//
// Defense-in-depth (#682): the JOIN on workspaces filters out tokens that
// belong to removed workspaces so that a deleted workspace's tokens cannot
// be replayed against admin endpoints.
func ValidateAnyToken(ctx context.Context, db *sql.DB, plaintext string) error {
if plaintext == "" {
return ErrInvalidToken
}
hash := sha256.Sum256([]byte(plaintext))
var tokenID string
err := db.QueryRowContext(ctx, `
SELECT t.id
FROM workspace_auth_tokens t
JOIN workspaces w ON w.id = t.workspace_id
WHERE t.token_hash = $1
AND t.revoked_at IS NULL
AND w.status != 'removed'
`, hash[:]).Scan(&tokenID)
tokenID, _, err := lookupTokenByHash(ctx, db, hash[:])
if err != nil {
return ErrInvalidToken
return err
}
// Best-effort last_used_at update.

View File

@ -155,9 +155,12 @@ func TestWorkspaceFromToken_HappyPath(t *testing.T) {
t.Fatalf("IssueToken: %v", err)
}
mock.ExpectQuery(`SELECT t\.workspace_id\s+FROM workspace_auth_tokens t.*JOIN workspaces`).
// Shared lookupTokenByHash projects (id, workspace_id) — caller picks
// which fields to use. WorkspaceFromToken needs only workspace_id but
// the mock matches the unified SELECT that lookupTokenByHash issues.
mock.ExpectQuery(`SELECT t\.id, t\.workspace_id.*FROM workspace_auth_tokens t.*JOIN workspaces`).
WithArgs(sqlmock.AnyArg()).
WillReturnRows(sqlmock.NewRows([]string{"workspace_id"}).AddRow("ws-source"))
WillReturnRows(sqlmock.NewRows([]string{"id", "workspace_id"}).AddRow("tok-id-1", "ws-source"))
wsID, err := WorkspaceFromToken(context.Background(), db, tok)
if err != nil {
@ -180,7 +183,7 @@ func TestWorkspaceFromToken_EmptyTokenRejected(t *testing.T) {
func TestWorkspaceFromToken_UnknownTokenRejected(t *testing.T) {
db, mock := setupMock(t)
mock.ExpectQuery(`SELECT t\.workspace_id\s+FROM workspace_auth_tokens t.*JOIN workspaces`).
mock.ExpectQuery(`SELECT t\.id, t\.workspace_id.*FROM workspace_auth_tokens t.*JOIN workspaces`).
WillReturnError(sql.ErrNoRows)
if _, err := WorkspaceFromToken(context.Background(), db, "not-a-real-token"); err != ErrInvalidToken {
@ -192,9 +195,9 @@ func TestWorkspaceFromToken_UnknownTokenRejected(t *testing.T) {
// must NOT yield a workspace_id usable for callerID derivation.
func TestWorkspaceFromToken_RemovedWorkspaceRejected(t *testing.T) {
db, mock := setupMock(t)
mock.ExpectQuery(`SELECT t\.workspace_id\s+FROM workspace_auth_tokens t.*JOIN workspaces`).
mock.ExpectQuery(`SELECT t\.id, t\.workspace_id.*FROM workspace_auth_tokens t.*JOIN workspaces`).
WithArgs(sqlmock.AnyArg()).
WillReturnRows(sqlmock.NewRows([]string{"workspace_id"})) // empty rows
WillReturnRows(sqlmock.NewRows([]string{"id", "workspace_id"})) // empty rows
if _, err := WorkspaceFromToken(context.Background(), db, "token-for-removed-workspace"); err != ErrInvalidToken {
t.Errorf("removed workspace token: expected ErrInvalidToken, got %v", err)
@ -345,9 +348,12 @@ func TestValidateAnyToken_HappyPath(t *testing.T) {
}
// ValidateAnyToken: lookup by hash with removed-workspace JOIN.
mock.ExpectQuery(`SELECT t\.id.*FROM workspace_auth_tokens t.*JOIN workspaces`).
// Shared lookupTokenByHash projects (id, workspace_id) — caller picks
// which fields to use. ValidateAnyToken needs only id but the mock
// matches the unified SELECT that lookupTokenByHash issues.
mock.ExpectQuery(`SELECT t\.id, t\.workspace_id.*FROM workspace_auth_tokens t.*JOIN workspaces`).
WithArgs(sqlmock.AnyArg()).
WillReturnRows(sqlmock.NewRows([]string{"id"}).AddRow("tok-id-global"))
WillReturnRows(sqlmock.NewRows([]string{"id", "workspace_id"}).AddRow("tok-id-global", "ws-admin"))
// Best-effort last_used_at update.
mock.ExpectExec(`UPDATE workspace_auth_tokens SET last_used_at`).
WithArgs("tok-id-global").
@ -363,7 +369,7 @@ func TestValidateAnyToken_HappyPath(t *testing.T) {
func TestValidateAnyToken_UnknownTokenRejected(t *testing.T) {
db, mock := setupMock(t)
mock.ExpectQuery(`SELECT t\.id.*FROM workspace_auth_tokens t.*JOIN workspaces`).
mock.ExpectQuery(`SELECT t\.id, t\.workspace_id.*FROM workspace_auth_tokens t.*JOIN workspaces`).
WillReturnError(sql.ErrNoRows)
if err := ValidateAnyToken(context.Background(), db, "not-a-real-token"); err != ErrInvalidToken {
@ -378,9 +384,9 @@ func TestValidateAnyToken_UnknownTokenRejected(t *testing.T) {
func TestValidateAnyToken_RemovedWorkspaceRejected(t *testing.T) {
db, mock := setupMock(t)
// JOIN with w.status != 'removed' causes no rows — same as ErrNoRows.
mock.ExpectQuery(`SELECT t\.id.*FROM workspace_auth_tokens t.*JOIN workspaces`).
mock.ExpectQuery(`SELECT t\.id, t\.workspace_id.*FROM workspace_auth_tokens t.*JOIN workspaces`).
WithArgs(sqlmock.AnyArg()).
WillReturnRows(sqlmock.NewRows([]string{"id"})) // empty: workspace is removed
WillReturnRows(sqlmock.NewRows([]string{"id", "workspace_id"})) // empty: workspace is removed
err := ValidateAnyToken(context.Background(), db, "token-for-removed-workspace")
if err != ErrInvalidToken {

View File

@ -0,0 +1,191 @@
"""Shared inspect-based signature-snapshot helpers (#2364 item 2).
Originally lived inline in tests/test_adapter_base_signature.py.
Extracted here so each public-surface module gets its own
test_*_signature.py + snapshot file without copy-pasting the
introspection logic.
Pattern (one snapshot file per module):
from tests._signature_snapshot import (
build_class_signature_record,
build_dataclass_record,
compare_against_snapshot,
)
SNAPSHOT_PATH = Path(__file__).parent / "snapshots" / "<module>_signature.json"
def _build_full_snapshot() -> dict:
from <module> import PublicClass, PublicDataclass
return {
"module": "<module>",
"classes": [build_class_signature_record(PublicClass)],
"dataclasses": [build_dataclass_record(PublicDataclass)],
}
def test_<module>_signature_matches_snapshot():
compare_against_snapshot(_build_full_snapshot(), SNAPSHOT_PATH)
The snapshot is a stable JSON file (written with sort_keys + indent=2) so
diffs are reviewable in the PR. Any drift trips the test with both
expected and actual JSON in the failure message.
"""
import inspect
import json
from pathlib import Path
import pytest
def _annotation_repr(annotation: object) -> str:
"""Stable string form of a type annotation. ``inspect`` returns the
runtime objects which don't compare cleanly — repr is the boring
correct answer for snapshotting."""
if annotation is inspect.Parameter.empty:
return ""
if isinstance(annotation, type):
return annotation.__name__
return str(annotation)
def _parameter_record(p: inspect.Parameter) -> dict:
return {
"name": p.name,
"kind": p.kind.name,
"annotation": _annotation_repr(p.annotation),
"has_default": p.default is not inspect.Parameter.empty,
}
def _signature_record(name: str, fn: object) -> dict:
sig = inspect.signature(fn)
return {
"name": name,
"is_async": inspect.iscoroutinefunction(fn),
"is_abstract": getattr(fn, "__isabstractmethod__", False),
"parameters": [_parameter_record(p) for p in sig.parameters.values()],
"return_annotation": _annotation_repr(sig.return_annotation),
}
def build_class_signature_record(cls: type) -> dict:
"""Snapshot a class's public method surface. Public = name doesn't
start with underscore. Static/class/abstract methods are unwrapped
so the underlying function signature is captured.
Returns: ``{class: <name>, methods: [<sorted method records>]}``
"""
methods: list[dict] = []
for attr_name in sorted(vars(cls)):
if attr_name.startswith("_"):
continue
attr = vars(cls)[attr_name]
if isinstance(attr, staticmethod):
fn = attr.__func__
elif isinstance(attr, classmethod):
fn = attr.__func__
elif callable(attr):
fn = attr
else:
continue
methods.append(_signature_record(attr_name, fn))
return {"class": cls.__name__, "methods": methods}
def build_module_functions_record(module: object, function_names: list[str] | None = None) -> dict:
"""Snapshot a module's public top-level functions. By default, walks
every public callable defined IN the module (excludes re-exports
via __module__ check). Pass ``function_names`` explicitly to pin a
specific set when the module exports more than the contract surface
(e.g. internal helpers that intentionally aren't part of the gate).
Returns: ``{module: <name>, functions: [<sorted records>]}``
"""
import types
fns: list[dict] = []
target_module = module.__name__
if function_names is not None:
for fn_name in sorted(function_names):
fn = getattr(module, fn_name, None)
if fn is None or not isinstance(fn, types.FunctionType):
# Caller asked for a name that isn't a function in the
# module — surface it as part of the snapshot so the
# error path stays in the failure-message-with-diff
# path rather than blowing up here.
fns.append({"name": fn_name, "missing": True})
continue
fns.append(_signature_record(fn_name, fn))
else:
for attr_name in sorted(vars(module)):
if attr_name.startswith("_"):
continue
attr = getattr(module, attr_name)
if not isinstance(attr, types.FunctionType):
continue
# Skip re-exports — only record functions defined IN this
# module so a `from foo import bar` doesn't pollute the
# snapshot.
if getattr(attr, "__module__", None) != target_module:
continue
fns.append(_signature_record(attr_name, attr))
return {"module": target_module, "functions": fns}
def build_dataclass_record(cls: type) -> dict:
"""Snapshot a dataclass's field shape. Captures field name + type
annotation + has_default per field, plus the @dataclass(frozen=...)
flag. Default values themselves are NOT recorded (would require
brittle value-shape stringifying for non-trivial defaults).
Returns: ``{name, frozen, fields: [<field records>]}``
"""
import dataclasses as _dc
fields = []
for f in _dc.fields(cls):
fields.append({
"name": f.name,
"annotation": _annotation_repr(f.type) if not isinstance(f.type, str) else f.type,
"has_default": f.default is not _dc.MISSING or f.default_factory is not _dc.MISSING,
})
return {
"name": cls.__name__,
"frozen": getattr(cls, "__dataclass_params__").frozen,
"fields": fields,
}
def compare_against_snapshot(actual: dict, snapshot_path: Path) -> None:
"""Compare a built snapshot against a checked-in JSON file.
On first run (snapshot missing): writes the file and skips. Re-run
to verify it now passes; the snapshot file appears in the diff
of the PR introducing it.
On drift: fails the test with both expected and actual JSON in
the failure message so the reviewer sees the change without
re-running anything.
"""
if not snapshot_path.exists():
snapshot_path.parent.mkdir(parents=True, exist_ok=True)
snapshot_path.write_text(json.dumps(actual, indent=2, sort_keys=True) + "\n")
pytest.skip(
f"snapshot did not exist; wrote {snapshot_path.name}"
"re-run the test to verify it now passes"
)
expected = json.loads(snapshot_path.read_text())
if actual != expected:
actual_str = json.dumps(actual, indent=2, sort_keys=True)
expected_str = json.dumps(expected, indent=2, sort_keys=True)
pytest.fail(
f"Signature drifted from {snapshot_path.name}.\n\n"
"Update intentionally by deleting the snapshot file and re-running, "
"OR by editing it to match. The PR diff makes the change visible "
"to reviewers and to template repos that depend on this surface.\n\n"
f"=== EXPECTED ({snapshot_path.name}) ===\n{expected_str}\n\n"
f"=== ACTUAL (current source) ===\n{actual_str}\n"
)

View File

@ -0,0 +1,436 @@
{
"class": "BaseAdapter",
"dataclasses": [
{
"fields": [
{
"annotation": "str",
"has_default": false,
"name": "system_prompt"
},
{
"annotation": "list",
"has_default": false,
"name": "loaded_skills"
},
{
"annotation": "list",
"has_default": false,
"name": "langchain_tools"
},
{
"annotation": "bool",
"has_default": false,
"name": "is_coordinator"
},
{
"annotation": "list",
"has_default": false,
"name": "children"
}
],
"frozen": false,
"name": "SetupResult"
},
{
"fields": [
{
"annotation": "str",
"has_default": false,
"name": "model"
},
{
"annotation": "str | None",
"has_default": true,
"name": "system_prompt"
},
{
"annotation": "list[str]",
"has_default": true,
"name": "tools"
},
{
"annotation": "dict[str, typing.Any]",
"has_default": true,
"name": "runtime_config"
},
{
"annotation": "str",
"has_default": true,
"name": "config_path"
},
{
"annotation": "str",
"has_default": true,
"name": "workspace_id"
},
{
"annotation": "list[str]",
"has_default": true,
"name": "prompt_files"
},
{
"annotation": "int",
"has_default": true,
"name": "a2a_port"
},
{
"annotation": "Any",
"has_default": true,
"name": "heartbeat"
}
],
"frozen": false,
"name": "AdapterConfig"
},
{
"fields": [
{
"annotation": "bool",
"has_default": true,
"name": "provides_native_heartbeat"
},
{
"annotation": "bool",
"has_default": true,
"name": "provides_native_scheduler"
},
{
"annotation": "bool",
"has_default": true,
"name": "provides_native_session"
},
{
"annotation": "bool",
"has_default": true,
"name": "provides_native_status_mgmt"
},
{
"annotation": "bool",
"has_default": true,
"name": "provides_native_retry"
},
{
"annotation": "bool",
"has_default": true,
"name": "provides_activity_decoration"
},
{
"annotation": "bool",
"has_default": true,
"name": "provides_channel_dispatch"
}
],
"frozen": true,
"name": "RuntimeCapabilities"
}
],
"methods": [
{
"is_abstract": false,
"is_async": false,
"name": "append_to_memory_hook",
"parameters": [
{
"annotation": "",
"has_default": false,
"kind": "POSITIONAL_OR_KEYWORD",
"name": "self"
},
{
"annotation": "AdapterConfig",
"has_default": false,
"kind": "POSITIONAL_OR_KEYWORD",
"name": "config"
},
{
"annotation": "str",
"has_default": false,
"kind": "POSITIONAL_OR_KEYWORD",
"name": "filename"
},
{
"annotation": "str",
"has_default": false,
"kind": "POSITIONAL_OR_KEYWORD",
"name": "content"
}
],
"return_annotation": "None"
},
{
"is_abstract": false,
"is_async": false,
"name": "capabilities",
"parameters": [
{
"annotation": "",
"has_default": false,
"kind": "POSITIONAL_OR_KEYWORD",
"name": "self"
}
],
"return_annotation": "RuntimeCapabilities"
},
{
"is_abstract": true,
"is_async": true,
"name": "create_executor",
"parameters": [
{
"annotation": "",
"has_default": false,
"kind": "POSITIONAL_OR_KEYWORD",
"name": "self"
},
{
"annotation": "AdapterConfig",
"has_default": false,
"kind": "POSITIONAL_OR_KEYWORD",
"name": "config"
}
],
"return_annotation": "AgentExecutor"
},
{
"is_abstract": true,
"is_async": false,
"name": "description",
"parameters": [],
"return_annotation": "str"
},
{
"is_abstract": true,
"is_async": false,
"name": "display_name",
"parameters": [],
"return_annotation": "str"
},
{
"is_abstract": false,
"is_async": false,
"name": "get_config_schema",
"parameters": [],
"return_annotation": "dict"
},
{
"is_abstract": false,
"is_async": false,
"name": "idle_timeout_override",
"parameters": [
{
"annotation": "",
"has_default": false,
"kind": "POSITIONAL_OR_KEYWORD",
"name": "self"
}
],
"return_annotation": "int | None"
},
{
"is_abstract": false,
"is_async": true,
"name": "inject_plugins",
"parameters": [
{
"annotation": "",
"has_default": false,
"kind": "POSITIONAL_OR_KEYWORD",
"name": "self"
},
{
"annotation": "AdapterConfig",
"has_default": false,
"kind": "POSITIONAL_OR_KEYWORD",
"name": "config"
},
{
"annotation": "",
"has_default": false,
"kind": "POSITIONAL_OR_KEYWORD",
"name": "plugins"
}
],
"return_annotation": "None"
},
{
"is_abstract": false,
"is_async": true,
"name": "install_plugins_via_registry",
"parameters": [
{
"annotation": "",
"has_default": false,
"kind": "POSITIONAL_OR_KEYWORD",
"name": "self"
},
{
"annotation": "AdapterConfig",
"has_default": false,
"kind": "POSITIONAL_OR_KEYWORD",
"name": "config"
},
{
"annotation": "",
"has_default": false,
"kind": "POSITIONAL_OR_KEYWORD",
"name": "plugins"
}
],
"return_annotation": "list"
},
{
"is_abstract": false,
"is_async": false,
"name": "memory_filename",
"parameters": [
{
"annotation": "",
"has_default": false,
"kind": "POSITIONAL_OR_KEYWORD",
"name": "self"
}
],
"return_annotation": "str"
},
{
"is_abstract": true,
"is_async": false,
"name": "name",
"parameters": [],
"return_annotation": "str"
},
{
"is_abstract": false,
"is_async": false,
"name": "pre_stop_state",
"parameters": [
{
"annotation": "",
"has_default": false,
"kind": "POSITIONAL_OR_KEYWORD",
"name": "self"
}
],
"return_annotation": "dict"
},
{
"is_abstract": false,
"is_async": false,
"name": "register_subagent_hook",
"parameters": [
{
"annotation": "",
"has_default": false,
"kind": "POSITIONAL_OR_KEYWORD",
"name": "self"
},
{
"annotation": "str",
"has_default": false,
"kind": "POSITIONAL_OR_KEYWORD",
"name": "name"
},
{
"annotation": "dict",
"has_default": false,
"kind": "POSITIONAL_OR_KEYWORD",
"name": "spec"
}
],
"return_annotation": "None"
},
{
"is_abstract": false,
"is_async": false,
"name": "register_tool_hook",
"parameters": [
{
"annotation": "",
"has_default": false,
"kind": "POSITIONAL_OR_KEYWORD",
"name": "self"
},
{
"annotation": "str",
"has_default": false,
"kind": "POSITIONAL_OR_KEYWORD",
"name": "name"
},
{
"annotation": "",
"has_default": false,
"kind": "POSITIONAL_OR_KEYWORD",
"name": "fn"
}
],
"return_annotation": "None"
},
{
"is_abstract": false,
"is_async": false,
"name": "restore_state",
"parameters": [
{
"annotation": "",
"has_default": false,
"kind": "POSITIONAL_OR_KEYWORD",
"name": "self"
},
{
"annotation": "dict",
"has_default": false,
"kind": "POSITIONAL_OR_KEYWORD",
"name": "snapshot"
}
],
"return_annotation": "None"
},
{
"is_abstract": true,
"is_async": true,
"name": "setup",
"parameters": [
{
"annotation": "",
"has_default": false,
"kind": "POSITIONAL_OR_KEYWORD",
"name": "self"
},
{
"annotation": "AdapterConfig",
"has_default": false,
"kind": "POSITIONAL_OR_KEYWORD",
"name": "config"
}
],
"return_annotation": "None"
},
{
"is_abstract": false,
"is_async": true,
"name": "transcript_lines",
"parameters": [
{
"annotation": "",
"has_default": false,
"kind": "POSITIONAL_OR_KEYWORD",
"name": "self"
},
{
"annotation": "int",
"has_default": true,
"kind": "POSITIONAL_OR_KEYWORD",
"name": "since"
},
{
"annotation": "int",
"has_default": true,
"kind": "POSITIONAL_OR_KEYWORD",
"name": "limit"
}
],
"return_annotation": "dict"
}
]
}

View File

@ -0,0 +1,40 @@
{
"functions": [
{
"is_abstract": false,
"is_async": false,
"name": "clear_wedge",
"parameters": [],
"return_annotation": "None"
},
{
"is_abstract": false,
"is_async": false,
"name": "is_wedged",
"parameters": [],
"return_annotation": "bool"
},
{
"is_abstract": false,
"is_async": false,
"name": "mark_wedged",
"parameters": [
{
"annotation": "str",
"has_default": false,
"kind": "POSITIONAL_OR_KEYWORD",
"name": "reason"
}
],
"return_annotation": "None"
},
{
"is_abstract": false,
"is_async": false,
"name": "wedge_reason",
"parameters": [],
"return_annotation": "str"
}
],
"module": "runtime_wedge"
}

View File

@ -0,0 +1,62 @@
{
"dataclasses": [
{
"fields": [
{
"annotation": "str",
"has_default": false,
"name": "id"
},
{
"annotation": "str",
"has_default": false,
"name": "name"
},
{
"annotation": "str",
"has_default": false,
"name": "description"
},
{
"annotation": "list[str]",
"has_default": true,
"name": "tags"
},
{
"annotation": "list[str]",
"has_default": true,
"name": "examples"
},
{
"annotation": "list[str]",
"has_default": true,
"name": "runtime"
}
],
"frozen": false,
"name": "SkillMetadata"
},
{
"fields": [
{
"annotation": "SkillMetadata",
"has_default": false,
"name": "metadata"
},
{
"annotation": "str",
"has_default": false,
"name": "instructions"
},
{
"annotation": "list[typing.Any]",
"has_default": true,
"name": "tools"
}
],
"frozen": false,
"name": "LoadedSkill"
}
],
"module": "skill_loader.loader"
}

View File

@ -0,0 +1,162 @@
"""BaseAdapter public-API signature snapshot — drift gate (#2364 item 2).
Every workspace template subclasses ``BaseAdapter``. Renaming, removing,
or re-typing a method on the base class or a field on the public
dataclasses (SetupResult, AdapterConfig, RuntimeCapabilities)
silently breaks templates that rely on the old shape. Without a
frozen snapshot, the next rename ships quietly and only surfaces when
a template's CI catches the AttributeError days later.
Helpers live in ``tests/_signature_snapshot.py`` so future surfaces
(skill_loader, etc.) reuse the same introspection logic.
When the failure is intentional:
1. Make the API change in ``adapter_base.py``.
2. Run the test once to see the diff in the failure message.
3. Update ``tests/snapshots/adapter_base_signature.json`` to match
the new shape (or delete it and re-run to regenerate). That
update IS the explicit acknowledgment that templates need
follow-up. Reviewer of the PR sees the snapshot diff in their
review and decides whether template repos need coordinated
updates.
Same-shape pattern as PR #2363's A2A protocol-compat replay gate.
Both close drift classes by snapshotting the structural surface that
templates or callers depend on.
"""
import json
import sys
from pathlib import Path
import pytest
# Resolve workspace/ as the import root so adapter_base imports clean.
WORKSPACE_DIR = Path(__file__).parent.parent
if str(WORKSPACE_DIR) not in sys.path:
sys.path.insert(0, str(WORKSPACE_DIR))
from tests._signature_snapshot import ( # noqa: E402
build_class_signature_record,
build_dataclass_record,
compare_against_snapshot,
)
SNAPSHOT_PATH = Path(__file__).parent / "snapshots" / "adapter_base_signature.json"
def _build_full_snapshot() -> dict:
"""Snapshot of BaseAdapter methods + the three public dataclasses
that form the call/return contract between the platform and every
adapter:
- SetupResult: returned by adapter._common_setup()
- AdapterConfig: passed into adapter setup hooks
- RuntimeCapabilities: returned by adapter.capabilities();
drives platform-side dispatch routing (#117). A field rename
here silently disables every native-capability flag every
adapter currently declares.
"""
from adapter_base import AdapterConfig, BaseAdapter, RuntimeCapabilities, SetupResult
snap = build_class_signature_record(BaseAdapter)
snap["dataclasses"] = [
build_dataclass_record(SetupResult),
build_dataclass_record(AdapterConfig),
build_dataclass_record(RuntimeCapabilities),
]
return snap
def test_base_adapter_signature_matches_snapshot():
compare_against_snapshot(_build_full_snapshot(), SNAPSHOT_PATH)
def test_snapshot_has_required_methods():
"""Defense-in-depth: the snapshot must include the methods every
template overrides. If a future refactor accidentally drops one of
these from BaseAdapter (e.g., moves it to a mixin), the equality
test above passes if the snapshot file is also updated but THIS
test catches the structural regression.
Add a method to ``required`` ONLY when removing it would break a
deployed template. The list is intentionally short.
"""
if not SNAPSHOT_PATH.exists():
pytest.skip(f"{SNAPSHOT_PATH.name} not generated yet")
snapshot = json.loads(SNAPSHOT_PATH.read_text())
method_names = {m["name"] for m in snapshot["methods"]}
required = {
"name", # runtime identifier — every template MUST implement
"display_name", # UI-facing label
"description", # short description
"capabilities", # native vs platform-fallback declaration (#117)
"memory_filename", # plugin-pipeline hook
}
missing = required - method_names
if missing:
pytest.fail(
f"BaseAdapter snapshot is missing required methods: {sorted(missing)}.\n"
"Either restore them on adapter_base.py, OR coordinate template "
"updates AND remove the entry from `required` in this test with "
"a justification."
)
def test_snapshot_has_required_dataclass_fields():
"""Defense-in-depth for the dataclass shapes — same rationale as
test_snapshot_has_required_methods but for fields that adapters
pattern-match on.
The most load-bearing case: RuntimeCapabilities flags drive
platform-side dispatch routing. Renaming a flag silently turns
every adapter's native-capability declaration into a no-op
(the platform fallback runs), with no AttributeError to surface
the breakage.
"""
if not SNAPSHOT_PATH.exists():
pytest.skip(f"{SNAPSHOT_PATH.name} not generated yet")
snapshot = json.loads(SNAPSHOT_PATH.read_text())
dataclasses = {dc["name"]: dc for dc in snapshot.get("dataclasses", [])}
expected = {
"RuntimeCapabilities": {
# Each flag here drives a specific platform-side consumer
# (heartbeat, cron, session, etc). Removing one without
# coordinated platform-side migration silently drops back
# to the platform fallback — see project memory
# `project_runtime_native_pluggable.md`.
"provides_native_heartbeat",
"provides_native_scheduler",
"provides_native_session",
},
"AdapterConfig": {
"model",
"system_prompt",
},
"SetupResult": {
"system_prompt",
"loaded_skills",
},
}
for cls_name, required_fields in expected.items():
if cls_name not in dataclasses:
pytest.fail(
f"Public dataclass {cls_name} missing from snapshot — "
"either it was removed from adapter_base, OR the snapshot "
"wasn't regenerated after a refactor."
)
actual_fields = {f["name"] for f in dataclasses[cls_name]["fields"]}
missing = required_fields - actual_fields
if missing:
pytest.fail(
f"{cls_name} is missing required fields: {sorted(missing)}.\n"
"Either restore them on adapter_base.py, OR coordinate template "
"updates AND remove the entry from `expected` in this test "
"with a justification."
)

View File

@ -0,0 +1,94 @@
"""runtime_wedge public-API signature snapshot — drift gate.
``BaseAdapter`` docstring explicitly tells adapter authors to call
``runtime_wedge.mark_wedged(reason)`` / ``clear_wedge()`` when their
SDK hits a non-recoverable error class; the heartbeat thread reads
``is_wedged()`` / ``wedge_reason()`` to flip the workspace to
``degraded`` and surface the cause to the canvas.
That's a public adapter-facing API. Renaming any of the four
functions silently breaks every adapter that calls them: the import
still resolves the module, and the missing attribute raises
``AttributeError`` only when the adapter actually hits its first
SDK error, long after the rename merges.
Same drift class as the BaseAdapter signature snapshot (#2378, #2380)
and skill_loader gate (#2381), applied to the module-level
function surface.
"""
import sys
from pathlib import Path
import pytest
WORKSPACE_DIR = Path(__file__).parent.parent
if str(WORKSPACE_DIR) not in sys.path:
sys.path.insert(0, str(WORKSPACE_DIR))
from tests._signature_snapshot import ( # noqa: E402
build_module_functions_record,
compare_against_snapshot,
)
SNAPSHOT_PATH = Path(__file__).parent / "snapshots" / "runtime_wedge_signature.json"
def _build_full_snapshot() -> dict:
"""Pin only the four contract functions adapters call. Other module-
level helpers (``reset_for_test``, internal state) intentionally
aren't part of the snapshot — adapters MUST NOT depend on them.
"""
import runtime_wedge
return build_module_functions_record(
runtime_wedge,
function_names=[
"is_wedged",
"wedge_reason",
"mark_wedged",
"clear_wedge",
],
)
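# A hedged sketch of the record assumed to come back from the call above — the
# checks below rely only on a top-level "functions" list, each entry's "name",
# and an optional "missing" marker for names that resolved to non-functions:
#
#   {
#     "module": "runtime_wedge",
#     "functions": [
#       {"name": "is_wedged", ...},
#       {"name": "wedge_reason", ...},
#       {"name": "mark_wedged", ...},
#       {"name": "clear_wedge", ...}
#     ]
#   }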
def test_runtime_wedge_signature_matches_snapshot():
compare_against_snapshot(_build_full_snapshot(), SNAPSHOT_PATH)
def test_snapshot_has_required_functions():
"""Defense-in-depth: even if both source and snapshot are updated
together, removing any of the four adapter-facing functions
requires explicit edit here. The required set is the documented
public contract see ``BaseAdapter`` docstring.
"""
if not SNAPSHOT_PATH.exists():
pytest.skip(f"{SNAPSHOT_PATH.name} not generated yet")
import json
snapshot = json.loads(SNAPSHOT_PATH.read_text())
fn_names = {f["name"] for f in snapshot["functions"]}
required = {
"is_wedged", # platform-side heartbeat reads this
"wedge_reason", # surfaces the why on the canvas
"mark_wedged", # adapters call this on non-recoverable errors
"clear_wedge", # adapters call this on auto-recovery
}
missing = required - fn_names
if missing:
pytest.fail(
f"runtime_wedge snapshot is missing required functions: {sorted(missing)}.\n"
"Either restore them on runtime_wedge.py, OR coordinate adapter "
"updates AND remove the entry from `required` in this test "
"with a justification."
)
for fn in snapshot["functions"]:
if fn.get("missing"):
pytest.fail(
f"runtime_wedge.{fn['name']} resolved as a non-function — "
"either it was replaced by a different kind of attribute "
"(class? module-level alias?) which adapters' direct call "
"would break, OR it was removed entirely."
)

View File

@ -0,0 +1,108 @@
"""skill_loader public-API signature snapshot — drift gate.
Every workspace template's adapter pulls skills via
``skill_loader.load_skills``. The returned ``LoadedSkill`` objects
expose ``metadata`` (a ``SkillMetadata``) which adapters pattern-match
on for runtime-compat filtering; see the ``hermes`` and ``claude-code``
adapters, which inspect ``skill.metadata.runtime`` to decide whether
to load a skill or skip it.
Renaming a field on ``SkillMetadata`` (e.g. ``runtime`` → ``runtimes``)
would silently break that filtering. The skill loader call still
returns objects, but every adapter's ``if "*" in skill.metadata.runtime``
check raises AttributeError at workspace boot, too late to be caught
by the introducing PR's CI.
Same drift class as the BaseAdapter signature snapshot (#2378, #2380),
applied to a different public surface.
"""
import sys
from pathlib import Path
import pytest
WORKSPACE_DIR = Path(__file__).parent.parent
if str(WORKSPACE_DIR) not in sys.path:
sys.path.insert(0, str(WORKSPACE_DIR))
from tests._signature_snapshot import ( # noqa: E402
build_dataclass_record,
compare_against_snapshot,
)
SNAPSHOT_PATH = Path(__file__).parent / "snapshots" / "skill_loader_signature.json"
def _build_full_snapshot() -> dict:
"""Snapshot the public dataclasses exported from skill_loader.
SkillMetadata fields are consumed by:
- adapter runtime filtering (``runtime`` field)
- canvas UI display (``name``, ``description``, ``tags``)
- skill discovery / search (``id``, ``examples``)
LoadedSkill is the return shape from ``load_skills`` and is held
in ``SetupResult.loaded_skills``; every adapter consumes it.
"""
from skill_loader.loader import LoadedSkill, SkillMetadata
return {
"module": "skill_loader.loader",
"dataclasses": [
build_dataclass_record(SkillMetadata),
build_dataclass_record(LoadedSkill),
],
}
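# Assumed shapes of the two dataclasses being snapshotted (a hedged sketch —
# field types and ordering are illustrative; only the field *names* checked
# below are part of the documented contract):
#
#   @dataclass
#   class SkillMetadata:
#       id: str
#       name: str
#       description: str
#       runtime: list[str]   # e.g. ["*"] or ["hermes", "claude-code"]
#
#   @dataclass
#   class LoadedSkill:
#       metadata: SkillMetadata
#       instructions: str
#       tools: list[str]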
def test_skill_loader_signature_matches_snapshot():
compare_against_snapshot(_build_full_snapshot(), SNAPSHOT_PATH)
def test_snapshot_has_required_skill_metadata_fields():
"""Defense-in-depth — adapters pattern-match on these specific
field names. Removing one without a coordinated update breaks
every adapter's skill-filter logic.
"""
if not SNAPSHOT_PATH.exists():
pytest.skip(f"{SNAPSHOT_PATH.name} not generated yet")
import json
snapshot = json.loads(SNAPSHOT_PATH.read_text())
dataclasses = {dc["name"]: dc for dc in snapshot.get("dataclasses", [])}
expected = {
"SkillMetadata": {
"id",
"name",
"description",
# `runtime` drives per-adapter skill-compat filtering. If
# this field is renamed, every adapter's
# `if "*" in skill.metadata.runtime` check raises
# AttributeError at workspace boot.
"runtime",
},
"LoadedSkill": {
"metadata",
"instructions",
"tools",
},
}
for cls_name, required_fields in expected.items():
if cls_name not in dataclasses:
pytest.fail(
f"Public dataclass {cls_name} missing from snapshot — "
"either it was removed from skill_loader.loader, OR the "
"snapshot wasn't regenerated after a refactor."
)
actual_fields = {f["name"] for f in dataclasses[cls_name]["fields"]}
missing = required_fields - actual_fields
if missing:
pytest.fail(
f"{cls_name} is missing required fields: {sorted(missing)}.\n"
"Either restore them on skill_loader/loader.py, OR coordinate "
"adapter + template updates AND remove the entry from "
"`expected` in this test with a justification."
)