Merge branch 'staging' into feat/external-runtime-first-class

Commit a5e099d644 by hongmingwang-moleculeai, 2026-04-26 16:34:17 -07:00, committed by GitHub.
132 changed files with 13312 additions and 743 deletions

View File

@@ -168,3 +168,18 @@ GSC_SERVICE_ACCOUNT= # Search Console reporter service account email
# Token goes in Authorization: Bearer header — never embed in the URL.
MOLECULE_MCP_URL= # e.g. https://api.molecule.ai or http://localhost:8080
MOLECULE_MCP_TOKEN= # workspace-scoped bearer token — NEVER COMMIT
# ---- workspace-template image refresh ----
# IMAGE_AUTO_REFRESH=true makes the platform poll GHCR every 5 min for digest
# changes on each workspace-template-*:latest. When a digest moves the
# platform pulls + force-recreates matching ws-* containers (same code path
# as POST /admin/workspace-images/refresh). Closes the runtime CD chain to
# zero operator steps.
# Default in docker-compose.yml is "true" for local dev so the runtime → ws
# loop is tight; explicit override here lets you turn it off when running a
# long test that shouldn't be disturbed by a publish.
IMAGE_AUTO_REFRESH= # true|false; unset = inherit compose default (true for local dev)
# GHCR_USER + GHCR_TOKEN are required only for private template images
# (current workspace-template-* set is public; both can stay unset).
GHCR_USER=
GHCR_TOKEN=
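
For reference, the refresh loop those comments describe reduces to a digest poll. A minimal TypeScript sketch with illustrative names (the platform's actual poller is not part of this diff); it assumes the standard OCI registry API, where a HEAD on the manifest endpoint returns the digest in the docker-content-digest header:

// Sketch only: every name here is illustrative.
const lastDigest = new Map<string, string>();

async function fetchLatestDigest(image: string): Promise<string | null> {
  // e.g. image = "molecule-ai/workspace-template-basic". GHCR also
  // requires a registry bearer token (even for public images); omitted here.
  const res = await fetch(`https://ghcr.io/v2/${image}/manifests/latest`, {
    method: "HEAD",
    headers: { Accept: "application/vnd.oci.image.index.v1+json" },
  });
  return res.ok ? res.headers.get("docker-content-digest") : null;
}

async function pollOnce(images: string[], refresh: (image: string) => Promise<void>) {
  for (const image of images) {
    const digest = await fetchLatestDigest(image);
    if (!digest) continue;
    const prev = lastDigest.get(image);
    lastDigest.set(image, digest);
    // Digest moved: same code path as POST /admin/workspace-images/refresh.
    if (prev && prev !== digest) await refresh(image);
  }
}

// Every 5 min, matching IMAGE_AUTO_REFRESH's documented cadence:
// setInterval(() => void pollOnce(templateImages, refreshImage), 5 * 60_000);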

.github/CODEOWNERS (new file, 20 lines)
View File

@@ -0,0 +1,20 @@
# Default reviewer routing for molecule-core.
#
# `*` matches every changed path, so every PR auto-requests review from
# @hongmingwang-moleculeai. The agent-PR pattern is that the
# HongmingWang-Rabbit (agent) account authors PRs; this file routes
# them into the personal account's review queue automatically — no
# manual `gh pr edit --add-reviewer` per PR.
#
# Why CODEOWNERS instead of branch-protection's review-from-anyone gate:
# the gate just says "1 review needed"; CODEOWNERS specifies *which*
# reviewer the request goes to. Without it, agent PRs sit unreviewed
# until a human happens to look at the queue.
#
# Note: `require_code_owner_reviews` on the staging branch protection
# is currently OFF, so the routing is informational rather than
# enforced. Flip it on (in branch protection settings) if you want
# CODEOWNERS approval to be the *required* review type. Until then,
# any approving review still satisfies the 1-review gate — this just
# makes sure the right person sees it.
* @hongmingwang-moleculeai

View File

@@ -38,7 +38,14 @@ jobs:
canary:
name: Canary smoke
runs-on: ubuntu-latest
# 25 min leaves headroom over the 15-min TLS-readiness deadline in
# tests/e2e/test_staging_full_saas.sh (#2107). Without the buffer
# the job is killed at the wall-clock 15:00 mark BEFORE the bash
# `fail` + diagnostic burst can fire, leaving every cancellation
# silent. Sibling staging E2E jobs run at 20-45 min — keeping
# canary tighter than them so a true wedge still surfaces here
# first.
timeout-minutes: 25
env:
MOLECULE_CP_URL: https://staging-api.moleculesai.app
@@ -152,14 +159,34 @@ jobs:
ADMIN_TOKEN: ${{ secrets.MOLECULE_STAGING_ADMIN_TOKEN }}
run: |
set +e
# Slug prefix matches what test_staging_full_saas.sh emits
# in canary mode:
# SLUG="e2e-canary-$(date +%Y%m%d)-${RUN_ID_SUFFIX}"
# Earlier this was `e2e-{today}-canary-` — that was the
# full-mode pattern (date FIRST, mode SECOND); canary slugs
# have mode FIRST, date SECOND. The mismatch silently
# never matched, leaving every cancelled-canary EC2 alive
# until the once-an-hour sweep eventually caught it
# (incident 2026-04-26 21:03Z: 1h25m EC2 leak before manual
# cleanup; same gap on three earlier cancellations today).
orgs=$(curl -sS "$MOLECULE_CP_URL/cp/admin/orgs" \
-H "Authorization: Bearer $ADMIN_TOKEN" 2>/dev/null \
| python3 -c "
import json, sys, os
run_id = os.environ.get('GITHUB_RUN_ID', '')
d = json.load(sys.stdin)
today = __import__('datetime').date.today().strftime('%Y%m%d')
# Scope to slugs from THIS canary run when GITHUB_RUN_ID is
# available; the canary workflow sets E2E_RUN_ID='canary-\${run_id}'
# so the slug suffix is '-canary-\${run_id}-...'. Mirrors the
# full-mode safety net's per-run scoping (e2e-staging-saas.yml)
# added after the 2026-04-21 cross-run cleanup incident.
if run_id:
prefix = f'e2e-canary-{today}-canary-{run_id}'
else:
prefix = f'e2e-canary-{today}-'
candidates = [o['slug'] for o in d.get('orgs', [])
if o.get('slug','').startswith(prefix)
and o.get('status') not in ('purged',)]
print('\n'.join(candidates))
" 2>/dev/null)

View File

@@ -16,7 +16,11 @@ name: publish-runtime
# build/molecule_runtime/ with imports rewritten (`a2a_client` →
# `molecule_runtime.a2a_client`).
# 2. Builds wheel + sdist with `python -m build`.
# 3. Publishes to PyPI via the PyPA Trusted Publisher action (OIDC).
# No static API token is stored — PyPI verifies the workflow's
# OIDC claim against the trusted-publisher config registered for
# molecule-ai-workspace-runtime (Molecule-AI/molecule-core,
# publish-runtime.yml, environment pypi-publish).
#
# After publish: the 8 template repos pick up the new version on their
# next image rebuild (their requirements.txt pin
@@ -42,6 +46,9 @@ jobs:
publish:
runs-on: ubuntu-latest
environment: pypi-publish
permissions:
contents: read
id-token: write # PyPI Trusted Publisher (OIDC) — no PYPI_TOKEN needed
steps:
- uses: actions/checkout@v4
@@ -84,8 +91,12 @@ jobs:
run: |
python -m twine check dist/*
# Smoke-import the built wheel to catch import-rewrite mistakes
# before they hit PyPI. Asserts on STABLE INVARIANTS only —
# symbols + classes that are part of the package's public
# contract (BaseAdapter interface, the canonical a2a sentinel,
# core submodules). Don't add feature-flag-style assertions
# here — they fire false-positive every time staging is mid-
# release of that feature.
python -m venv /tmp/smoke
/tmp/smoke/bin/pip install --quiet dist/*.whl
WORKSPACE_ID=00000000-0000-0000-0000-000000000000 \
@@ -94,16 +105,23 @@ jobs:
from molecule_runtime import a2a_client, a2a_tools
from molecule_runtime.builtin_tools import memory
from molecule_runtime.adapters import get_adapter, BaseAdapter, AdapterConfig
# Stable invariants: package exports + BaseAdapter shape.
assert a2a_client._A2A_ERROR_PREFIX, 'a2a_client missing error sentinel'
assert callable(get_adapter), 'adapters.get_adapter must be callable'
assert hasattr(BaseAdapter, 'name'), 'BaseAdapter interface broken'
assert hasattr(AdapterConfig, '__init__'), 'AdapterConfig dataclass missing'
print('✓ smoke import passed')
"
- name: Publish to PyPI (Trusted Publisher / OIDC)
# PyPI side is configured: project molecule-ai-workspace-runtime →
# publisher Molecule-AI/molecule-core, workflow publish-runtime.yml,
# environment pypi-publish. The action mints a short-lived OIDC
# token and exchanges it for a PyPI upload credential — no static
# API token in this repo's secrets.
uses: pypa/gh-action-pypi-publish@release/v1
with:
packages-dir: ${{ runner.temp }}/runtime-build/dist/
cascade:
# After PyPI accepts the upload, fan out a repository_dispatch to each

.github/workflows/secret-scan.yml (new file, 200 lines)
View File

@@ -0,0 +1,200 @@
name: Secret scan
# Hard CI gate. Refuses any PR / push whose diff additions contain a
# recognisable credential. Defense-in-depth for the #2090-class incident
# (2026-04-24): GitHub's hosted Copilot Coding Agent leaked a ghs_*
# installation token into tenant-proxy/package.json via `npm init`
# slurping the URL from a token-embedded origin remote. We can't fix
# upstream's clone hygiene, so we gate here.
#
# Also the canonical reusable workflow for the rest of the org. Other
# Molecule-AI repos enroll with a single 3-line workflow:
#
# jobs:
# secret-scan:
# uses: Molecule-AI/molecule-core/.github/workflows/secret-scan.yml@staging
#
# Pin to @staging not @main — staging is the active default branch,
# main lags via the staging-promotion workflow. Updates ride along
# automatically on the next consumer workflow run.
#
# Same regex set as the runtime's bundled pre-commit hook
# (molecule-ai-workspace-runtime: molecule_runtime/scripts/pre-commit-checks.sh).
# Keep the two sides aligned when adding patterns.
on:
pull_request:
types: [opened, synchronize, reopened]
push:
branches: [main, staging]
# Required for GitHub merge queue: the queue's pre-merge CI run on
# `gh-readonly-queue/...` refs needs this check to fire so the queue
# gets a real result instead of stalling forever AWAITING_CHECKS.
merge_group:
types: [checks_requested]
# Reusable workflow entry point for other Molecule-AI repos.
workflow_call:
jobs:
scan:
name: Scan diff for credential-shaped strings
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 2 # need previous commit to diff against on push events
# For pull_request events the diff base may be many commits behind
# HEAD and absent from the shallow clone. Fetch it explicitly.
- name: Fetch PR base SHA (pull_request events only)
if: github.event_name == 'pull_request'
run: git fetch --depth=1 origin ${{ github.event.pull_request.base.sha }}
# For merge_group events the queue's pre-merge ref is a commit on
# `gh-readonly-queue/...` whose parent is the queue's base_sha.
# That parent isn't part of the queue branch's shallow clone, so
# we fetch it explicitly. Without this the diff falls through to
# "no BASE → scan entire tree" mode and false-positives on legit
# test fixtures (e.g. canvas/src/lib/validation/__tests__/secret-formats.test.ts).
- name: Fetch merge_group base SHA (merge_group events only)
if: github.event_name == 'merge_group'
run: git fetch --depth=1 origin ${{ github.event.merge_group.base_sha }}
- name: Refuse if credential-shaped strings appear in diff additions
env:
# Plumb event-specific SHAs through env so the script doesn't
# need conditional `${{ ... }}` interpolation per event type.
# github.event.before/after only exist on push events;
# merge_group has its own base_sha/head_sha; pull_request has
# pull_request.base.sha / pull_request.head.sha.
PR_BASE_SHA: ${{ github.event.pull_request.base.sha }}
PR_HEAD_SHA: ${{ github.event.pull_request.head.sha }}
MG_BASE_SHA: ${{ github.event.merge_group.base_sha }}
MG_HEAD_SHA: ${{ github.event.merge_group.head_sha }}
PUSH_BEFORE: ${{ github.event.before }}
PUSH_AFTER: ${{ github.event.after }}
run: |
# Pattern set covers GitHub family (the actual #2090 vector),
# Anthropic / OpenAI / Slack / AWS. Anchored on prefixes with low
# false-positive rates against agent-generated content. Mirror of
# molecule-ai-workspace-runtime/molecule_runtime/scripts/pre-commit-checks.sh
# — keep aligned.
SECRET_PATTERNS=(
'ghp_[A-Za-z0-9]{36,}' # GitHub PAT (classic)
'ghs_[A-Za-z0-9]{36,}' # GitHub App installation token
'gho_[A-Za-z0-9]{36,}' # GitHub OAuth user-to-server
'ghu_[A-Za-z0-9]{36,}' # GitHub OAuth user
'ghr_[A-Za-z0-9]{36,}' # GitHub OAuth refresh
'github_pat_[A-Za-z0-9_]{82,}' # GitHub fine-grained PAT
'sk-ant-[A-Za-z0-9_-]{40,}' # Anthropic API key
'sk-proj-[A-Za-z0-9_-]{40,}' # OpenAI project key
'sk-svcacct-[A-Za-z0-9_-]{40,}' # OpenAI service-account key
'xox[baprs]-[A-Za-z0-9-]{20,}' # Slack tokens
'AKIA[0-9A-Z]{16}' # AWS access key ID
'ASIA[0-9A-Z]{16}' # AWS STS temp access key ID
)
# Determine the diff base. Each event type stores its SHAs in
# a different place — see the env block above.
case "${{ github.event_name }}" in
pull_request)
BASE="$PR_BASE_SHA"
HEAD="$PR_HEAD_SHA"
;;
merge_group)
BASE="$MG_BASE_SHA"
HEAD="$MG_HEAD_SHA"
;;
*)
BASE="$PUSH_BEFORE"
HEAD="$PUSH_AFTER"
;;
esac
# On push events with shallow clones, BASE may be present in
# the event payload but absent from the local object DB
# (fetch-depth=2 doesn't always reach the previous commit
# across true merges). Try fetching it on demand. If the
# fetch fails — e.g. the SHA was force-overwritten — we fall
# through to the empty-BASE branch below, which scans the
# entire tree as if every file were new. Correct, just slow.
if [ -n "$BASE" ] && ! echo "$BASE" | grep -qE '^0+$'; then
if ! git cat-file -e "$BASE" 2>/dev/null; then
git fetch --depth=1 origin "$BASE" 2>/dev/null || true
fi
fi
# Files added or modified in this change.
if [ -z "$BASE" ] || echo "$BASE" | grep -qE '^0+$' || ! git cat-file -e "$BASE" 2>/dev/null; then
# New branch / no previous SHA / BASE unreachable — check the
# entire tree as added content. Slower, but correct on first
# push.
CHANGED=$(git ls-tree -r --name-only HEAD)
DIFF_RANGE=""
else
CHANGED=$(git diff --name-only --diff-filter=AM "$BASE" "$HEAD")
DIFF_RANGE="$BASE $HEAD"
fi
if [ -z "$CHANGED" ]; then
echo "No changed files to inspect."
exit 0
fi
# Self-exclude: this workflow file legitimately contains the
# pattern strings as regex literals. Without an exclude it would
# block its own merge.
SELF=".github/workflows/secret-scan.yml"
OFFENDING=""
for f in $CHANGED; do
[ "$f" = "$SELF" ] && continue
if [ -n "$DIFF_RANGE" ]; then
ADDED=$(git diff --no-color --unified=0 "$BASE" "$HEAD" -- "$f" 2>/dev/null | grep -E '^\+[^+]' || true)
else
# No diff range (new branch first push) — scan the full file
# contents as if every line were new.
ADDED=$(cat "$f" 2>/dev/null || true)
fi
[ -z "$ADDED" ] && continue
for pattern in "${SECRET_PATTERNS[@]}"; do
if echo "$ADDED" | grep -qE "$pattern"; then
OFFENDING="${OFFENDING}${f} (matched: ${pattern})\n"
break
fi
done
done
if [ -n "$OFFENDING" ]; then
echo "::error::Credential-shaped strings detected in diff additions:"
printf "$OFFENDING"
echo ""
echo "The actual matched values are NOT echoed here, deliberately —"
echo "round-tripping a leaked credential into CI logs widens the blast"
echo "radius (logs are searchable + retained)."
echo ""
echo "Recovery:"
echo " 1. Remove the secret from the file. Replace with an env var"
echo " reference (e.g. \${{ secrets.GITHUB_TOKEN }} in workflows,"
echo " process.env.X in code)."
echo " 2. If the credential was already pushed (this PR's commit"
echo " history reaches a public ref), treat it as compromised —"
echo " ROTATE it immediately, do not just remove it. The token"
echo " remains valid in git history forever and may be in any"
echo " log/cache that consumed this branch."
echo " 3. Force-push the cleaned commit (or stack a revert) and"
echo " re-run CI."
echo ""
echo "If the match is a false positive (test fixture, docs example,"
echo "or this workflow's own regex literals): use a clearly-fake"
echo "placeholder like ghs_EXAMPLE_DO_NOT_USE that doesn't satisfy"
echo "the length suffix, OR add the file path to the SELF exclude"
echo "list in this workflow with a short reason."
echo ""
echo "Mirror of the regex set lives in the runtime's bundled"
echo "pre-commit hook (molecule-ai-workspace-runtime:"
echo "molecule_runtime/scripts/pre-commit-checks.sh) — keep aligned."
exit 1
fi
echo "✓ No credential-shaped strings in this change."

View File

@@ -50,12 +50,13 @@ const WORKSPACE_ONLINE_TIMEOUT_MS = 20 * 60 * 1000;
// TLS readiness depends on (1) Cloudflare DNS propagation through the
// edge, (2) the tenant's CF Tunnel registering the new hostname, (3)
// CF's edge ACME cert provisioning + cache. Each of these layers can
// add 1-3 min on its own under heavy staging load. Bumped 10→15 min
// after a burst of canary failures correlated with CP changes (#2090).
// Stays below the 20-min PROVISION_TIMEOUT envelope so a genuinely-
// stuck tenant fails-loud at the provision step rather than
// masquerading as a TLS issue. Kept aligned with
// tests/e2e/test_staging_full_saas.sh.
const TLS_TIMEOUT_MS = 15 * 60 * 1000;
async function jsonFetch(
url: string,

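For orientation, the loop this constant bounds looks roughly like the sketch below (illustrative only; the file's real probe is not shown in this hunk):

// Sketch: poll the tenant's HTTPS endpoint until the edge serves a
// working cert or TLS_TIMEOUT_MS elapses. A rejected fetch() while
// ACME provisioning is in flight is the "not ready yet" signal.
async function waitForTls(url: string): Promise<void> {
  const deadline = Date.now() + TLS_TIMEOUT_MS;
  while (Date.now() < deadline) {
    try {
      await fetch(url, { method: "HEAD" });
      return; // handshake succeeded: edge cert is live
    } catch {
      await new Promise((r) => setTimeout(r, 10_000)); // retry every 10 s
    }
  }
  throw new Error(`TLS not ready within ${TLS_TIMEOUT_MS / 60_000} min: ${url}`);
}
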
View File

@@ -1,7 +1,100 @@
import type { NextConfig } from "next";
import { existsSync, readFileSync } from "node:fs";
import { dirname, join } from "node:path";
// Load NEXT_PUBLIC_* vars from the monorepo root .env so a fresh
// `pnpm dev` works without a per-developer canvas/.env.local. Next.js
// only auto-loads .env from the project root by default — but our
// canonical config (NEXT_PUBLIC_PLATFORM_URL, NEXT_PUBLIC_WS_URL,
// MOLECULE_ENV, etc.) lives at the monorepo root, gitignored, shared
// by the Go platform binary. Without this, the canvas falls back to
// `window.location` (`ws://localhost:3000/ws`) and the WS pill stays
// "Reconnecting" forever because Next.js dev doesn't serve /ws.
//
// Mirrors workspace-server/cmd/server/dotenv.go's monorepo-rooted .env
// loader. Both processes look for the SAME marker (`workspace-server/
// go.mod`) so a developer renaming or relocating the repo only has to
// update one heuristic. Production is unaffected: `output: "standalone"`
// bakes resolved env into the build, and the marker file isn't shipped.
loadMonorepoEnv();
const nextConfig: NextConfig = {
output: "standalone",
};
export default nextConfig;
function loadMonorepoEnv() {
const root = findMonorepoRoot(__dirname);
if (!root) return;
const envPath = join(root, ".env");
if (!existsSync(envPath)) return;
const body = readFileSync(envPath, "utf8");
let loaded = 0;
let skipped = 0;
for (const line of body.split(/\r?\n/)) {
const kv = parseLine(line);
if (!kv) continue;
const [k, v] = kv;
// Existing env wins. NOTE: an explicitly-set empty string
// (`KEY=` exported from a parent shell, where Node represents it
// as `""` not `undefined`) counts as "set" — we keep the empty
// value rather than backfilling from the file. Matches Go's
// os.LookupEnv check in workspace-server/cmd/server/dotenv.go so
// both processes treat the same input identically. Operators who
// want the file value to win must `unset KEY` in the launching
// shell.
if (process.env[k] !== undefined) {
skipped++;
continue;
}
process.env[k] = v;
loaded++;
}
// eslint-disable-next-line no-console
console.log(
`[next.config] loaded ${loaded} vars from ${envPath} (${skipped} already set in env)`,
);
}
function findMonorepoRoot(start: string): string | null {
let dir = start;
for (let i = 0; i < 6; i++) {
if (existsSync(join(dir, "workspace-server", "go.mod"))) return dir;
const parent = dirname(dir);
if (parent === dir) break;
dir = parent;
}
return null;
}
// Mirror of workspace-server/cmd/server/dotenv.go's parseDotEnvLine
// — same rules so the two loaders agree on every line in the shared
// .env. If you change one parser, change the other.
function parseLine(raw: string): [string, string] | null {
let line = raw.replace(/^\uFEFF/, "").trim(); // strip a leading BOM if present
if (line === "" || line.startsWith("#")) return null;
// `export ` prefix uses a literal space — `export\tFOO=bar` with a
// tab is intentionally rejected, matching the Go mirror in
// workspace-server/cmd/server/dotenv.go. Shells emit the prefix
// with a space; tabs would only appear in hand-mangled files.
if (line.startsWith("export ")) line = line.slice("export ".length).trimStart();
const eq = line.indexOf("=");
if (eq <= 0) return null;
const k = line.slice(0, eq).trim();
let v = line.slice(eq + 1).replace(/^[ \t]+/, "");
if (v.length >= 2 && (v[0] === '"' || v[0] === "'")) {
const quote = v[0];
const end = v.indexOf(quote, 1);
if (end >= 0) return [k, v.slice(1, end)];
// unterminated — fall through to bare-value handling
}
for (let i = 0; i < v.length; i++) {
if (v[i] !== "#") continue;
if (i === 0 || v[i - 1] === " " || v[i - 1] === "\t") {
v = v.slice(0, i);
break;
}
}
return [k, v.trim()];
}
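// Worked examples of the rules above (input line -> parsed [key, value]):
//
//   FOO=bar                -> ["FOO", "bar"]
//   export FOO="a b" # c   -> ["FOO", "a b"]     (quoted value wins; trailing comment dropped)
//   FOO=bar # comment      -> ["FOO", "bar"]     (# after whitespace starts a comment)
//   FOO=bar#baz            -> ["FOO", "bar#baz"] (# mid-token is literal)
//   blank / # comment      -> null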

View File

@@ -1,5 +1,9 @@
@import "xterm/css/xterm.css";
/* Theme tokens MUST load before any feature stylesheet that
references them so custom properties are in scope. */
@import "../styles/theme-tokens.css";
@import "../styles/settings-panel.css";
@import "../styles/org-deploy.css";
@tailwind base;
@tailwind components;
@@ -38,7 +42,20 @@ body {
}
.react-flow__node {
/* Transform transition drives the "spawn from parent" motion:
org-deploy sets the node's initial position to the parent's
absolute coords, then repositions to the real slot, and this
transition interpolates the translate() in between.
Non-deploy workspace moves (drag, nest) get the same smoothing
for free. */
transition:
box-shadow var(--mol-duration-fast) ease,
transform var(--mol-duration-spawn) var(--mol-easing-bounce-out);
}
/* Drag events must feel instant; React Flow adds this class
for the lifetime of the gesture. */
.react-flow__node.dragging {
transition: box-shadow var(--mol-duration-fast) ease;
}
/* Scrollbar styling */

View File

@@ -7,13 +7,19 @@ import { CommunicationOverlay } from "@/components/CommunicationOverlay";
import { Spinner } from "@/components/Spinner";
import { connectSocket, disconnectSocket } from "@/store/socket";
import { useCanvasStore } from "@/store/canvas";
import { api } from "@/lib/api";
import { api, PlatformUnavailableError } from "@/lib/api";
import type { WorkspaceData } from "@/store/socket";
export default function Home() {
const hydrationError = useCanvasStore((s) => s.hydrationError);
const setHydrationError = useCanvasStore((s) => s.setHydrationError);
const [hydrating, setHydrating] = useState(true);
// Distinct from hydrationError: platform-down is its own UX path
// (different copy, different action — the user's next step is to
// check local services, not to retry the API call). Tracked
// separately rather than encoded into hydrationError so the
// generic-error branch can stay simple.
const [platformDown, setPlatformDown] = useState(false);
useEffect(() => {
connectSocket();
@@ -28,8 +34,11 @@
useCanvasStore.getState().setViewport(viewport);
}
}).catch((err) => {
console.error("Canvas: initial hydration failed", err);
if (err instanceof PlatformUnavailableError) {
setPlatformDown(true);
return;
}
useCanvasStore.getState().setHydrationError(
err instanceof Error && err.message ? err.message : "Failed to load canvas"
);
@@ -53,6 +62,10 @@
);
}
if (platformDown) {
return <PlatformDownDiagnostic />;
}
return (
<>
<Canvas />
@@ -83,3 +96,43 @@
</>
);
}
/**
* Dedicated diagnostic for the case where the platform reported its
* datastore (Postgres / Redis) is unreachable. Distinct from the
* generic API-error overlay: the user's next action is to check
* local services, not to retry the API call. Includes the exact
* commands for the common dev-host setup.
*/
function PlatformDownDiagnostic() {
return (
<div
role="alert"
className="fixed inset-0 flex flex-col items-center justify-center bg-zinc-950 text-zinc-300 gap-5 z-[9999] px-6"
>
<div className="text-amber-400 text-sm font-semibold uppercase tracking-wider">
Platform infrastructure unreachable
</div>
<p className="text-zinc-400 text-sm max-w-lg text-center leading-relaxed">
The platform server returned <code className="font-mono text-amber-300">503 platform_unavailable</code>.
That means it can&apos;t reach Postgres or Redis to validate your session.
Most common cause on a dev host: one of those services stopped.
</p>
<div className="bg-zinc-900/80 border border-zinc-700/50 rounded-lg px-4 py-3 max-w-lg w-full">
<div className="text-[10px] uppercase tracking-wider text-zinc-500 mb-2">Try first</div>
<pre className="text-[12px] text-zinc-300 font-mono whitespace-pre-wrap leading-relaxed">{`brew services start postgresql@14
brew services start redis`}</pre>
</div>
<p className="text-[11px] text-zinc-500 max-w-lg text-center">
If both are running, check <code className="font-mono">/tmp/molecule-server.log</code> for
the underlying error. If you&apos;re on hosted SaaS, this is a platform incident; try again in a moment.
</p>
<button
onClick={() => window.location.reload()}
className="px-4 py-2 bg-blue-600 hover:bg-blue-500 text-white rounded-md text-sm mt-2"
>
Reload
</button>
</div>
);
}
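
The api-layer half of this contract is outside this hunk. A minimal sketch of what page.tsx assumes, based on the 503 platform_unavailable copy above (only PlatformUnavailableError and the 503 body shape come from this diff; the rest is illustrative):

// Sketch: surface the platform's "datastore unreachable" 503 as a
// dedicated error type so callers can branch without string-matching.
export class PlatformUnavailableError extends Error {
  constructor() {
    super("platform infrastructure unreachable (503 platform_unavailable)");
    this.name = "PlatformUnavailableError";
  }
}

async function request<T>(path: string, init?: RequestInit): Promise<T> {
  const res = await fetch(path, init);
  if (res.status === 503) {
    const body = await res.json().catch(() => null);
    if (body?.error === "platform_unavailable") throw new PlatformUnavailableError();
  }
  if (!res.ok) throw new Error(`${res.status} ${res.statusText}`);
  return (await res.json()) as T;
}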

View File

@@ -74,7 +74,11 @@ export function buildA2AEdges(
});
}
// 3. Build React Flow Edge objects. We tag every overlay edge with
// type: "a2a" so React Flow renders it via our custom A2AEdge
// component (canvas/A2AEdge.tsx). The custom component portals
// its label out of the SVG layer so it (a) doesn't get hidden
// behind workspace cards and (b) is clickable.
return Array.from(map.values()).map(({ source, target, count, lastAt }) => {
const isHot = now - lastAt < A2A_HOT_MS;
const stroke = isHot ? "#8b5cf6" : "#3b82f6"; // violet-500 : blue-500
@@ -84,6 +88,7 @@
return {
id: `a2a-${source}-${target}`,
type: "a2a",
source,
target,
animated: isHot,
@@ -96,22 +101,22 @@
style: {
stroke,
strokeWidth: 2,
// Path itself stays non-interactive so node drags through
// the line still work. The clickable target is the label
// pill, which sets pointerEvents: all on its own div.
pointerEvents: "none" as React.CSSProperties["pointerEvents"],
},
// `label` keeps the same string for back-compat with any test
// that asserts on it (e.g. buildA2AEdges output shape). Custom
// edge reads the rich data from `data` so the label visual is
// not constrained to a string anymore.
label,
data: {
count,
lastAt,
isHot,
label,
},
};
});
}
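
canvas/A2AEdge.tsx is not in this hunk; the portal behavior described above corresponds to React Flow's EdgeLabelRenderer. A rough sketch of that pattern (not the shipped component):

import { BaseEdge, EdgeLabelRenderer, getBezierPath, type EdgeProps } from "@xyflow/react";

// Sketch: the path stays an SVG edge (non-interactive), while the
// label is portalled into React Flow's HTML label layer so it stacks
// above nodes and can opt back in to pointer events.
export function A2AEdgeSketch({ id, sourceX, sourceY, targetX, targetY, style, data }: EdgeProps) {
  const [path, labelX, labelY] = getBezierPath({ sourceX, sourceY, targetX, targetY });
  return (
    <>
      <BaseEdge id={id} path={path} style={style} />
      <EdgeLabelRenderer>
        <div
          style={{
            position: "absolute",
            transform: `translate(-50%, -50%) translate(${labelX}px, ${labelY}px)`,
            pointerEvents: "all", // the clickable pill described above
          }}
        >
          {String((data as { label?: string } | undefined)?.label ?? "")}
        </div>
      </EdgeLabelRenderer>
    </>
  );
}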

View File

@@ -36,11 +36,22 @@ import { DropTargetBadge } from "./canvas/DropTargetBadge";
import { useDragHandlers } from "./canvas/useDragHandlers";
import { useKeyboardShortcuts } from "./canvas/useKeyboardShortcuts";
import { useCanvasViewport } from "./canvas/useCanvasViewport";
import { A2AEdge } from "./canvas/A2AEdge";
const nodeTypes = {
workspaceNode: WorkspaceNode,
};
// Custom edge types. The default React Flow edge renders its label
// inside the SVG group (always under nodes) with pointerEvents: none
// inherited from the path. A2AEdge portals the label to a sibling
// DOM layer so it renders above nodes and accepts clicks. Keep the
// reference stable (module-scope const) so React Flow doesn't see a
// new edgeTypes object on every render and warn about prop churn.
const edgeTypes = {
a2a: A2AEdge,
};
const defaultEdgeOptions: Partial<Edge> = {
animated: true,
style: {
@@ -58,14 +69,95 @@ export function Canvas() {
}
function CanvasInner() {
const rawNodes = useCanvasStore((s) => s.nodes);
const edges = useCanvasStore((s) => s.edges);
const a2aEdges = useCanvasStore((s) => s.a2aEdges);
const showA2AEdges = useCanvasStore((s) => s.showA2AEdges);
const deletingIds = useCanvasStore((s) => s.deletingIds);
const allEdges = useMemo(
() => (showA2AEdges ? [...edges, ...a2aEdges] : edges),
[edges, a2aEdges, showA2AEdges],
);
// Drag-lock during a system-owned operation (deploy OR delete).
// React Flow respects Node.draggable, which stops the gesture
// before it starts — preventDefault() on the drag-start callback
// isn't authoritative in v12. We project `draggable: false` onto
// each locked node before handing the array to ReactFlow; the
// drag-start handler in useDragHandlers remains as a belt-and-
// braces check.
//
// Perf: short-circuit when nothing is provisioning so the memo
// passes rawNodes through unchanged (identity-stable → RF
// reconciles nothing). When a deploy IS active, build an O(n)
// root index once and re-use it. Critically, do NOT spread every
// node — only mutate the locked ones — so unmodified nodes keep
// their object identity and RF's per-node memo short-circuits.
const nodes = useMemo(() => {
const anyProvisioning = rawNodes.some((n) => n.data.status === "provisioning");
const anyDeleting = deletingIds.size > 0;
if (!anyProvisioning && !anyDeleting) return rawNodes;
const byId = new Map<string, typeof rawNodes[number]>();
for (const n of rawNodes) byId.set(n.id, n);
const rootOf = new Map<string, string>();
const resolveRoot = (id: string): string => {
// Iterative walk guards against a pathological cycle (hostile
// data) — recursion would hit the stack limit on a deep tree.
const visited = new Set<string>();
let cursor: string | null = id;
while (cursor) {
if (visited.has(cursor)) break;
visited.add(cursor);
const cached = rootOf.get(cursor);
if (cached) {
for (const seenId of visited) rootOf.set(seenId, cached);
return cached;
}
const n = byId.get(cursor);
if (!n) break;
if (!n.data.parentId) {
for (const seenId of visited) rootOf.set(seenId, cursor);
return cursor;
}
cursor = n.data.parentId;
}
return id;
};
const provisioningByRoot = new Map<string, number>();
for (const n of rawNodes) {
if (n.data.status !== "provisioning") continue;
const rootId = resolveRoot(n.id);
provisioningByRoot.set(rootId, (provisioningByRoot.get(rootId) ?? 0) + 1);
}
let touched = false;
const next = rawNodes.map((n) => {
const rootId = resolveRoot(n.id);
const deployLocked = n.id !== rootId && (provisioningByRoot.get(rootId) ?? 0) > 0;
// Delete-locked: nothing in a subtree whose DELETE is in
// flight should be draggable, INCLUDING the root of that
// subtree (unlike deploy, there's no cancel — the delete
// is irrevocable at this point).
const deleteLocked = deletingIds.has(n.id);
const shouldLock = deployLocked || deleteLocked;
if (shouldLock && n.draggable !== false) {
touched = true;
return { ...n, draggable: false };
}
if (!shouldLock && n.draggable === false) {
// Node was locked in a prior render; deploy cancelled /
// completed, or delete failed and was reverted. Restore
// default dragability.
touched = true;
const { draggable: _d, ...rest } = n;
void _d;
return rest as typeof n;
}
return n; // identity-preserved
});
return touched ? next : rawNodes;
}, [rawNodes, deletingIds]);
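// Illustration of the identity contract: with nodes [A (root,
// provisioning), B (child of A), C (unrelated root)], only B is
// re-created with draggable: false. A is its own root, so it stays
// draggable; C keeps its object identity untouched; and once nothing
// is provisioning or deleting, the memo returns rawNodes itself, so
// React Flow reconciles zero nodes.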
const onNodesChange = useCanvasStore((s) => s.onNodesChange);
const selectNode = useCanvasStore((s) => s.selectNode);
const selectedNodeId = useCanvasStore((s) => s.selectedNodeId);
@@ -91,18 +183,45 @@
// outside-click handler.
const pendingDelete = useCanvasStore((s) => s.pendingDelete);
const setPendingDelete = useCanvasStore((s) => s.setPendingDelete);
const removeSubtree = useCanvasStore((s) => s.removeSubtree);
const confirmDelete = useCallback(async () => {
if (!pendingDelete) return;
const { id } = pendingDelete;
setPendingDelete(null);
// Compute the full subtree and mark it as "deleting" so every
// node in the chain renders dim + non-draggable during the
// network round-trip + the server-side cascade. Matches the
// deploy-lock UX: once a system-initiated operation owns this
// subtree, the user shouldn't be able to move its pieces
// around until it resolves.
const state = useCanvasStore.getState();
const subtree = new Set<string>();
const stack = [id];
while (stack.length) {
const nid = stack.pop()!;
subtree.add(nid);
for (const n of state.nodes) {
if (n.data.parentId === nid) stack.push(n.id);
}
}
state.beginDelete(subtree);
try {
await api.del(`/workspaces/${id}?confirm=true`);
// Mirror the server-side cascade locally — drop the parent AND
// every descendant in one atomic update. The per-descendant
// WORKSPACE_REMOVED WS events still arrive (and are no-ops
// because the nodes are already gone), but we no longer depend
// on them: a wedged WS used to leave orphan child cards on the
// canvas until the user refreshed the page.
removeSubtree(id);
state.endDelete(subtree);
} catch (e) {
// Network or server error — restore the subtree to normal
// interaction and surface the error.
state.endDelete(subtree);
showToast(e instanceof Error ? e.message : "Delete failed", "error");
}
}, [pendingDelete, setPendingDelete, removeSubtree]);
const onPaneClick = useCallback(() => {
selectNode(null);
@@ -141,6 +260,7 @@
onPaneClick={onPaneClick}
onMoveEnd={onMoveEnd}
nodeTypes={nodeTypes}
edgeTypes={edgeTypes}
defaultEdgeOptions={defaultEdgeOptions}
defaultViewport={defaultViewport}
fitView={viewport.x === 0 && viewport.y === 0 && viewport.zoom === 1}

View File

@@ -1,27 +1,19 @@
"use client";
import { useState, useEffect, useCallback } from "react";
import { api } from "@/lib/api";
import { useCanvasStore } from "@/store/canvas";
import { OrgTemplatesSection } from "./TemplatePalette";
import { type Template } from "@/lib/deploy-preflight";
import { useTemplateDeploy } from "@/hooks/useTemplateDeploy";
import { Spinner } from "./Spinner";
import { TIER_CONFIG } from "@/lib/design-tokens";
export function EmptyState() {
const [templates, setTemplates] = useState<Template[]>([]);
const [loading, setLoading] = useState(true);
const [blankCreating, setBlankCreating] = useState(false);
const [blankError, setBlankError] = useState<string | null>(null);
useEffect(() => {
api
@@ -31,48 +23,56 @@
.finally(() => setLoading(false));
}, []);
// Canvas fills in a visible "center-ish" spot on a fresh tenant so
// the user doesn't have to pan to find their new workspace. Fixed
// (200, 150) instead of the sidebar's random placement because the
// canvas is guaranteed empty when this component mounts.
const firstDeployCoords = useCallback(() => ({ x: 200, y: 150 }), []);
// After the POST succeeds, auto-select the new workspace and flip
// the panel to Chat. This is a UX flourish that only makes sense
// on first deploy (the canvas is empty so the selection can't
// surprise anyone); the sidebar intentionally skips this step.
// 500 ms delay so React Flow has a frame to render the new node
// before it receives focus.
const handleDeployed = useCallback((workspaceId: string) => {
setTimeout(() => {
useCanvasStore.getState().selectNode(workspaceId);
useCanvasStore.getState().setPanelTab("chat");
}, 500);
}, []);
const { deploy, deploying, error, modal } = useTemplateDeploy({
canvasCoords: firstDeployCoords,
onDeployed: handleDeployed,
});
// "Create blank" bypasses templates entirely — no preflight, no
// modal, just POST /workspaces with a default name and tier.
// Deliberately NOT routed through useTemplateDeploy because it
// has no `template.id` to deploy against.
const createBlank = async () => {
setDeploying("blank");
setError(null);
setBlankCreating(true);
setBlankError(null);
try {
const ws = await api.post<{ id: string }>("/workspaces", {
name: "My First Agent",
tier: 2,
canvas: firstDeployCoords(),
});
handleDeployed(ws.id);
} catch (e) {
setBlankError(e instanceof Error ? e.message : "Create failed");
} finally {
setBlankCreating(false);
}
};
// Any active gesture locks every button so the user can't fire a
// second POST while the first is still in flight.
const anyDeploying = !!deploying || blankCreating;
const displayError = error ?? blankError;
return (
<div className="absolute inset-0 flex items-start justify-center pointer-events-none z-[1] overflow-y-auto py-8">
<div className="relative max-w-2xl w-full rounded-3xl border border-zinc-800/70 bg-zinc-950/80 backdrop-blur-xl px-8 py-8 text-center shadow-2xl shadow-black/40 pointer-events-auto mx-4">
@@ -112,8 +112,8 @@ export function EmptyState() {
<button
type="button"
key={t.id}
onClick={() => void deploy(t)}
disabled={anyDeploying}
className="group rounded-xl border border-zinc-800/60 bg-zinc-900/50 px-3.5 py-3 hover:border-blue-500/40 hover:bg-zinc-900/80 transition-all disabled:opacity-50 disabled:cursor-not-allowed disabled:hover:border-zinc-800/60 disabled:hover:bg-zinc-900/50 text-left focus:outline-none focus-visible:ring-2 focus-visible:ring-blue-500/70"
>
<div className="flex items-center gap-2 mb-1">
@@ -143,10 +143,10 @@ export function EmptyState() {
<button
type="button"
onClick={createBlank}
disabled={anyDeploying}
className="w-full rounded-xl border border-dashed border-zinc-700/60 bg-zinc-900/30 px-4 py-3 text-sm text-zinc-400 hover:text-zinc-200 hover:border-zinc-600 hover:bg-zinc-900/50 transition-all disabled:opacity-50 disabled:cursor-not-allowed disabled:hover:text-zinc-400 disabled:hover:border-zinc-700/60 focus:outline-none focus-visible:ring-2 focus-visible:ring-blue-500/70"
>
{deploying === "blank" ? "Creating..." : "+ Create blank workspace"}
{blankCreating ? "Creating..." : "+ Create blank workspace"}
</button>
{/* Org templates — instantiate a whole team in one click */}
@@ -154,12 +154,17 @@ export function EmptyState() {
<OrgTemplatesSection />
</div>
{displayError && (
<div role="alert" className="mt-3 px-3 py-2 bg-red-950/40 border border-red-800/50 rounded-lg text-xs text-red-400">
{displayError}
</div>
)}
{/* Missing-keys preflight modal owned by useTemplateDeploy,
shared with TemplatePalette. Rendered inline here so it
overlays this card naturally. */}
{modal}
{/* Tips */}
<div className="mt-5 pt-4 border-t border-zinc-800/50">
<div className="flex items-center justify-center gap-6 text-[10px] text-zinc-400">

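For readers without the hook source open: the useTemplateDeploy contract implied by the two call sites in this file, reconstructed from usage (the real types live in @/hooks/useTemplateDeploy and may differ):

import type { ReactNode } from "react";
import { type Template } from "@/lib/deploy-preflight";

interface UseTemplateDeployOptions {
  canvasCoords: () => { x: number; y: number }; // where the new node lands
  onDeployed?: (workspaceId: string) => void;   // fires after the POST succeeds
}

interface UseTemplateDeployResult {
  deploy: (t: Template) => Promise<void>; // runs preflight; may open the modal
  deploying: string | null;               // template id in flight, else null
  error: string | null;
  modal: ReactNode;                       // missing-keys preflight modal, rendered inline
}
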
View File

@@ -1,19 +1,92 @@
"use client";
import { useEffect, useState } from "react";
import { STATUS_CONFIG } from "@/lib/design-tokens";
import { useCanvasStore } from "@/store/canvas";
const LEGEND_STATUSES = ["online", "provisioning", "degraded", "failed", "paused", "offline"] as const;
// Persist the user's choice across sessions. Default is "open" so
// first-time users still see the symbol key; once dismissed we
// respect that until they explicitly reopen via the floating pill.
const STORAGE_KEY = "molecule.legend.open";
function readStoredOpen(): boolean {
if (typeof window === "undefined") return true;
try {
const v = window.localStorage.getItem(STORAGE_KEY);
if (v === null) return true;
return v === "1";
} catch {
return true;
}
}
function writeStoredOpen(open: boolean) {
if (typeof window === "undefined") return;
try {
window.localStorage.setItem(STORAGE_KEY, open ? "1" : "0");
} catch {
// localStorage can throw in private mode / quota / disabled
// contexts. Silent fallback — the in-memory state still works
// for the current session.
}
}
export function Legend() {
// TemplatePalette (when open) is fixed top-0 left-0 w-[280px] — the
// default bottom-6 left-4 position of this legend would sit under it.
// Shift past the 280 px palette + a 16 px gap when the palette is open.
const paletteOpen = useCanvasStore((s) => s.templatePaletteOpen);
const leftClass = paletteOpen ? "left-[296px]" : "left-4";
// SSR-safe pattern: mount with the default (true) so first paint
// matches the server output, then hydrate the persisted value
// after mount. Avoids a hydration mismatch warning when the user
// had previously closed the legend.
const [open, setOpen] = useState(true);
useEffect(() => {
setOpen(readStoredOpen());
}, []);
const closeLegend = () => {
setOpen(false);
writeStoredOpen(false);
};
const openLegend = () => {
setOpen(true);
writeStoredOpen(true);
};
if (!open) {
return (
<button
type="button"
onClick={openLegend}
aria-label="Show legend"
title="Show legend"
className={`fixed bottom-6 ${leftClass} z-30 flex items-center gap-1.5 rounded-full bg-zinc-900/95 border border-zinc-700/50 px-3 py-1.5 text-[11px] font-semibold text-zinc-400 uppercase tracking-wider shadow-xl shadow-black/30 backdrop-blur-sm hover:text-zinc-200 hover:border-zinc-600 transition-[left,colors] duration-200`}
>
<span aria-hidden="true" className="text-[10px]"></span>
Legend
</button>
);
}
return (
<div className={`fixed bottom-6 ${leftClass} z-30 bg-zinc-900/95 border border-zinc-700/50 rounded-xl px-4 py-3 shadow-xl shadow-black/30 backdrop-blur-sm max-w-[280px] transition-[left] duration-200`}>
<div className="text-[11px] font-semibold text-zinc-400 uppercase tracking-wider mb-2">Legend</div>
<div className="flex items-start justify-between mb-2">
<div className="text-[11px] font-semibold text-zinc-400 uppercase tracking-wider">Legend</div>
<button
type="button"
onClick={closeLegend}
aria-label="Hide legend"
title="Hide legend"
className="-mt-0.5 -mr-1 px-1.5 text-[14px] leading-none text-zinc-500 hover:text-zinc-200 transition-colors"
>
×
</button>
</div>
{/* Status */}
<div className="mb-2">

View File

@@ -1,6 +1,7 @@
"use client";
import { useState, useEffect, useCallback, useRef, useMemo } from "react";
import { createPortal } from "react-dom";
import { api } from "@/lib/api";
import { getKeyLabel, type ProviderChoice } from "@/lib/deploy-preflight";
@@ -196,6 +197,12 @@ function ProviderPickerModal({
);
if (!open) return null;
// Portal to document.body for the same reason as
// OrgImportPreflightModal — several callers (TemplatePalette,
// EmptyState) render the modal inside their own fixed+filtered
// containers, which re-anchor the "fixed" positioning to the
// wrapper's bounds instead of the viewport.
if (typeof document === "undefined") return null;
const allSaved = entries.length > 0 && entries.every((e) => e.saved);
const anySaving = entries.some((e) => e.saving);
@@ -203,8 +210,14 @@
.replace(/[-_]/g, " ")
.replace(/\b\w/g, (c) => c.toUpperCase());
return createPortal(
// z-[60] so this stacks ABOVE OrgImportPreflightModal (z-50).
// Both can be on screen at once during an org import: the org-
// preflight is open while the user clicks a per-workspace deploy
// that triggers MissingKeys. Without the explicit z-order the
// backdrop click might dismiss the wrong modal depending on
// React's commit ordering.
<div className="fixed inset-0 z-[60] flex items-center justify-center">
<div
aria-hidden="true"
className="absolute inset-0 bg-black/70 backdrop-blur-sm"
@@ -215,7 +228,7 @@
role="dialog"
aria-modal="true"
aria-labelledby="missing-keys-title"
className="relative bg-zinc-900 border border-zinc-700 rounded-xl shadow-2xl shadow-black/50 max-w-[480px] w-full mx-4 overflow-hidden"
className="relative bg-zinc-900 border border-zinc-700 rounded-xl shadow-2xl shadow-black/50 max-w-[480px] w-full mx-4 max-h-[80vh] overflow-auto"
>
<div className="px-5 py-4 border-b border-zinc-800">
<div className="flex items-center gap-2 mb-1">
@@ -360,7 +373,8 @@
</div>
</div>
</div>
</div>,
document.body,
);
}
@@ -474,6 +488,7 @@ function AllKeysModal({
}, [open]);
if (!open) return null;
if (typeof document === "undefined") return null;
const allSaved = entries.length > 0 && entries.every((e) => e.saved);
const anySaving = entries.some((e) => e.saving);
@@ -481,8 +496,14 @@
.replace(/[-_]/g, " ")
.replace(/\b\w/g, (c) => c.toUpperCase());
return createPortal(
// z-[60] so this stacks ABOVE OrgImportPreflightModal (z-50).
// Both can be on screen at once during an org import: the org-
// preflight is open while the user clicks a per-workspace deploy
// that triggers MissingKeys. Without the explicit z-order the
// backdrop click might dismiss the wrong modal depending on
// React's commit ordering.
<div className="fixed inset-0 z-[60] flex items-center justify-center">
<div
className="absolute inset-0 bg-black/70 backdrop-blur-sm"
aria-hidden="true"
@@ -493,7 +514,7 @@
role="dialog"
aria-modal="true"
aria-labelledby="missing-keys-title"
className="relative bg-zinc-900 border border-zinc-700 rounded-xl shadow-2xl shadow-black/50 max-w-[440px] w-full mx-4 overflow-hidden"
className="relative bg-zinc-900 border border-zinc-700 rounded-xl shadow-2xl shadow-black/50 max-w-[440px] w-full mx-4 max-h-[80vh] overflow-auto"
>
<div className="px-5 py-4 border-b border-zinc-800">
<div className="flex items-center gap-2 mb-1">
@@ -608,6 +629,7 @@
</div>
</div>
</div>
</div>,
document.body,
);
}

View File

@@ -0,0 +1,540 @@
"use client";
import { useCallback, useEffect, useMemo, useRef, useState } from "react";
import { createPortal } from "react-dom";
import { createSecret } from "@/lib/api/secrets";
/**
* One entry from the server's preflight `required_env` / `recommended_env`.
*
* - A plain string is a STRICT requirement: that exact env var must be
* configured.
* - A `{any_of: [...]}` object is an OR group: at least one member
* must be configured to satisfy it. Lets a template say "either
* ANTHROPIC_API_KEY or CLAUDE_CODE_OAUTH_TOKEN" without forcing
* both.
*
* Matches the Go `EnvRequirement` type's JSON shape (MarshalJSON in
* workspace-server/internal/handlers/org.go). The union is written so
* that a narrow check `typeof e === "string"` distinguishes cleanly.
*/
export type EnvRequirement = string | { any_of: string[] };
/** Flat member list for a requirement. */
export function envReqMembers(r: EnvRequirement): string[] {
return typeof r === "string" ? [r] : r.any_of;
}
/** True if any member is present in `configured`. */
export function envReqSatisfied(r: EnvRequirement, configured: Set<string>): boolean {
if (typeof r === "string") return configured.has(r);
return r.any_of.some((m) => configured.has(m));
}
/** Stable react-key / dedup key for a requirement. Sorted for groups so
* reordered-member variants still collapse to one entry. */
export function envReqKey(r: EnvRequirement): string {
if (typeof r === "string") return r;
return [...r.any_of].sort().join("|");
}
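// Example: the OR group from the doc comment above,
// { any_of: ["ANTHROPIC_API_KEY", "CLAUDE_CODE_OAUTH_TOKEN"] }, is
// satisfied by either member and keys order-insensitively:
//
//   const configured = new Set(["CLAUDE_CODE_OAUTH_TOKEN"]);
//   envReqSatisfied({ any_of: ["ANTHROPIC_API_KEY", "CLAUDE_CODE_OAUTH_TOKEN"] }, configured)
//     // => true
//   envReqKey({ any_of: ["CLAUDE_CODE_OAUTH_TOKEN", "ANTHROPIC_API_KEY"] })
//     // => "ANTHROPIC_API_KEY|CLAUDE_CODE_OAUTH_TOKEN"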
interface Props {
open: boolean;
/** Display name of the org template — headline only. */
orgName: string;
/** Total workspace count so the header can read "12 workspaces". */
workspaceCount: number;
/** Env vars the server has declared MUST be set as global secrets.
* Import is disabled until every entry here is configured. Entries
* are either a single key name or an any-of group. */
requiredEnv: EnvRequirement[];
/** Env vars the server suggests import can proceed without them,
* but the user sees them listed so they can decide. Same union
* shape as `requiredEnv`. */
recommendedEnv: EnvRequirement[];
/** Names of env vars already configured globally. Used to strike
* through entries the user has already set up in another
* session. Passed in rather than queried inside the modal so the
* parent can refresh after each save without prop-driven effects. */
configuredKeys: Set<string>;
/** Called after a successful secret save so the parent can refresh
* `configuredKeys`. */
onSecretSaved: () => void;
/** User clicked Import with all required envs satisfied. */
onProceed: () => void;
/** User dismissed the modal. Import is NOT fired. */
onCancel: () => void;
}
interface DraftEntry {
key: string;
value: string;
saving: boolean;
error: string | null;
}
/**
* OrgImportPreflightModal
* -----------------------
* Two-tier env preflight before POST /org/import:
*
* - REQUIRED section (red, blocking) every entry MUST be configured
* globally before the Import button enables. Matches the server-
* side preflight that would 412 the import anyway.
*
* - RECOMMENDED section (yellow, non-blocking) listed so the user
* can add them if they want the full experience, but the Import
* button stays enabled regardless.
*
* Saving goes to the GLOBAL secrets endpoint (PUT /settings/secrets)
* because org-level templates deploy shared resources. Per-workspace
* overrides still work via the Config tab on an individual node
* after import. The modal does NOT enable Import the moment a key is
* typed only after it saves successfully (so a half-entered token
* can't proceed and then fail at container-start time instead).
*/
export function OrgImportPreflightModal({
open,
orgName,
workspaceCount,
requiredEnv,
recommendedEnv,
configuredKeys,
onSecretSaved,
onProceed,
onCancel,
}: Props) {
const [drafts, setDrafts] = useState<Record<string, DraftEntry>>({});
// Flatten the union-shaped requirement lists to the set of every key
// that could ever appear as an input row. Used purely to seed the
// drafts map — satisfaction semantics still read from the grouped
// EnvRequirement entries (a group can be satisfied by any one
// member).
const allMemberKeys = useMemo(() => {
const keys: string[] = [];
for (const r of requiredEnv) keys.push(...envReqMembers(r));
for (const r of recommendedEnv) keys.push(...envReqMembers(r));
return keys;
}, [requiredEnv, recommendedEnv]);
// Seed a draft entry per declared key the first time the modal
// opens. Entries persist across `configuredKeys` changes so a mid-
// save recheck doesn't wipe what the user typed.
//
// Dep: derive a STABLE string from the env-name lists rather than
// the array refs themselves. The parent computes
// `preflight.org.required_env ?? []`, which produces a fresh []
// identity on every re-render (e.g. when refreshConfiguredKeys
// bumps state); depending on the array refs would re-fire the
// effect on every parent render and mask any future edit that
// drops the `if (!next[k])` guard as a silent input-reset bug.
const envKeysSignature = useMemo(
() => [...allMemberKeys].sort().join("|"),
[allMemberKeys],
);
useEffect(() => {
if (!open) return;
setDrafts((prev) => {
const next = { ...prev };
for (const k of allMemberKeys) {
if (!next[k]) {
next[k] = { key: k, value: "", saving: false, error: null };
}
}
return next;
});
// eslint-disable-next-line react-hooks/exhaustive-deps
}, [open, envKeysSignature]);
const missingRequired = useMemo(
() => requiredEnv.filter((r) => !envReqSatisfied(r, configuredKeys)),
[requiredEnv, configuredKeys],
);
const missingRecommended = useMemo(
() => recommendedEnv.filter((r) => !envReqSatisfied(r, configuredKeys)),
[recommendedEnv, configuredKeys],
);
const canProceed = missingRequired.length === 0;
// Synchronous in-flight gate. A ref (not state) so two clicks
// dispatched in the SAME microtask both see the gate flip — state
// commits don't help here because setState is async. The previous
// closure-based `current.saving` gate worked under React Testing
// Library's act() flushing but failed for true microtask-level
// double-fires (programmatic clicks, dblclick events, Enter-spam
// before React commits). Set is keyed by env var name so different
// rows can save concurrently.
const inFlightRef = useRef<Set<string>>(new Set());
// Latest-drafts ref so saveOne can read the current input value
// without taking `drafts` as a useCallback dep — that dep would
// re-create saveOne on every keystroke and re-bind every Save
// button's onClick handler, churn that scales with row count.
const draftsRef = useRef(drafts);
useEffect(() => {
draftsRef.current = drafts;
}, [drafts]);
const saveOne = useCallback(
async (key: string) => {
// Microtask-safe gate: claim the slot synchronously BEFORE any
// await so a second click in the same tick bounces immediately.
if (inFlightRef.current.has(key)) return;
const current = draftsRef.current[key];
if (!current || !current.value.trim()) return;
inFlightRef.current.add(key);
const startValue = current.value;
setDrafts((d) => ({
...d,
[key]: { ...d[key], saving: true, error: null },
}));
try {
await createSecret("global", key, startValue);
setDrafts((d) => ({
...d,
[key]: { ...d[key], value: "", saving: false, error: null },
}));
// Let the parent refresh configuredKeys so the strike-through
// updates and canProceed recomputes.
onSecretSaved();
} catch (e) {
setDrafts((d) => ({
...d,
[key]: {
...d[key],
saving: false,
error: e instanceof Error ? e.message : "Save failed",
},
}));
} finally {
inFlightRef.current.delete(key);
}
},
[onSecretSaved],
);
if (!open) return null;
// Portal the dialog to document.body so it escapes any ancestor
// containing block. TemplatePalette renders this modal inside a
// sidebar whose `fixed` container plus backdrop-filter together
// re-anchor descendants' `position: fixed` to the sidebar's own
// bounds instead of the viewport — the modal ends up glued to the
// sidebar's scrollable region and only becomes visible after the
// user scrolls the sidebar. Portal dodges that class of issue
// once and for all, regardless of what future wrappers do.
//
// SSR-safe guard: `document` is undefined on the server. Since
// the modal is gated by `if (!open) return null` above, this
// effectively only runs after open flips true on the client.
if (typeof document === "undefined") return null;
return createPortal(
<div
role="dialog"
aria-modal="true"
aria-labelledby="org-preflight-title"
className="fixed inset-0 z-50 flex items-center justify-center bg-black/70"
onClick={onCancel}
>
<div
className="w-[560px] max-h-[80vh] overflow-auto rounded-xl bg-zinc-900 border border-zinc-700 shadow-2xl"
onClick={(e) => e.stopPropagation()}
>
<header className="px-5 py-4 border-b border-zinc-800">
<h2 id="org-preflight-title" className="text-sm font-semibold text-zinc-100">
Deploy {orgName}
</h2>
<p className="mt-0.5 text-[11px] text-zinc-500">
{workspaceCount} workspace{workspaceCount === 1 ? "" : "s"}.
Review the credentials needed before import.
</p>
</header>
<section className="p-5 space-y-5">
{requiredEnv.length > 0 && (
<EnvList
tone="required"
title="Required"
subtitle="Import is blocked until every key below is saved globally."
entries={requiredEnv}
configuredKeys={configuredKeys}
drafts={drafts}
onChange={(key, value) =>
setDrafts((d) => ({ ...d, [key]: { ...d[key], value } }))
}
onSave={saveOne}
/>
)}
{recommendedEnv.length > 0 && (
<EnvList
tone="recommended"
title="Recommended"
subtitle="Not required, but some features degrade without them. Add them now for the best experience."
entries={recommendedEnv}
configuredKeys={configuredKeys}
drafts={drafts}
onChange={(key, value) =>
setDrafts((d) => ({ ...d, [key]: { ...d[key], value } }))
}
onSave={saveOne}
/>
)}
{requiredEnv.length === 0 && recommendedEnv.length === 0 && (
<p className="text-[12px] text-zinc-400">
No additional credentials required for this template.
</p>
)}
</section>
<footer className="px-5 py-3 border-t border-zinc-800 flex items-center justify-between">
<button
type="button"
onClick={onCancel}
className="px-3 py-1.5 text-[11px] rounded bg-zinc-800 hover:bg-zinc-700 text-zinc-300"
>
Cancel
</button>
<div className="flex items-center gap-2">
{missingRecommended.length > 0 && canProceed && (
<span className="text-[10px] text-amber-400/90">
{missingRecommended.length} recommended key
{missingRecommended.length === 1 ? "" : "s"} still unset
</span>
)}
<button
type="button"
onClick={onProceed}
disabled={!canProceed}
className="px-4 py-1.5 text-[11px] font-semibold rounded bg-blue-600 hover:bg-blue-500 text-white disabled:bg-zinc-700 disabled:text-zinc-500 disabled:cursor-not-allowed"
>
Import
</button>
</div>
</footer>
</div>
</div>,
document.body,
);
}
interface EnvListProps {
tone: "required" | "recommended";
title: string;
subtitle: string;
entries: EnvRequirement[];
configuredKeys: Set<string>;
drafts: Record<string, DraftEntry>;
onChange: (key: string, value: string) => void;
onSave: (key: string) => void;
}
function EnvList({
tone,
title,
subtitle,
entries,
configuredKeys,
drafts,
onChange,
onSave,
}: EnvListProps) {
const accent =
tone === "required"
? "border-red-800/60 bg-red-950/20"
: "border-amber-800/50 bg-amber-950/15";
const headerColor =
tone === "required" ? "text-red-300" : "text-amber-300";
return (
<div className={`rounded-lg border ${accent} p-3`}>
<h3 className={`text-[11px] font-semibold uppercase tracking-wide ${headerColor}`}>
{title}
</h3>
<p className="mt-0.5 mb-2 text-[10px] text-zinc-400">{subtitle}</p>
<ul className="space-y-2">
{entries.map((entry) =>
typeof entry === "string" ? (
<StrictEnvRow
key={envReqKey(entry)}
envKey={entry}
configured={configuredKeys.has(entry)}
draft={drafts[entry]}
onChange={onChange}
onSave={onSave}
/>
) : (
<AnyOfEnvGroup
key={envReqKey(entry)}
members={entry.any_of}
configuredKeys={configuredKeys}
drafts={drafts}
onChange={onChange}
onSave={onSave}
/>
),
)}
</ul>
</div>
);
}
interface StrictEnvRowProps {
envKey: string;
configured: boolean;
draft: DraftEntry | undefined;
onChange: (key: string, value: string) => void;
onSave: (key: string) => void;
}
function StrictEnvRow({
envKey,
configured,
draft: d,
onChange,
onSave,
}: StrictEnvRowProps) {
return (
<li className="flex items-center gap-2 rounded bg-zinc-900/70 border border-zinc-800 px-2 py-1.5">
<code
className={`text-[11px] font-mono flex-1 ${
configured ? "text-zinc-500 line-through" : "text-zinc-200"
}`}
>
{envKey}
</code>
{configured ? (
<span className="text-[10px] text-emerald-400">✓ set</span>
) : (
<>
<input
type="password"
aria-label={`Value for ${envKey}`}
placeholder="paste value"
value={d?.value ?? ""}
onChange={(e) => onChange(envKey, e.target.value)}
onKeyDown={(e) => {
if (e.key === "Enter") {
e.preventDefault();
onSave(envKey);
}
}}
disabled={d?.saving}
className="flex-1 px-2 py-1 rounded bg-zinc-800 border border-zinc-700 text-[11px] text-zinc-200 focus:outline-none focus:border-blue-500 disabled:opacity-50"
/>
<button
type="button"
onClick={() => onSave(envKey)}
disabled={d?.saving || !d?.value.trim()}
className="px-2 py-1 text-[10px] rounded bg-blue-600 hover:bg-blue-500 text-white disabled:opacity-40 disabled:cursor-not-allowed"
>
{d?.saving ? "…" : "Save"}
</button>
</>
)}
{d?.error && (
<span className="text-[9px] text-red-400 basis-full pl-1">
{d.error}
</span>
)}
</li>
);
}
interface AnyOfEnvGroupProps {
members: string[];
configuredKeys: Set<string>;
drafts: Record<string, DraftEntry>;
onChange: (key: string, value: string) => void;
onSave: (key: string) => void;
}
/**
* Renders an OR group: the user only needs to configure ONE of the
* members to satisfy the requirement. Once any member is configured
* the group shows a green banner identifying the satisfying key; the
* other inputs remain visible but muted so the user can still switch
* providers if they want (uncommon but cheap to support).
*/
function AnyOfEnvGroup({
members,
configuredKeys,
drafts,
onChange,
onSave,
}: AnyOfEnvGroupProps) {
const satisfiedBy = members.find((m) => configuredKeys.has(m));
return (
<li className="rounded border border-zinc-800 bg-zinc-900/50 px-2.5 py-2">
<div className="flex items-center justify-between mb-1.5">
<span className="text-[10px] uppercase tracking-wide text-zinc-400">
Configure any one
</span>
{satisfiedBy && (
<span className="text-[10px] text-emerald-400">
✓ using <code className="font-mono">{satisfiedBy}</code>
</span>
)}
</div>
<ul className="space-y-1.5">
{members.map((m) => {
const isConfigured = configuredKeys.has(m);
const d = drafts[m];
const dimmed = !!satisfiedBy && !isConfigured;
return (
<li
key={m}
className={`flex items-center gap-2 rounded bg-zinc-900/70 border border-zinc-800 px-2 py-1 ${
dimmed ? "opacity-50" : ""
}`}
>
<code
className={`text-[11px] font-mono flex-1 ${
isConfigured ? "text-zinc-500 line-through" : "text-zinc-200"
}`}
>
{m}
</code>
{isConfigured ? (
<span className="text-[10px] text-emerald-400">✓ set</span>
) : (
<>
<input
type="password"
aria-label={`Value for ${m}`}
placeholder="paste value"
value={d?.value ?? ""}
onChange={(e) => onChange(m, e.target.value)}
onKeyDown={(e) => {
if (e.key === "Enter") {
e.preventDefault();
onSave(m);
}
}}
disabled={d?.saving}
className="flex-1 px-2 py-1 rounded bg-zinc-800 border border-zinc-700 text-[11px] text-zinc-200 focus:outline-none focus:border-blue-500 disabled:opacity-50"
/>
<button
type="button"
onClick={() => onSave(m)}
disabled={d?.saving || !d?.value.trim()}
className="px-2 py-1 text-[10px] rounded bg-blue-600 hover:bg-blue-500 text-white disabled:opacity-40 disabled:cursor-not-allowed"
>
{d?.saving ? "…" : "Save"}
</button>
</>
)}
{d?.error && (
<span className="text-[9px] text-red-400 basis-full pl-1">
{d.error}
</span>
)}
</li>
);
})}
</ul>
</li>
);
}

View File

@ -2,6 +2,7 @@
import { useState, useEffect, useCallback, useRef, useMemo } from "react";
import { useCanvasStore, type WorkspaceNodeData } from "@/store/canvas";
import { pruneStaleKeys } from "./canvas/useCanvasViewport";
import { api } from "@/lib/api";
import { showToast } from "./Toaster";
import { ConsoleModal } from "./ConsoleModal";
@ -65,6 +66,12 @@ export function ProvisioningTimeout({
// banner even if they stay in provisioning. Cleared when the
// workspace leaves provisioning (status changes).
const [dismissed, setDismissed] = useState<Set<string>>(new Set());
// Watch the live WS health. While it's not "connected", local node
// status reflects the last event we received before the drop —
// workspaces may have actually transitioned to online minutes ago.
// Suppress the banner until WS recovers + rehydrate confirms each
// workspace is genuinely still provisioning.
const wsStatus = useCanvasStore((s) => s.wsStatus);
// Subscribe to provisioning nodes — use shallow compare to avoid infinite re-render
// (filter+map creates new array reference on every store update).
@ -119,11 +126,7 @@ export function ProvisioningTimeout({
// Remove tracking for nodes that are no longer provisioning
const activeIds = new Set(parsedProvisioningNodes.map((n) => n.id));
for (const id of tracking.keys()) {
if (!activeIds.has(id)) {
tracking.delete(id);
}
}
pruneStaleKeys(tracking, activeIds);
// Also remove from timedOut list if no longer provisioning, and
// clear `dismissed` entries for workspaces that finished so a
@ -273,8 +276,11 @@ export function ProvisioningTimeout({
}, []);
const visibleTimedOut = useMemo(
() => timedOut.filter((e) => !dismissed.has(e.workspaceId)),
[timedOut, dismissed],
() =>
wsStatus === "connected"
? timedOut.filter((e) => !dismissed.has(e.workspaceId))
: [],
[timedOut, dismissed, wsStatus],
);
if (visibleTimedOut.length === 0) return null;

View File

@ -29,7 +29,7 @@ const TABS: { id: PanelTab; label: string; icon: string }[] = [
{ id: "chat", label: "Chat", icon: "◈" },
{ id: "activity", label: "Activity", icon: "⊙" },
{ id: "details", label: "Details", icon: "◉" },
{ id: "skills", label: "Skills", icon: "✦" },
{ id: "skills", label: "Plugins", icon: "✦" },
{ id: "terminal", label: "Terminal", icon: "▸" },
{ id: "config", label: "Config", icon: "⚙" },
{ id: "schedule", label: "Schedule", icon: "⏲" },
@ -280,7 +280,7 @@ export function SidePanel() {
className="flex-1 overflow-y-auto focus:outline-none"
>
{panelTab === "details" && <DetailsTab key={selectedNodeId} workspaceId={selectedNodeId} data={node.data} />}
{panelTab === "skills" && <SkillsTab key={selectedNodeId} data={node.data} />}
{panelTab === "skills" && <SkillsTab key={selectedNodeId} workspaceId={selectedNodeId} data={node.data} />}
{panelTab === "activity" && <ActivityTab key={selectedNodeId} workspaceId={selectedNodeId} />}
{panelTab === "chat" && <ChatTab key={selectedNodeId} workspaceId={selectedNodeId} data={node.data} />}
{panelTab === "terminal" && <TerminalTab key={selectedNodeId} workspaceId={selectedNodeId} />}

View File

@ -1,35 +1,48 @@
"use client";
import { useState, useEffect, useCallback, useRef } from "react";
import { flushSync } from "react-dom";
import { api } from "@/lib/api";
import { useCanvasStore } from "@/store/canvas";
import type { WorkspaceData } from "@/store/socket";
import { checkDeploySecrets, type PreflightResult, type ModelSpec } from "@/lib/deploy-preflight";
import { MissingKeysModal } from "./MissingKeysModal";
import { type Template } from "@/lib/deploy-preflight";
import { useTemplateDeploy } from "@/hooks/useTemplateDeploy";
import {
OrgImportPreflightModal,
type EnvRequirement,
} from "./OrgImportPreflightModal";
import { ConfirmDialog } from "./ConfirmDialog";
import { Spinner } from "./Spinner";
import { showToast } from "./Toaster";
import { TIER_CONFIG } from "@/lib/design-tokens";
import { listSecrets } from "@/lib/api/secrets";
interface Template {
id: string;
name: string;
description: string;
tier: number;
runtime?: string;
model: string;
models?: ModelSpec[];
/** AND-required env vars declared at runtime_config.required_env. */
required_env?: string[];
skills: string[];
skill_count: number;
}
// The `Template` type and `resolveRuntime` helper now live in
// `@/lib/deploy-preflight` so EmptyState can import the same ones.
// They were previously redeclared here, plus a narrower redeclaration
// in EmptyState; the narrower one dropped `runtime`, `models`, and
// `required_env`, which is exactly the data the preflight needs. See
// reviewer's "runtime fallback drift" note — single source of truth
// closes the drift.
export interface OrgTemplate {
dir: string;
name: string;
description: string;
workspaces: number;
/** Env vars that MUST be set as global secrets before the org can
* import. Server refuses the import with 412 if any are missing;
* the canvas preflights against /secrets/list to avoid the round
* trip. Aggregated from org-level + every workspace in the tree.
*
* Each entry is either a key name (strict) or an `{any_of: [...]}`
* group (any one of the listed members satisfies the requirement,
* e.g. `ANTHROPIC_API_KEY` OR `CLAUDE_CODE_OAUTH_TOKEN`). */
required_env?: EnvRequirement[];
/** "Nice-to-have" tier. Import proceeds without them but features
* may degrade a channel's webhook posts get dropped, a fallback
* LLM isn't available, etc. Surfaced to the user as a non-blocking
* warning with an "add now" affordance. Same union shape as
* `required_env`. */
recommended_env?: EnvRequirement[];
}
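/* Illustrative sketch (helper names invented here, not part of this
 * diff): how a single EnvRequirement is checked against the set of
 * configured global-secret keys; the preflight modal's canProceed
 * gate applies this per `required_env` entry.
 *
 *   const isSatisfied = (req: EnvRequirement, configured: Set<string>) =>
 *     typeof req === "string"
 *       ? configured.has(req)
 *       : req.any_of.some((k) => configured.has(k));
 *
 *   const canProceed = requiredEnv.every((r) => isSatisfied(r, configuredKeys));
 */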
/** Fetch the list of org templates from the platform. Returns [] on error
@ -91,6 +104,14 @@ export function OrgTemplatesSection() {
const [loading, setLoading] = useState(false);
const [importing, setImporting] = useState<string | null>(null);
const [error, setError] = useState<string | null>(null);
// Preflight modal state. `preflight` is non-null when the user
// clicked Import on an org with declared required/recommended envs
// and we're waiting for them to confirm; null otherwise (direct
// import path for orgs with zero env requirements).
const [preflight, setPreflight] = useState<{
org: OrgTemplate;
configuredKeys: Set<string>;
} | null>(null);
// Collapsed by default — org templates are multi-workspace imports
// that most new users don't reach for first. Keeping them
// expand-on-demand frees ~400 px of vertical space for the
@ -109,21 +130,55 @@ export function OrgTemplatesSection() {
loadOrgs();
}, [loadOrgs]);
const handleImport = async (org: OrgTemplate) => {
/** Fetch the set of global secret KEYS that are already configured.
* Used to strike through already-set entries in the preflight modal
* and to decide whether the import needs the modal at all. */
const loadConfiguredKeys = useCallback(async (): Promise<Set<string>> => {
try {
const secrets = await listSecrets("global");
return new Set(secrets.map((s) => s.name));
} catch {
// Secrets endpoint unreachable → assume nothing configured.
// The server will refuse the import with 412 and the user
// retries; safer than letting the import fly blind.
return new Set();
}
}, []);
/** Actually run the import. Split out so both the "no preflight
* needed" fast path and the "preflight modal approved" path can
* share the fetch + hydrate + toast sequence. */
const doImport = useCallback(async (org: OrgTemplate) => {
setImporting(org.dir);
setError(null);
try {
await importOrgTemplate(org.dir);
// Refresh canvas inline — the WebSocket may be offline, in which case
// WORKSPACE_PROVISIONING broadcasts never arrive and the user sees
// no change from clicking "Import org". A direct fetch guarantees
// the new workspaces land on canvas regardless of WS state.
try {
const workspaces = await api.get<WorkspaceData[]>("/workspaces");
useCanvasStore.getState().hydrate(workspaces);
} catch {
// Rehydrate failure is non-fatal; WS (if alive) or the next
// health-check cycle will eventually pick the new workspaces up.
// Hydrate is the safety net for the "WS is offline" case —
// without live events the canvas stays empty. But calling it
// immediately wipes the org-deploy animation (hydrate rebuilds
// the node array from scratch, dropping the spawn / shimmer
// classes and position tweens). So:
// 1. If the number of nodes on the canvas already matches
// (or exceeds) the template's workspace count, WS
// delivered everything — skip hydrate.
// 2. Otherwise, wait a short window to let any in-flight WS
// events land, then hydrate only if still behind.
const expectedCount = org.workspaces;
// Nodes transition through WORKSPACE_REMOVED which physically
// drops them from the store — there is no "removed" status in
// WorkspaceNodeData — so a simple length check is enough here.
const hasAll = () => useCanvasStore.getState().nodes.length >= expectedCount;
if (!hasAll()) {
await new Promise((r) => setTimeout(r, 1500));
}
if (!hasAll()) {
try {
const workspaces = await api.get<WorkspaceData[]>("/workspaces");
useCanvasStore.getState().hydrate(workspaces);
} catch {
// WS (if alive) or the next health-check cycle will
// eventually pick the new workspaces up.
}
}
showToast(`Imported "${org.name || org.dir}" (${org.workspaces} workspaces)`, "success");
} catch (e) {
@ -133,7 +188,45 @@ export function OrgTemplatesSection() {
} finally {
setImporting(null);
}
};
}, []);
/** Entry point for the Import button. Two paths:
*
* 1. No env declared by the template (required_env + recommended_env
* both empty) → fire doImport directly. Matches the pre-preflight
* behaviour for existing templates.
*
* 2. Any env declared → load the configured-keys set and open the
* preflight modal. doImport runs only when the user clicks
* Import inside the modal, which is gated to "required envs all
* configured" by the modal itself. */
const handleImport = useCallback(async (org: OrgTemplate) => {
const hasEnvDeclarations =
(org.required_env && org.required_env.length > 0) ||
(org.recommended_env && org.recommended_env.length > 0);
if (!hasEnvDeclarations) {
void doImport(org);
return;
}
// Flip the button to its "Importing…" state while the secrets
// lookup runs — on a tenant with 500+ global secrets the round
// trip can be > 200 ms and the user otherwise gets zero visual
// feedback after clicking. Cleared on modal close / error.
setImporting(org.dir);
try {
const configuredKeys = await loadConfiguredKeys();
setPreflight({ org, configuredKeys });
} finally {
setImporting(null);
}
}, [doImport, loadConfiguredKeys]);
/** Called by the preflight modal after a successful key save so the
* strike-through re-renders and canProceed recomputes. */
const refreshConfiguredKeys = useCallback(async () => {
const keys = await loadConfiguredKeys();
setPreflight((prev) => (prev ? { ...prev, configuredKeys: keys } : prev));
}, [loadConfiguredKeys]);
return (
<div className="space-y-2" data-testid="org-templates-section">
@ -222,6 +315,35 @@ export function OrgTemplatesSection() {
})}
</div>
)}
{preflight && (
<OrgImportPreflightModal
open
orgName={preflight.org.name || preflight.org.dir}
workspaceCount={preflight.org.workspaces}
requiredEnv={preflight.org.required_env ?? []}
recommendedEnv={preflight.org.recommended_env ?? []}
configuredKeys={preflight.configuredKeys}
onSecretSaved={refreshConfiguredKeys}
onProceed={() => {
const org = preflight.org;
// flushSync guarantees the modal unmounts BEFORE we kick
// off the import network call. Without it, React batches
// setPreflight(null) with the setImporting(...) from
// doImport's synchronous prefix, both commit at the end
// of this handler, AND the await import() POST may yield
// a microtask before React schedules the paint. Net
// effect: the modal backdrop sat over the canvas during
// the first wave of WORKSPACE_PROVISIONING WS events,
// hiding the spawn animation. Force the close to land
// first so the user sees the canvas reveal + agents
// popping into place.
flushSync(() => setPreflight(null));
void doImport(org);
}}
onCancel={() => setPreflight(null)}
/>
)}
</div>
);
}
@ -319,14 +441,6 @@ export function TemplatePalette() {
const [templates, setTemplates] = useState<Template[]>([]);
const [loading, setLoading] = useState(false);
const [creating, setCreating] = useState<string | null>(null);
const [error, setError] = useState<string | null>(null);
// Missing keys modal state
const [missingKeysInfo, setMissingKeysInfo] = useState<{
template: Template;
preflight: PreflightResult;
} | null>(null);
const loadTemplates = useCallback(async () => {
setLoading(true);
@ -344,65 +458,15 @@ export function TemplatePalette() {
if (open) loadTemplates();
}, [open, loadTemplates]);
/** Resolve runtime from template ID (e.g., "langgraph", "claude-code-default" → "claude-code") */
const resolveRuntime = (templateId: string): string => {
const runtimeMap: Record<string, string> = {
langgraph: "langgraph",
"claude-code-default": "claude-code",
openclaw: "openclaw",
deepagents: "deepagents",
crewai: "crewai",
autogen: "autogen",
};
return runtimeMap[templateId] ?? templateId.replace(/-default$/, "");
};
/** Actually execute the deploy API call */
const executeDeploy = useCallback(async (template: Template) => {
setCreating(template.id);
setError(null);
try {
await api.post("/workspaces", {
name: template.name,
template: template.id,
tier: template.tier,
canvas: {
x: Math.random() * 400 + 100,
y: Math.random() * 300 + 100,
},
});
setCreating(null);
} catch (e) {
setError(e instanceof Error ? e.message : "Failed to deploy");
setCreating(null);
}
}, []);
/** Pre-deploy check: validate secrets before deploying */
const handleDeploy = async (template: Template) => {
setCreating(template.id);
setError(null);
// Prefer the runtime the Go /templates endpoint returned verbatim —
// resolveRuntime() is a legacy id→runtime fallback for installs whose
// template summary predates the `runtime` field.
const runtime = template.runtime ?? resolveRuntime(template.id);
const preflight = await checkDeploySecrets({
runtime,
models: template.models,
required_env: template.required_env,
});
if (!preflight.ok) {
// Missing keys — show the modal instead of deploying
setMissingKeysInfo({ template, preflight });
setCreating(null);
return;
}
// All keys present — deploy directly
await executeDeploy(template);
};
// Preflight + POST + modal wiring moved into useTemplateDeploy so
// this component and EmptyState use one implementation. The sidebar
// uses the hook's default random canvas placement (no override) —
// an already-populated canvas shouldn't have new deploys stacking on
// a single fixed point. No post-deploy side effect either: the
// palette is operator-triggered, so auto-selecting would yank
// focus off whatever the user was already looking at.
const { deploy: handleDeploy, deploying: creating, error, modal } =
useTemplateDeploy();
return (
<>
@ -426,21 +490,9 @@ export function TemplatePalette() {
</svg>
</button>
{/* Missing Keys Modal */}
<MissingKeysModal
open={!!missingKeysInfo}
missingKeys={missingKeysInfo?.preflight.missingKeys ?? []}
providers={missingKeysInfo?.preflight.providers ?? []}
runtime={missingKeysInfo?.preflight.runtime ?? ""}
onKeysAdded={() => {
if (missingKeysInfo) {
const template = missingKeysInfo.template;
setMissingKeysInfo(null);
executeDeploy(template);
}
}}
onCancel={() => setMissingKeysInfo(null)}
/>
{/* Missing-keys modal rendered by the shared hook. Same
instance shape used by EmptyState. */}
{modal}
{/* Sidebar */}
{open && (
@ -483,7 +535,7 @@ export function TemplatePalette() {
<button
type="button"
key={t.id}
onClick={() => handleDeploy(t)}
onClick={() => void handleDeploy(t)}
disabled={isDeploying}
className="w-full text-left bg-zinc-800/40 hover:bg-zinc-800/70 border border-zinc-700/40 hover:border-zinc-600/50 rounded-xl p-3 transition-all disabled:opacity-50 disabled:cursor-not-allowed disabled:hover:bg-zinc-800/40 disabled:hover:border-zinc-700/40 group focus:outline-none focus-visible:ring-2 focus-visible:ring-blue-500/70"
>

View File

@ -6,6 +6,8 @@ import { useCanvasStore, type WorkspaceNodeData } from "@/store/canvas";
import { showToast } from "@/components/Toaster";
import { Tooltip } from "@/components/Tooltip";
import { STATUS_CONFIG, TIER_CONFIG } from "@/lib/design-tokens";
import { useOrgDeployState } from "@/components/canvas/useOrgDeployState";
import { OrgCancelButton } from "@/components/canvas/OrgCancelButton";
/** Descendant count for the "N sub" badge; children are first-class nodes
* rendered as full cards inside this one via React Flow's native parentId,
@ -35,6 +37,10 @@ function EjectIcon(props: React.SVGProps<SVGSVGElement>) {
export function WorkspaceNode({ id, data }: NodeProps<Node<WorkspaceNodeData>>) {
const statusCfg = STATUS_CONFIG[data.status] || STATUS_CONFIG.offline;
const tierCfg = TIER_CONFIG[data.tier] || { label: `T${data.tier}`, color: "text-zinc-500 bg-zinc-800" };
// Org-deploy context — four derived flags off one store subscription.
// Drives the shimmer while provisioning, the dimmed/non-draggable
// treatment on locked descendants, and the Cancel pill on the root.
const deploy = useOrgDeployState(id);
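// (useOrgDeployState itself isn't in this hunk; judging from the call
// sites below it returns roughly { isActivelyProvisioning,
// isLockedChild, isDeployingRoot, descendantProvisioningCount }.
// Assumed shape, noted here for orientation only.)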
const selectedNodeId = useCanvasStore((s) => s.selectedNodeId);
const selectNode = useCanvasStore((s) => s.selectNode);
const openContextMenu = useCanvasStore((s) => s.openContextMenu);
@ -138,8 +144,21 @@ export function WorkspaceNode({ id, data }: NodeProps<Node<WorkspaceNodeData>>)
}
backdrop-blur-sm
focus:outline-none focus-visible:ring-2 focus-visible:ring-blue-500/70 focus-visible:ring-offset-1 focus-visible:ring-offset-zinc-950
${deploy.isActivelyProvisioning ? "mol-deploy-shimmer" : ""}
${deploy.isLockedChild ? "mol-deploy-locked" : ""}
`}
>
{/* Cancel-deployment pill rendered on the root of a deploying
org only. Positioned absolute inside the card so it moves
with drag; class="nodrag" on the button stops React Flow
from treating clicks as a drag start. */}
{deploy.isDeployingRoot && (
<OrgCancelButton
rootId={id}
rootName={data.name}
workspaceCount={deploy.descendantProvisioningCount}
/>
)}
{/* Status gradient bar at top */}
<div className={`absolute inset-x-0 top-0 h-8 bg-gradient-to-b ${statusCfg.bar} pointer-events-none`} />

View File

@ -175,9 +175,28 @@ describe("buildA2AEdges — edge properties", () => {
expect((edge.style as React.CSSProperties).pointerEvents).toBe("none");
});
it("sets pointerEvents: 'none' on labelStyle", () => {
it("tags the edge as type=a2a so React Flow renders the custom A2AEdge component", () => {
// The custom edge portals labels above the node layer and makes
// them clickable. Without type=a2a, RF falls back to the default
// edge whose label sits in the SVG group (hidden under nodes,
// pointerEvents:none). Regression guard for the hidden-label /
// unclickable-label bug observed 2026-04-25.
const [edge] = buildA2AEdges([makeRow()], NOW);
expect((edge.labelStyle as React.CSSProperties).pointerEvents).toBe("none");
expect(edge.type).toBe("a2a");
});
it("populates edge.data with the fields the custom edge component reads", () => {
// A2AEdge reads count, lastAt, isHot, label from edge.data so the
// shape upstream must keep emitting them. A future buildA2AEdges
// refactor that drops any of these silently breaks the rendered
// pill (label disappears, hot/warm color swap fails, click handler
// can still fire but the label text vanishes).
const [edge] = buildA2AEdges([makeRow()], NOW);
const data = edge.data as Record<string, unknown>;
expect(data.count).toBe(1);
expect(typeof data.lastAt).toBe("number");
expect(typeof data.isHot).toBe("boolean");
expect(data.label).toMatch(/^1 call ·/);
});
it("label uses singular 'call' for count === 1", () => {

View File

@ -72,6 +72,7 @@ const mockStoreState = {
selectedNodeIds: new Set<string>(),
clearSelection: vi.fn(),
toggleNodeSelection: vi.fn(),
deletingIds: new Set<string>(),
};
vi.mock("@/store/canvas", () => ({

View File

@ -16,7 +16,9 @@ afterEach(() => {
// ── Shared fitView spy — must be set up before vi.mock hoisting ──────────────
const mockFitView = vi.fn();
const mockFitBounds = vi.fn();
const mockGetIntersectingNodes = vi.fn(() => []);
const mockGetIntersectingNodes = vi.fn(
(): Array<{ id: string; position: { x: number; y: number } }> => [],
);
vi.mock("@xyflow/react", () => {
const ReactFlow = ({
@ -83,6 +85,12 @@ const mockStoreState = {
selectedNodeIds: new Set<string>(),
clearSelection: vi.fn(),
toggleNodeSelection: vi.fn(),
// Cascade-delete / deploy animation state (added in the multilevel-
// layout-UX bundle). Canvas.tsx reads deletingIds.size to decide
// whether to apply the "locked during delete" class on each node;
// an empty Set mirrors the idle canvas and doesn't interact with
// any pan/fit behaviour under test here.
deletingIds: new Set<string>(),
};
vi.mock("@/store/canvas", () => ({

View File

@ -0,0 +1,225 @@
// @vitest-environment jsdom
import { describe, it, expect, vi, beforeEach, afterEach } from "vitest";
import { render, screen, fireEvent, cleanup, waitFor } from "@testing-library/react";
// Regression tests for the OrgImportPreflightModal's save path and
// any-of group rendering. Guards two specific bugs caught in the
// UX A/B Lab rollout (2026-04-24):
//
// 1. saveOne early-returned because it tried to read a local
// `startValue` reassigned inside a functional setDrafts
// updater. React did not always evaluate the updater
// synchronously, so the gate read "" and bailed while
// `saving:true` committed at next render, wedging the
// button on "…" without ever calling createSecret.
//
// 2. Double-click / Enter-spam could race past the disabled-
// button UI gate, firing createSecret twice. The production
// endpoint is idempotent so no data hazard, but the extra
// PUT is wasteful and harder to reason about.
const createSecretMock = vi.fn().mockResolvedValue(undefined);
vi.mock("@/lib/api/secrets", () => ({
createSecret: (...args: unknown[]) => createSecretMock(...args),
}));
import { OrgImportPreflightModal } from "../OrgImportPreflightModal";
beforeEach(() => {
createSecretMock.mockClear();
createSecretMock.mockResolvedValue(undefined);
});
afterEach(() => {
cleanup();
});
describe("OrgImportPreflightModal — saveOne", () => {
it("calls createSecret exactly once when Save is clicked on an any-of member", async () => {
render(
<OrgImportPreflightModal
open
orgName="UX A/B Lab"
workspaceCount={7}
requiredEnv={[{ any_of: ["ANTHROPIC_API_KEY", "CLAUDE_CODE_OAUTH_TOKEN"] }]}
recommendedEnv={[]}
configuredKeys={new Set()}
onSecretSaved={() => {}}
onProceed={() => {}}
onCancel={() => {}}
/>,
);
// Both any-of members render their own input + Save.
const input = screen.getByLabelText(/Value for ANTHROPIC_API_KEY/i);
fireEvent.change(input, { target: { value: "test-secret-value" } });
// The Save button adjacent to the changed input.
const saveButtons = screen
.getAllByRole("button")
.filter((b) => b.textContent === "Save");
// Two saves on screen (one per any-of member). First is ANTHROPIC.
fireEvent.click(saveButtons[0]);
await waitFor(() => {
expect(createSecretMock).toHaveBeenCalledTimes(1);
});
expect(createSecretMock).toHaveBeenCalledWith(
"global",
"ANTHROPIC_API_KEY",
"test-secret-value",
);
});
it("synchronous double-click on Save fires createSecret exactly once", async () => {
// Pause the first save so we can fire a second click while the
// first is still mid-await. The two clicks happen in the SAME
// tick — fireEvent runs synchronously through React's event
// system — so any guard that depends on a committed setState
// (e.g. `disabled={drafts[key].saving}` or a closure read of
// `drafts[key].saving`) loses the race: the second click sees
// saving=false because React hasn't committed yet. The fix is
// a useRef-based gate that flips synchronously before any await.
let resolveCreate!: () => void;
createSecretMock.mockImplementationOnce(
() => new Promise<void>((resolve) => {
resolveCreate = resolve;
}),
);
render(
<OrgImportPreflightModal
open
orgName="UX A/B Lab"
workspaceCount={7}
requiredEnv={[{ any_of: ["ANTHROPIC_API_KEY", "CLAUDE_CODE_OAUTH_TOKEN"] }]}
recommendedEnv={[]}
configuredKeys={new Set()}
onSecretSaved={() => {}}
onProceed={() => {}}
onCancel={() => {}}
/>,
);
const input = screen.getByLabelText(/Value for ANTHROPIC_API_KEY/i);
fireEvent.change(input, { target: { value: "test-secret-value" } });
const saveButtons = screen
.getAllByRole("button")
.filter((b) => b.textContent === "Save");
// Call the element's native .click() twice in the same tick so
// both dispatches see the SAME committed render; this simulates a
// double-fire that happens before React reconciles between events.
// Without this, RTL flushes act() between fireEvent calls and the
// second click sees the post-commit state.
const saveBtn = saveButtons[0] as HTMLButtonElement;
saveBtn.click();
saveBtn.click();
// Give React a tick to process any queued state updates.
await waitFor(() => {
expect(createSecretMock).toHaveBeenCalledTimes(1);
});
resolveCreate();
await waitFor(() => {
// Post-save count must remain at exactly one.
expect(createSecretMock).toHaveBeenCalledTimes(1);
});
});
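// For reference, a minimal sketch of the useRef-based gate the two
// tests above exercise. Assumed shape: the real saveOne lives in
// OrgImportPreflightModal, and only its tail (the
// inFlightRef.current.delete in the finally) appears earlier in this
// diff.
//
//   const inFlightRef = useRef(new Set<string>());
//   const saveOne = useCallback(async (key: string) => {
//     if (inFlightRef.current.has(key)) return; // flips before any await
//     inFlightRef.current.add(key);
//     try {
//       await createSecret("global", key, draftValue); // draftValue: assumed read
//     } finally {
//       inFlightRef.current.delete(key);
//     }
//   }, [onSecretSaved]);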
it("does not call createSecret when value is empty", async () => {
render(
<OrgImportPreflightModal
open
orgName="UX A/B Lab"
workspaceCount={7}
requiredEnv={[{ any_of: ["ANTHROPIC_API_KEY", "CLAUDE_CODE_OAUTH_TOKEN"] }]}
recommendedEnv={[]}
configuredKeys={new Set()}
onSecretSaved={() => {}}
onProceed={() => {}}
onCancel={() => {}}
/>,
);
// Button is disabled when value is empty — clicking a disabled
// button still dispatches onClick in RTL (since fireEvent
// bypasses the disabled attribute), so this asserts the code-
// level gate catches it, not just the UI.
const saveButtons = screen
.getAllByRole("button")
.filter((b) => b.textContent === "Save");
fireEvent.click(saveButtons[0]);
// Small async wait to let any state updates settle.
await new Promise((r) => setTimeout(r, 50));
expect(createSecretMock).not.toHaveBeenCalled();
});
});
describe("OrgImportPreflightModal — any-of rendering", () => {
it("renders each any-of member as a separate input row", () => {
render(
<OrgImportPreflightModal
open
orgName="UX A/B Lab"
workspaceCount={7}
requiredEnv={[{ any_of: ["ANTHROPIC_API_KEY", "CLAUDE_CODE_OAUTH_TOKEN"] }]}
recommendedEnv={[]}
configuredKeys={new Set()}
onSecretSaved={() => {}}
onProceed={() => {}}
onCancel={() => {}}
/>,
);
expect(screen.getByText("Configure any one")).toBeTruthy();
expect(screen.getByLabelText(/Value for ANTHROPIC_API_KEY/i)).toBeTruthy();
expect(screen.getByLabelText(/Value for CLAUDE_CODE_OAUTH_TOKEN/i)).toBeTruthy();
});
it("shows satisfied indicator when any member is configured, and enables Import", () => {
render(
<OrgImportPreflightModal
open
orgName="UX A/B Lab"
workspaceCount={7}
requiredEnv={[{ any_of: ["ANTHROPIC_API_KEY", "CLAUDE_CODE_OAUTH_TOKEN"] }]}
recommendedEnv={[]}
configuredKeys={new Set(["CLAUDE_CODE_OAUTH_TOKEN"])}
onSecretSaved={() => {}}
onProceed={() => {}}
onCancel={() => {}}
/>,
);
// "✓ using CLAUDE_CODE_OAUTH_TOKEN" banner renders. Name appears
// twice (banner + member row) so use getAllByText.
expect(screen.getByText(/using/i)).toBeTruthy();
expect(screen.getAllByText("CLAUDE_CODE_OAUTH_TOKEN").length).toBeGreaterThanOrEqual(1);
const importBtn = screen.getByRole("button", { name: /^Import$/ });
expect(importBtn.hasAttribute("disabled")).toBe(false);
});
it("keeps Import disabled when no any-of member is configured", () => {
render(
<OrgImportPreflightModal
open
orgName="UX A/B Lab"
workspaceCount={7}
requiredEnv={[{ any_of: ["ANTHROPIC_API_KEY", "CLAUDE_CODE_OAUTH_TOKEN"] }]}
recommendedEnv={[]}
configuredKeys={new Set()}
onSecretSaved={() => {}}
onProceed={() => {}}
onCancel={() => {}}
/>,
);
const importBtn = screen.getByRole("button", { name: /^Import$/ });
expect(importBtn.hasAttribute("disabled")).toBe(true);
});
});

View File

@ -226,13 +226,18 @@ describe("ProvisioningTimeout", () => {
);
});
it("returns hermes override when runtime = hermes", () => {
it("hermes returns default — value moved server-side post-#2054 phase 3", () => {
// RUNTIME_PROFILES.hermes was removed when template-hermes
// started declaring provision_timeout_seconds in its
// config.yaml. The value now flows server-side via the
// workspace API → WorkspaceData.provision_timeout_ms →
// resolver overrides path. With no override supplied, the
// resolver falls through to the default — same as any other
// runtime without a canvas-side override.
expect(provisionTimeoutForRuntime("hermes")).toBe(
RUNTIME_PROFILES.hermes?.provisionTimeoutMs,
);
expect(provisionTimeoutForRuntime("hermes")).toBeGreaterThanOrEqual(
DEFAULT_RUNTIME_PROFILE.provisionTimeoutMs * 5,
DEFAULT_RUNTIME_PROFILE.provisionTimeoutMs,
);
expect(RUNTIME_PROFILES.hermes).toBeUndefined();
});
it("server-side workspace override wins over runtime profile", () => {
@ -309,7 +314,7 @@ describe("ProvisioningTimeout", () => {
expect(node?.data.provisionTimeoutMs).toBe(600_000);
});
it("absent provision_timeout_ms hydrates to null (falls through to runtime profile)", () => {
it("absent provision_timeout_ms hydrates to null (falls through to default post-cleanup)", () => {
useCanvasStore.getState().hydrate([
makeWS({ id: "ws-default", name: "Default", status: "provisioning", runtime: "hermes" }),
]);
@ -317,27 +322,32 @@ describe("ProvisioningTimeout", () => {
.getState()
.nodes.find((n) => n.id === "ws-default");
expect(node?.data.provisionTimeoutMs).toBeNull();
// And the resolver still returns hermes' profile value when
// no override is supplied — proves the fall-through stays intact.
// Post-#2054 phase 3: hermes no longer has a canvas-side
// RUNTIME_PROFILES entry. With no node override the resolver
// falls all the way through to DEFAULT_RUNTIME_PROFILE. In
// production the workspace-server-side template lookup
// populates node.provisionTimeoutMs to 720000 before this
// resolver runs (#2094); this test isolates the fall-through
// behavior when that population hasn't happened yet.
expect(
provisionTimeoutForRuntime("hermes", {
provisionTimeoutMs: node?.data.provisionTimeoutMs ?? undefined,
}),
).toBe(RUNTIME_PROFILES.hermes.provisionTimeoutMs);
).toBe(DEFAULT_RUNTIME_PROFILE.provisionTimeoutMs);
});
it("server override wins over runtime profile via the resolver path the component uses", () => {
// Mirrors ProvisioningTimeout.tsx:144 where node.provisionTimeoutMs
// is passed as overrides — verifies the resolver respects it
// even when the runtime has its own profile entry.
const override = 30_000;
it("server override wins over default via the resolver path the component uses", () => {
// Mirrors ProvisioningTimeout.tsx where node.provisionTimeoutMs
// is passed as overrides — verifies the resolver respects the
// override regardless of the runtime's profile state.
const override = 600_000;
expect(
provisionTimeoutForRuntime("hermes", {
provisionTimeoutMs: override,
}),
).toBe(override);
// Sanity — the runtime profile would have been much larger.
expect(RUNTIME_PROFILES.hermes.provisionTimeoutMs).toBeGreaterThan(
// Sanity — the override is the path that wins (default is much smaller).
expect(DEFAULT_RUNTIME_PROFILE.provisionTimeoutMs).toBeLessThan(
override,
);
});

View File

@ -0,0 +1,143 @@
// @vitest-environment jsdom
//
// Behavioral coverage for the install flow. Two regressions to pin
// down:
//
// 1. The install POST URL has to include the workspace id. A pre-fix
// bug routed it to /workspaces/undefined/plugins because the
// component read `data.id`, but `WorkspaceNodeData` has no `id`
// field — its `extends Record<string, unknown>` index signature
// hid the bad access from TS. The component now takes
// `workspaceId` as an explicit prop; this test asserts the URL.
//
// 2. The optimistic install update has to flip the registry row to
// "Installed" without waiting for the 15s reload timer (the
// PLUGIN_RELOAD_DELAY_MS gap). This test asserts the row's "Install"
// button is replaced by the green "Installed" tag synchronously
// after the POST resolves.
import { describe, it, expect, vi, beforeEach, afterEach } from "vitest";
import { render, screen, fireEvent, waitFor, cleanup } from "@testing-library/react";
const mockApiGet = vi.fn();
const mockApiPost = vi.fn();
vi.mock("@/lib/api", () => ({
api: {
get: (...args: unknown[]) => mockApiGet(...args),
post: (...args: unknown[]) => mockApiPost(...args),
put: vi.fn().mockResolvedValue({}),
del: vi.fn().mockResolvedValue({}),
patch: vi.fn().mockResolvedValue({}),
},
}));
vi.mock("@/store/canvas", () => ({
useCanvasStore: Object.assign(
vi.fn((selector: (s: Record<string, unknown>) => unknown) =>
selector({ setPanelTab: vi.fn() } as Record<string, unknown>),
),
{ getState: () => ({ setPanelTab: vi.fn() }) },
),
summarizeWorkspaceCapabilities: vi.fn(() => ({ skills: [], tools: [] })),
}));
vi.mock("../Toaster", () => ({ showToast: vi.fn() }));
import { SkillsTab } from "../tabs/SkillsTab";
function makeData() {
return {
name: "Test WS",
status: "online",
tier: 1,
agentCard: null,
activeTasks: 0,
collapsed: false,
role: "agent",
lastErrorRate: 0,
lastSampleError: "",
url: "http://localhost:9000",
parentId: null,
currentTask: "",
runtime: "langgraph",
needsRestart: false,
budgetLimit: null,
};
}
const REGISTRY = [
{
name: "browser-automation",
version: "1.1.0",
description: "Browser automation + testing",
author: "molecule",
tags: ["browser", "playwright"],
skills: [],
runtimes: ["claude-code"],
},
];
beforeEach(() => {
// Order matches the component's loadInstalled / loadRegistry /
// loadSourceSchemes calls. The schemes endpoint resolves with a
// small scheme list so the Install-from-source input doesn't blow up.
mockApiGet.mockReset();
mockApiPost.mockReset();
mockApiGet.mockImplementation((path: string) => {
if (path.endsWith("/plugins") && path.startsWith("/workspaces/")) {
return Promise.resolve([]); // installed
}
if (path === "/plugins") {
return Promise.resolve(REGISTRY); // registry
}
if (path === "/plugins/sources") {
return Promise.resolve({ schemes: ["github://", "local://"] });
}
return Promise.resolve(null);
});
mockApiPost.mockResolvedValue({ status: "installed", plugin: "browser-automation" });
});
afterEach(() => {
cleanup();
vi.clearAllMocks();
});
// Returns the registry row's Install button. The custom-source input
// also renders an "Install" button, so `findByRole({name: /install/})`
// throws on multiple matches; scope by the row's plugin-name text.
async function findRowInstallButton() {
const nameNode = await screen.findByText("browser-automation");
const row = nameNode.closest("div.flex.items-center.justify-between") as HTMLElement;
if (!row) throw new Error("could not locate row container for browser-automation");
const buttons = row.querySelectorAll("button");
const install = Array.from(buttons).find((b) => b.textContent?.trim() === "Install");
if (!install) throw new Error("row has no Install button (already installed?)");
return install;
}
describe("SkillsTab install flow", () => {
it("POSTs to /workspaces/<workspaceId>/plugins (no `undefined` in URL)", async () => {
render(<SkillsTab workspaceId="ws-abc-123" data={makeData() as never} />);
fireEvent.click(await findRowInstallButton());
await waitFor(() => expect(mockApiPost).toHaveBeenCalled());
expect(mockApiPost).toHaveBeenCalledWith(
"/workspaces/ws-abc-123/plugins",
{ source: "local://browser-automation" },
);
});
it("flips the registry row to 'Installed' synchronously after POST resolves (no 15s wait)", async () => {
render(<SkillsTab workspaceId="ws-abc-123" data={makeData() as never} />);
fireEvent.click(await findRowInstallButton());
// The "Installed" green tag must appear without advancing the
// reload timer — the optimistic update is the entire point of
// this fix. If this test ever regresses to needing fake timers
// + advanceTimersByTime, the optimistic path is broken.
const installedTag = await screen.findByText(/^Installed$/i);
expect(installedTag).toBeDefined();
});
});

View File

@ -123,7 +123,7 @@ describe("SkillsTab — aria-label on bare source input (WCAG 1.3.1)", () => {
});
it('install source input has aria-label="Install from source URL"', async () => {
render(<SkillsTab data={makeSkillsData() as never} />);
render(<SkillsTab workspaceId="ws-test-id" data={makeSkillsData() as never} />);
// The source input is inside the registry section (showRegistry=false initially).
// Click the "+ Install Plugin" button to reveal it.
@ -138,7 +138,7 @@ describe("SkillsTab — aria-label on bare source input (WCAG 1.3.1)", () => {
});
it("install source input is a text input (not hidden)", async () => {
render(<SkillsTab data={makeSkillsData() as never} />);
render(<SkillsTab workspaceId="ws-test-id" data={makeSkillsData() as never} />);
const installBtn = screen.getByRole("button", { name: /install plugin/i });
fireEvent.click(installBtn);

View File

@ -0,0 +1,133 @@
"use client";
import { memo } from "react";
import {
BaseEdge,
EdgeLabelRenderer,
getBezierPath,
type EdgeProps,
} from "@xyflow/react";
import { useCanvasStore } from "@/store/canvas";
/**
* Custom edge for the A2A topology overlay. Solves two problems with the
* default React Flow edge label rendering:
*
* 1. **Z-order.** The default `label` prop renders inside the edge's
* SVG group, which always sits below node DOM in React Flow. When
* a label happened to land underneath a workspace card, it was
* hidden. EdgeLabelRenderer mounts label content in a separate
* portal layer that we can pin above nodes via z-index.
*
* 2. **Clickability.** Default labels inherit `pointerEvents: none`
* from the SVG path so the user can drag through them. The
* portaled label is a regular HTML element with its own pointer
* events: we set `pointerEvents: all` only on the label pill so
* drags on the edge line still pass through to the canvas.
*
* On click: selects the source workspace and switches its side panel
* to Activity, where the user can inspect the underlying delegations.
*/
interface A2AEdgeData {
count: number;
lastAt: number;
isHot: boolean;
/** Pre-formatted "5 calls · 2m ago" built upstream by buildA2AEdges
* so the same string renders here and in any future tooltip layer. */
label: string;
}
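// Hypothetical upstream construction, for orientation only
// (buildA2AEdges is not in this hunk; its tests pin the
// "<n> call(s) · <ago>" shape):
//
//   label: `${count} call${count === 1 ? "" : "s"} · ${formatAgo(lastAt)}`
//
// where `formatAgo` stands in for whatever relative-time helper the
// builder actually uses.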
function A2AEdgeImpl({
id,
source,
sourceX,
sourceY,
targetX,
targetY,
sourcePosition,
targetPosition,
data,
style = {},
}: EdgeProps) {
const [edgePath, labelX, labelY] = getBezierPath({
sourceX,
sourceY,
sourcePosition,
targetX,
targetY,
targetPosition,
});
const selectNode = useCanvasStore((s) => s.selectNode);
const setPanelTab = useCanvasStore((s) => s.setPanelTab);
const edgeData = (data ?? {}) as Partial<A2AEdgeData>;
const labelText = edgeData.label ?? "";
const isHot = edgeData.isHot ?? false;
const count = edgeData.count ?? 0;
const handleClick = (e: React.MouseEvent) => {
e.stopPropagation();
// Select the source (the agent that initiated the delegations).
// The user's mental model when clicking the edge is "show me the
// calls FROM here" — that's the source's activity feed.
//
// Preserve the current tab when the user re-clicks the same edge
// (or another edge whose source is already selected). Yanking
// them back to Activity on every click would surprise them — they may
// have intentionally switched to Chat / Memory while looking at
// this peer. The first click that lands a *different* selection
// still routes them to Activity, which is the discovery affordance.
const alreadySelected =
useCanvasStore.getState().selectedNodeId === source;
selectNode(source);
if (!alreadySelected) {
setPanelTab("activity");
}
};
// The edge stroke color matches what buildA2AEdges sets on the SVG
// path style. Mirror it on the badge border so the visual identity
// (hot=violet vs warm=blue) carries to the clickable label.
const accent = isHot ? "border-violet-500/60" : "border-blue-500/60";
const accentText = isHot ? "text-violet-200" : "text-blue-200";
const ariaLabel = `${count} delegation${count === 1 ? "" : "s"} from ${
edgeData.label?.split(" · ")[1] ?? "recent"
}. Click to inspect.`;
return (
<>
<BaseEdge id={id} path={edgePath} style={style} markerEnd="url(#a2a-arrow)" />
{labelText && (
<EdgeLabelRenderer>
<div
// The label sits in a portal at the canvas root. position:
// absolute + the (labelX, labelY) translate places it at
// the edge midpoint. zIndex 5 wins against React Flow's
// node layer (default z=0) without fighting the controls
// strip (z=10).
style={{
position: "absolute",
transform: `translate(-50%, -50%) translate(${labelX}px, ${labelY}px)`,
pointerEvents: "all",
zIndex: 5,
}}
className="nodrag nopan"
>
<button
type="button"
onClick={handleClick}
aria-label={ariaLabel}
title="Open source workspace's activity feed"
className={`px-2 py-0.5 rounded-full bg-zinc-900/95 border ${accent} ${accentText} text-[10px] font-medium shadow-md shadow-black/40 backdrop-blur-sm hover:bg-zinc-800 hover:border-opacity-100 transition-colors cursor-pointer`}
>
{labelText}
</button>
</div>
</EdgeLabelRenderer>
)}
</>
);
}
export const A2AEdge = memo(A2AEdgeImpl);

View File

@ -0,0 +1,165 @@
"use client";
import { useState } from "react";
import { api } from "@/lib/api";
import { useCanvasStore } from "@/store/canvas";
import { showToast } from "@/components/Toaster";
interface Props {
/** Root workspace of the org being deployed. The cancel action
* cascades delete through workspace-server's existing recursive
* delete handler, so we only need the root id. */
rootId: string;
rootName: string;
/** Count rendered in the pill label; updated live as children
* come online (the useOrgDeployState hook recomputes on every
* status change). */
workspaceCount: number;
}
/**
* Cancel-deployment pill attached to the root of a deploying org.
* One click → confirm dialog → DELETE /workspaces/:rootId?confirm=true,
* which cascades through every descendant server-side.
*
* Rendered inside the root's WorkspaceNode card via an absolute-
* positioned overlay so it sits visually ON the card and moves with
* drag. `className="nodrag"` stops React Flow from interpreting
* clicks here as the start of a drag gesture.
*
* Deliberately uses only `.mol-deploy-cancel*` classes for styling;
* every color / easing comes from theme-tokens.css, so a future
* light-theme (or tenant-branded theme) inherits automatically.
*/
export function OrgCancelButton({ rootId, rootName, workspaceCount }: Props) {
const [confirming, setConfirming] = useState(false);
const [submitting, setSubmitting] = useState(false);
const handleCancel = async () => {
setSubmitting(true);
// Populate deletingIds with the subtree so every descendant
// (and the root) locks into the dim + non-draggable state for
// the duration of the network round-trip + server cascade —
// same treatment the regular delete gives. Otherwise the org
// looks interactive for the several seconds between click and
// the first WORKSPACE_REMOVED event.
const preState = useCanvasStore.getState();
const subtreeIds = new Set<string>();
const walkStack = [rootId];
while (walkStack.length) {
const nid = walkStack.pop()!;
subtreeIds.add(nid);
for (const n of preState.nodes) {
if (n.data.parentId === nid) walkStack.push(n.id);
}
}
preState.beginDelete(subtreeIds);
try {
await api.del<{ status: string }>(
`/workspaces/${rootId}?confirm=true`,
);
showToast(`Cancelled deployment of "${rootName}"`, "success");
// Optimistic local removal — workspace-server broadcasts
// WORKSPACE_REMOVED per node but the WS may lag; strip the
// subtree now so the user sees immediate feedback. Re-read
// the store AFTER the await: children may have landed (or
// already been removed by WS events) during the network
// round-trip. If the WS_REMOVED handler already dropped the
// root during the network call, bail out — the subtree walk
// would miss any now-orphaned descendants (handleCanvasEvent
// reparents children of a removed node upward, so they no
// longer share the original root's id as parentId).
const postDeleteState = useCanvasStore.getState();
if (!postDeleteState.nodes.some((n) => n.id === rootId)) {
return;
}
const subtree = new Set<string>();
const stack = [rootId];
while (stack.length) {
const id = stack.pop()!;
subtree.add(id);
for (const n of postDeleteState.nodes) {
if (n.data.parentId === id) stack.push(n.id);
}
}
useCanvasStore.setState({
nodes: postDeleteState.nodes.filter((n) => !subtree.has(n.id)),
edges: postDeleteState.edges.filter(
(e) => !subtree.has(e.source) && !subtree.has(e.target),
),
});
} catch (e) {
// Undo the lock so the user can try again / interact with the
// still-deploying subtree.
useCanvasStore.getState().endDelete(subtreeIds);
showToast(
e instanceof Error ? `Cancel failed: ${e.message}` : "Cancel failed",
"error",
);
} finally {
// endDelete runs unconditionally here, so the deletingIds set
// empties on both the success and early-return paths. On success
// the subtree nodes are already gone (optimistic local removal
// above, plus WORKSPACE_REMOVED WS events whose handler is a
// no-op on already-missing ids), so clearing their ids is
// harmless bookkeeping.
useCanvasStore.getState().endDelete(subtreeIds);
setSubmitting(false);
setConfirming(false);
}
};
if (confirming) {
return (
<div
className="nodrag absolute -top-10 right-0 z-20 flex items-center gap-1.5 rounded-lg bg-zinc-900/95 px-2 py-1 shadow-lg border border-red-800/60"
onClick={(e) => e.stopPropagation()}
>
<span className="text-[10px] text-zinc-300">
Delete {workspaceCount} workspace{workspaceCount === 1 ? "" : "s"}?
</span>
<button
type="button"
onClick={handleCancel}
disabled={submitting}
className="mol-deploy-cancel px-2 py-0.5 rounded text-[10px] font-semibold"
>
{submitting ? "Deleting…" : "Yes"}
</button>
<button
type="button"
onClick={() => setConfirming(false)}
disabled={submitting}
className="px-2 py-0.5 rounded bg-zinc-700/80 hover:bg-zinc-600 text-[10px] text-zinc-200"
>
No
</button>
</div>
);
}
return (
<button
type="button"
onClick={(e) => {
// Stop the click from bubbling to React Flow (selects the
// node) — the Cancel pill is a UI surface, not a node
// activation.
e.stopPropagation();
setConfirming(true);
}}
className="nodrag mol-deploy-cancel mol-deploy-cancel-pulse absolute -top-7 right-1 z-20 flex items-center gap-1 rounded-full px-2.5 py-0.5 text-[10px] font-semibold shadow-md"
aria-label={`Cancel deployment of ${rootName}`}
>
<svg width="10" height="10" viewBox="0 0 16 16" aria-hidden="true">
<path
d="M4 4l8 8M12 4l-8 8"
stroke="currentColor"
strokeWidth="2"
strokeLinecap="round"
/>
</svg>
<span>Cancel ({workspaceCount})</span>
</button>
);
}

View File

@ -0,0 +1,98 @@
import { describe, it, expect } from "vitest";
import { pruneStaleKeys, shouldFitGrowing } from "../useCanvasViewport";
// Tests cover the auto-fit gate in isolation. The hook itself is
// effects + refs + React Flow handles, awkward to exercise directly —
// extracting the pure decision into shouldFitGrowing(...) lets us
// pin down the regression-prone logic with unit tests instead.
describe("shouldFitGrowing", () => {
it("fits the very first time (no prior snapshot)", () => {
expect(shouldFitGrowing(["a"], undefined, null, 0)).toBe(true);
});
it("fits when the prior snapshot is empty", () => {
expect(shouldFitGrowing(["a", "b"], new Set(), null, 0)).toBe(true);
});
it("fits when a brand-new id has been added since the last fit", () => {
const prev = new Set(["root", "a", "b"]);
expect(shouldFitGrowing(["root", "a", "b", "c"], prev, null, 0)).toBe(true);
});
it("respects user pan when the subtree hasn't grown", () => {
const prev = new Set(["root", "a", "b"]);
// Status update on existing node — same membership.
expect(shouldFitGrowing(["root", "a", "b"], prev, 5_000, 1_000)).toBe(false);
});
it("fits when the subtree hasn't grown but the user never panned", () => {
const prev = new Set(["root", "a", "b"]);
expect(shouldFitGrowing(["root", "a", "b"], prev, null, 1_000)).toBe(true);
});
it("fits when the subtree hasn't grown and the user panned BEFORE the last fit", () => {
const prev = new Set(["root", "a", "b"]);
expect(shouldFitGrowing(["root", "a", "b"], prev, 500, 1_000)).toBe(true);
});
it("forces fit on delete-then-add even when the count is unchanged", () => {
// Subtree was [root, a, b, c, d]. Then `d` got removed and a
// sibling `e` arrived. Same length, different membership — a
// length-only check would skip the fit and leave `e` off-screen.
const prev = new Set(["root", "a", "b", "c", "d"]);
expect(
shouldFitGrowing(["root", "a", "b", "c", "e"], prev, 5_000, 1_000),
).toBe(true);
});
it("does NOT fit on shrink-only when the user has panned (deletion alone shouldn't override exploration)", () => {
const prev = new Set(["root", "a", "b", "c"]);
expect(shouldFitGrowing(["root", "a", "b"], prev, 5_000, 1_000)).toBe(false);
});
});
describe("pruneStaleKeys (#2070)", () => {
it("drops entries whose key is no longer in the live key set", () => {
const map = new Map<string, Set<string>>([
["root-1", new Set(["root-1", "a"])],
["root-2", new Set(["root-2", "b"])],
["root-3", new Set(["root-3", "c"])],
]);
pruneStaleKeys(map, new Set(["root-1", "root-3"]));
expect([...map.keys()].sort()).toEqual(["root-1", "root-3"]);
});
it("is a no-op when every key is still live", () => {
const map = new Map<string, Set<string>>([
["root-1", new Set(["root-1"])],
["root-2", new Set(["root-2"])],
]);
pruneStaleKeys(map, new Set(["root-1", "root-2"]));
expect(map.size).toBe(2);
});
it("clears the map when no live keys remain", () => {
const map = new Map<string, Set<string>>([
["root-1", new Set(["root-1"])],
]);
pruneStaleKeys(map, new Set());
expect(map.size).toBe(0);
});
it("does not add new entries — only deletes stale ones", () => {
const map = new Map<string, Set<string>>();
pruneStaleKeys(map, new Set(["root-1", "root-2"]));
expect(map.size).toBe(0);
});
it("preserves value identity for survivors (no rebuild)", () => {
const survivor = new Set(["root-1", "a"]);
const map = new Map<string, Set<string>>([
["root-1", survivor],
["root-2", new Set(["root-2", "b"])],
]);
pruneStaleKeys(map, new Set(["root-1"]));
expect(map.get("root-1")).toBe(survivor);
});
});

View File

@ -3,11 +3,58 @@
import { useCallback, useEffect, useRef } from "react";
import { useReactFlow } from "@xyflow/react";
import { useCanvasStore } from "@/store/canvas";
import { appendClass, removeClass } from "@/store/classNames";
import {
CHILD_DEFAULT_HEIGHT,
CHILD_DEFAULT_WIDTH,
} from "@/store/canvas-topology";
/**
* Decide whether the deploy-time auto-fit should run. Pure function so
* the gate logic is unit-testable in isolation; the surrounding
* useEffect tangle of refs, timers, and React Flow handles is awkward
* to exercise directly.
*
* Returns true when the auto-fit SHOULD fire:
* - the subtree contains an id that wasn't in the previous snapshot
* (a new node arrived; the user has lost context, so force the fit
* through regardless of any user-pan in between), OR
* - the user has not panned since the last successful fit (so the
* auto-fit isn't fighting their override).
*
* `prevSubtreeIds === undefined` means no fit has ever run for this
* root → treat every id as "new" and fit. `userPannedAt === null`
* means the user has never panned at all in this session → fit.
*/
export function shouldFitGrowing(
currentSubtreeIds: readonly string[],
prevSubtreeIds: ReadonlySet<string> | undefined,
userPannedAt: number | null,
lastAutoFitAt: number,
): boolean {
if (!prevSubtreeIds || prevSubtreeIds.size === 0) return true;
for (const id of currentSubtreeIds) {
if (!prevSubtreeIds.has(id)) return true;
}
if (userPannedAt === null) return true;
return userPannedAt <= lastAutoFitAt;
}
/**
* Drop entries from `map` whose key isn't in `liveKeys`. Generic so the
* same shape can be reused for any keyed-by-node-id cache (#2070).
*/
export function pruneStaleKeys<T>(
map: Map<string, T>,
liveKeys: ReadonlySet<string>,
): void {
for (const key of map.keys()) {
if (!liveKeys.has(key)) {
map.delete(key);
}
}
}
/**
* Wires the two canvas-wide CustomEvent listeners and the viewport
* save/restore bookkeeping so Canvas.tsx doesn't have to.
@ -25,17 +72,79 @@ export function useCanvasViewport() {
const saveViewport = useCanvasStore((s) => s.saveViewport);
const saveTimerRef = useRef<ReturnType<typeof setTimeout>>(undefined);
const panTimerRef = useRef<ReturnType<typeof setTimeout>>(undefined);
const autoFitTimerRef = useRef<ReturnType<typeof setTimeout>>(undefined);
// Two distinct fit timers — DO NOT collapse to one.
// - settleFitTimerRef: 1200ms one-shot run by the
// "transition from any-provisioning to none" effect (the deploy
// just finished — settle on the whole org once).
// - trackingFitTimerRef: 500ms debounced by the per-arrival
// molecule:fit-deploying-org event handler (track the org's
// bounds as children land during the deploy).
// They MUST NOT share a ref: the two effects fire interleaved
// (every WS event during a deploy resets the tracking timer; the
// settle timer arms the moment provisioning hits zero), and a
// shared ref made each effect silently clearTimeout the other's
// pending fit. Today's behavior happened to land in the right
// order out of luck; splitting the refs makes ordering independent
// of fire sequence.
const settleFitTimerRef = useRef<ReturnType<typeof setTimeout>>(undefined);
const trackingFitTimerRef = useRef<ReturnType<typeof setTimeout>>(undefined);
// Tracks whether any workspace was provisioning on the previous
// render so we can detect the boundary when the last one finishes
// and auto-fit the viewport around the whole tree.
const hadProvisioningRef = useRef(false);
// Respect-user-pan gate for the deploy-time auto-fit. Earlier
// revisions tried to detect user pans via `onMoveEnd`, but React
// Flow v12 fires that callback with a truthy event at the END of
// a programmatic fitView animation — so the first auto-fit we
// triggered would immediately look like a user pan and block
// every subsequent fit for the rest of the deploy, leaving the
// viewport stuck wherever the first fit landed. Now we stamp
// this ref ONLY on wheel events on the React Flow pane itself
// (see the effect below), which are an unambiguous user-gesture
// signal.
const userPannedAtRef = useRef<number | null>(null);
const lastAutoFitAtRef = useRef(0);
useEffect(() => {
return () => {
clearTimeout(saveTimerRef.current);
clearTimeout(panTimerRef.current);
clearTimeout(autoFitTimerRef.current);
clearTimeout(settleFitTimerRef.current);
clearTimeout(trackingFitTimerRef.current);
};
}, []);
// User-gesture listeners for the respect-user-pan gate. Listens on
// `document` with capture phase and filters to events whose target
// lies inside the React Flow pane — this avoids a mount-order race
// (`.react-flow__pane` may not exist when the hook first runs if
// RF is behind a Suspense boundary) AND keeps clicks on the
// toolbar / modals / side panel from stamping user-pan-intent.
// Capture phase runs before target-phase `stopPropagation` so a
// handler elsewhere can't swallow the signal.
//
// Wheel only — NOT pointerdown. A pointerdown on the pane fires for
// ordinary clicks (deselect, click-near-a-card, modal-close-bubble)
// as well as the start of a drag-pan. Treating every pointerdown as
// "user wants to override auto-fit" meant a single accidental click
// before/during an org import locked out every subsequent fit, so
// the viewport stuck at whatever the first fit landed on while
// children kept materialising off-screen. Wheel is the canonical
// unambiguous gesture: scroll-to-pan and pinch-zoom both surface as
// wheel events. Drag-pans without an accompanying wheel are rare
// enough that letting them be overridden by a follow-up auto-fit is
// the right tradeoff.
useEffect(() => {
if (typeof window === "undefined") return;
const stamp = (e: Event) => {
const target = e.target as HTMLElement | null;
if (!target?.closest?.(".react-flow__pane")) return;
userPannedAtRef.current = Date.now();
};
const opts: AddEventListenerOptions = { passive: true, capture: true };
document.addEventListener("wheel", stamp, opts);
return () => {
document.removeEventListener("wheel", stamp, opts);
};
}, []);
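// Note: the same capture: true must be passed to removeEventListener;
// listeners are keyed by (type, callback, capture), so removing with
// the default capture: false would silently leave the capture-phase
// listener attached across unmounts. Reusing the `opts` object above
// guarantees the flags match.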
@ -55,20 +164,64 @@ export function useCanvasViewport() {
hadProvisioningRef.current = hasProvisioning;
if (wasProvisioning && !hasProvisioning && nodeCount > 0) {
clearTimeout(autoFitTimerRef.current);
// Root-complete moment — every root that has children just
// finished deploying. Pop + glow once (mol-deploy-root-complete)
// then auto-fit the viewport around the whole org. Leaf-only
// roots (single workspaces with no children) are skipped so the
// effect reads as "your org landed" not "random card flickered".
const state = useCanvasStore.getState();
const rootsWithChildren = new Set<string>();
for (const n of state.nodes) {
if (n.data.parentId) continue;
if (state.nodes.some((c) => c.data.parentId === n.id)) {
rootsWithChildren.add(n.id);
}
}
if (rootsWithChildren.size > 0) {
useCanvasStore.setState({
nodes: state.nodes.map((n) =>
rootsWithChildren.has(n.id)
? { ...n, className: appendClass(n.className, "mol-deploy-root-complete") }
: n,
),
});
// Strip the one-shot class after the keyframe ends so a later
// deploy on the same node can fire it again.
window.setTimeout(() => {
const s = useCanvasStore.getState();
useCanvasStore.setState({
nodes: s.nodes.map((n) =>
rootsWithChildren.has(n.id)
? { ...n, className: removeClass(n.className, "mol-deploy-root-complete") }
: n,
),
});
}, 800);
}
clearTimeout(settleFitTimerRef.current);
// 1200ms settle delay: lets React Flow's DOM measurement pass
// resize newly-online parents before we compute bounds.
// Measuring too early gives us the pre-render skeleton bbox and
// fitView zooms to that smaller-than-real rectangle.
autoFitTimerRef.current = setTimeout(() => {
settleFitTimerRef.current = setTimeout(() => {
fitView({
// Deliberately SLOWER than the in-flight tracking fits
// (400ms). The asymmetry reads as "settling" on the
// finished org rather than "tracking" another arrival,
// which is the intended UX for the "deploy done" moment.
// Don't normalize these two durations to the same value.
duration: 1200,
padding: 0.25,
// Match the deploy-time fit padding (0.45) so end-state
// and in-flight state use the same framing — otherwise
// the final zoom-out "jumps" relative to the intermediate
// fits and looks like a mis-layout.
padding: 0.45,
// Cap zoom-in: a small tree (2-3 nodes) would otherwise end
// up at the 2x maxZoom, visually implying "something is
// wrong". 0.8 reads like "here's your whole org" even when
// the tree is small.
maxZoom: 0.8,
// wrong". 0.65 reads like "here's your whole org" even when
// the tree is small — matches deploy-time cap.
maxZoom: 0.65,
// Cap zoom-out: fitView would fall back to the component's
// minZoom=0.1 on a sparse/outlier layout, leaving the user
// staring at a postage-stamp canvas. 0.25 is the floor.
@ -92,6 +245,120 @@ export function useCanvasViewport() {
return () => window.removeEventListener("molecule:pan-to-node", handler);
}, [fitView]);
// Auto pan+zoom to the whole deploying org after each child
// arrival — DEBOUNCED. Firing fitView on every event with a
// 600ms animation meant rapid sibling arrivals (server paces 2s
// apart, HMR bursts can land faster) made the viewport lurch
// continuously, which the user read as "parent flashing around".
// We now wait until the arrivals GO QUIET for 500ms, then run
// exactly one fit. The rootId we captured on the most recent
// event drives the fit bounds. Respect-user-pan still short-
// circuits: if the user moved after our last auto-fit, we never
// fit again this deploy.
const pendingFitRootRef = useRef<string | null>(null);
// Membership snapshot of the subtree at the moment of the last
// successful auto-fit, keyed by root id. When a new event arrives,
// we compute growth as "any id in the current subtree that wasn't
// in the snapshot". An id-set rather than just a count handles the
// delete-then-add case correctly: subtree of 6 → delete one → 5 →
// a different child arrives → 6 again. A length-only comparison
// would call this "no growth" and skip the fit even though a
// brand-new node landed off-screen. The id-set sees the new id
// wasn't in the snapshot and forces the fit.
//
// Map is keyed by root id. Pruned in `runFit` against the live node
// set so deleted roots don't accumulate across long sessions of
// import-then-delete cycles (#2070). Bounded to "roots present right
// now" by that prune; cleanup runs only at user-driven cadence
// (deploys), so the rate naturally tracks growth.
const lastFitSubtreeIdsRef = useRef<Map<string, Set<string>>>(new Map());
useEffect(() => {
const runFit = () => {
const rootCandidate = pendingFitRootRef.current;
pendingFitRootRef.current = null;
if (!rootCandidate) return;
const state = useCanvasStore.getState();
pruneStaleKeys(
lastFitSubtreeIdsRef.current,
new Set(state.nodes.map((n) => n.id)),
);
// Climb to the true root — the event's rootId is the just-
// landed child's direct parent, which may itself be nested.
let topId = rootCandidate;
let cursor = state.nodes.find((n) => n.id === topId);
while (cursor?.data.parentId) {
const up = state.nodes.find((n) => n.id === cursor!.data.parentId);
if (!up) break;
cursor = up;
topId = up.id;
}
const subtree: string[] = [];
const stack = [topId];
while (stack.length) {
const id = stack.pop()!;
subtree.push(id);
for (const n of state.nodes) {
if (n.data.parentId === id) stack.push(n.id);
}
}
if (subtree.length === 0) return;
// Growth check: did any id in the current subtree NOT appear
// in the snapshot from the last fit? If yes, fit through
// regardless of the user-pan timestamp — the user has lost
// context, the new arrival is off-screen, and the deploy is
// the primary thing they want to watch. If no, fall back to
// the user-pan respect gate so post-deploy exploration isn't
// yanked back.
if (!shouldFitGrowing(
subtree,
lastFitSubtreeIdsRef.current.get(topId),
userPannedAtRef.current,
lastAutoFitAtRef.current,
)) {
return;
}
fitView({
nodes: subtree.map((id) => ({ id })),
// Short animation — server paces children ~2s apart, so a
// 400ms fit animation reads as "smoothly tracked" rather
// than "constantly lurching". Longer durations (the earlier
// 600ms) start to overlap if the user re-triggers deploys.
duration: 400,
// Generous padding so the right-hand Communications panel,
// bottom-left Legend, and bottom-right "New Workspace"
// button don't cover the outer cards. React Flow padding
// is a fraction of viewport dims, so 0.45 ≈ 430px of
// margin on a 960-wide canvas — enough clearance for the
// two side panels (~300px + ~280px).
padding: 0.45,
// Lower maxZoom so small orgs (2-3 cards) still zoom out
// enough to show the parent frame + children clearly with
// the padded margins. 0.65 reads as "here's the whole org"
// without getting dragged to the maxZoom by fitView's
// "fill the viewport" default.
maxZoom: 0.65,
minZoom: 0.25,
});
lastAutoFitAtRef.current = Date.now();
lastFitSubtreeIdsRef.current.set(topId, new Set(subtree));
};
const handler = (e: Event) => {
const { rootId } = (e as CustomEvent<{ rootId: string }>).detail;
// Keep the most recently-requested root. Back-to-back imports
// on two different orgs (rare — user would have to click
// Import twice within 500ms) "later wins" the viewport rather
// than ping-ponging between them. If this becomes a real
// pattern we'd flush the pending fit synchronously when
// `rootId` changes, rather than resetting the timer.
pendingFitRootRef.current = rootId;
clearTimeout(trackingFitTimerRef.current);
trackingFitTimerRef.current = setTimeout(runFit, 500);
};
window.addEventListener("molecule:fit-deploying-org", handler);
return () => window.removeEventListener("molecule:fit-deploying-org", handler);
}, [fitView]);
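// Producer sketch (hedged: the real dispatch site lives wherever the
// child-arrival WS events are handled; `parentId` is a stand-in).
// Anything that learns a child landed can nudge the debounced fit:
//
//   window.dispatchEvent(
//     new CustomEvent("molecule:fit-deploying-org", {
//       detail: { rootId: parentId },
//     }),
//   );
//
// Bursts are safe: each dispatch only resets the 500ms quiet timer,
// so N rapid arrivals still collapse into exactly one fitView call.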
// Zoom to a team: fit the parent + its direct children in view.
useEffect(() => {
const handler = (e: Event) => {
@ -129,6 +396,11 @@ export function useCanvasViewport() {
const onMoveEnd = useCallback(
(_event: unknown, vp: { x: number; y: number; zoom: number }) => {
// User-pan detection moved to the wheel listener
// above — onMoveEnd fires for programmatic fitView too, which
// made this callback an unreliable source for user-intent
// tracking. This now only handles the debounced viewport
// save so a reload lands the user back where they were.
clearTimeout(saveTimerRef.current);
saveTimerRef.current = setTimeout(() => {
saveViewport(vp.x, vp.y, vp.zoom);

View File

@ -113,6 +113,18 @@ export function useDragHandlers(): DragHandlers {
const onNodeDragStart: OnNodeDrag<WorkspaceNode> = useCallback(
(event, node) => {
// Belt-and-braces drag-lock: the primary mechanism is the
// `draggable: false` projection in Canvas.tsx — React Flow
// won't invoke this callback for locked nodes. But a future
// change to the projection that forgets a locked subtree
// would silently allow dragging, and locked drags mid-deploy
// corrupt the spawn animation. Fall through to a state-based
// check here so the invariant stays enforced in both places.
if (node.draggable === false) {
dragStartStateRef.current = null;
return;
}
dragModifiersRef.current = {
alt: event.altKey,
meta: event.metaKey || event.ctrlKey,

View File

@ -0,0 +1,152 @@
"use client";
import { useMemo } from "react";
import { useCanvasStore } from "@/store/canvas";
/**
* Org-deploy state for a single workspace node. Computed from the
* current canvas store snapshot; no per-org status field on the
* backend is required (a root "is deploying" iff any descendant in
* its subtree still reports status === "provisioning").
*
* Performance note: the first version of this hook walked the entire
* nodes array per node render: O(n²) for a 50-node org. The current
* implementation computes ONE map of derived state for the whole
* canvas per nodes-array change, then each call site looks up its
* own id. The map is built inside useMemo from a cheap projection
* (id + parentId + status tuples), so store mutations that leave the
* nodes array untouched (viewport saves, panel state) never re-run
* the walk; see useDeployMap below for the nodes-array caveat.
*/
export interface OrgDeployState {
isActivelyProvisioning: boolean;
isDeployingRoot: boolean;
isLockedChild: boolean;
descendantProvisioningCount: number;
}
const EMPTY: OrgDeployState = {
isActivelyProvisioning: false,
isDeployingRoot: false,
isLockedChild: false,
descendantProvisioningCount: 0,
};
/** Projection used to drive the deploy-state computation. Narrowed to
* these three fields so buildDeployMap stays independent of the rest
* of the node object (position, selection, size, etc.). */
interface NodeProjection {
id: string;
parentId: string | null;
status: string;
}
function buildDeployMap(
projections: NodeProjection[],
deletingIds: ReadonlySet<string>,
): Map<string, OrgDeployState> {
const byId = new Map<string, NodeProjection>();
const childrenBy = new Map<string, string[]>();
for (const p of projections) {
byId.set(p.id, p);
if (p.parentId) {
const arr = childrenBy.get(p.parentId) ?? [];
arr.push(p.id);
childrenBy.set(p.parentId, arr);
}
}
// Walk once from each node up to its root, memoising the root id.
// `rootOf.get(id)` short-circuits further walks on the same chain.
const rootOf = new Map<string, string>();
const findRoot = (id: string): string => {
const cached = rootOf.get(id);
if (cached) return cached;
let cursor: NodeProjection | undefined = byId.get(id);
let rootId = id;
while (cursor && cursor.parentId) {
const parent = byId.get(cursor.parentId);
if (!parent) break;
cursor = parent;
rootId = parent.id;
const alreadyKnown = rootOf.get(rootId);
if (alreadyKnown) {
rootId = alreadyKnown;
break;
}
}
rootOf.set(id, rootId);
return rootId;
};
// Count provisioning descendants per node. Also walk once per root
// using an iterative DFS so we don't stack-overflow on deep trees.
const countProvisioning = (rootId: string): number => {
let count = 0;
const stack = [rootId];
while (stack.length) {
const id = stack.pop()!;
const node = byId.get(id);
if (!node) continue;
if (node.status === "provisioning") count++;
const kids = childrenBy.get(id);
if (kids) stack.push(...kids);
}
return count;
};
// Per-root cache of subtree count so every descendant resolves in O(1).
const rootCount = new Map<string, number>();
const out = new Map<string, OrgDeployState>();
for (const p of projections) {
const rootId = findRoot(p.id);
let provCount = rootCount.get(rootId);
if (provCount === undefined) {
provCount = countProvisioning(rootId);
rootCount.set(rootId, provCount);
}
const rootIsDeploying = provCount > 0;
// A node being deleted gets the same visual + interaction lock
// as a deploying child. "The system owns this node right now,
// don't touch it" is the shared semantic — the user only cares
// that the card is dim and won't drag; they don't need to know
// whether it's coming up or going down.
const deleting = deletingIds.has(p.id);
out.set(p.id, {
isActivelyProvisioning: p.status === "provisioning",
isDeployingRoot: p.id === rootId && rootIsDeploying,
isLockedChild: deleting || (p.id !== rootId && rootIsDeploying),
descendantProvisioningCount:
p.id === rootId ? provCount : 0, // only roots display the count
});
}
return out;
}
/** Store-wide derived map. Recomputed whenever the `nodes` array
* reference changes, which is on every store mutation that touches
* nodes, including pure position tweens. The map build is O(n) so
* a 50-node canvas costs ~50μs per tween frame; that's cheap enough
* to not need a projection layer. (An earlier attempt to narrow the
* subscription via `useShallow((s) => s.nodes.map(...))` triggered
* React 18's "getSnapshot should be cached" loop because the
* projection creates fresh object references each call, so shallow
* equality always sees "changed", which re-renders, which re-runs
* the selector, ad infinitum.) */
function useDeployMap(): Map<string, OrgDeployState> {
const nodes = useCanvasStore((s) => s.nodes);
const deletingIds = useCanvasStore((s) => s.deletingIds);
return useMemo(() => {
const projections = nodes.map((n) => ({
id: n.id,
parentId: n.data.parentId,
status: n.data.status,
}));
return buildDeployMap(projections, deletingIds);
}, [nodes, deletingIds]);
}
export function useOrgDeployState(nodeId: string): OrgDeployState {
const map = useDeployMap();
return map.get(nodeId) ?? EMPTY;
}
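// Consumer sketch (hypothetical component; names are illustrative,
// not from this commit). Each card reads just its own derived slice:
//
//   function DeployBadge({ nodeId }: { nodeId: string }) {
//     const { isDeployingRoot, descendantProvisioningCount } =
//       useOrgDeployState(nodeId);
//     if (!isDeployingRoot) return null;
//     return <span>{descendantProvisioningCount} provisioning</span>;
//   }
//
// Locked children read `isLockedChild` the same way to dim the card
// and suppress drag affordances.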

View File

@ -5,6 +5,7 @@ import { api } from "@/lib/api";
import { ConversationTraceModal } from "@/components/ConversationTraceModal";
import { type ActivityEntry } from "@/types/activity";
import { useWorkspaceName } from "@/hooks/useWorkspaceName";
import { inferA2AErrorHint } from "./chat/a2aErrorHint";
interface Props {
workspaceId: string;
@ -286,6 +287,26 @@ function ActivityRow({
);
}
const A2A_ERROR_PREFIX = "[A2A_ERROR]";
/** Render an [A2A_ERROR]-prefixed response as a structured error block
* with a stripped detail line + a cause hint. The previous raw render
* ("[A2A_ERROR] " literal in the response area) gave the user no
* signal to act on. */
function A2AErrorPreview({ label, raw }: { label: string; raw: string }) {
const detail = raw.slice(A2A_ERROR_PREFIX.length).trim() || "(no detail provided)";
const hint = inferA2AErrorHint(detail);
return (
<div>
<div className="text-[8px] text-red-400/80 uppercase tracking-wider mb-1">{label} delivery failed</div>
<div className="text-[10px] text-red-300 bg-red-950/30 border border-red-800/40 rounded p-2 space-y-1.5">
<div className="font-mono whitespace-pre-wrap break-words max-h-32 overflow-y-auto">{detail}</div>
<div className="text-[9px] text-red-300/70 leading-relaxed border-t border-red-800/30 pt-1.5">{hint}</div>
</div>
</div>
);
}
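// Example of the split (raw string is hypothetical; the hint text
// depends entirely on inferA2AErrorHint's mapping):
//   raw    = "[A2A_ERROR] connect ECONNREFUSED 172.18.0.5:8000"
//   detail = "connect ECONNREFUSED 172.18.0.5:8000"  // prefix stripped
//   hint   = inferA2AErrorHint(detail)               // cause guidance
// A bare "[A2A_ERROR]" with nothing after it renders
// "(no detail provided)".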
/** Extract human-readable text from A2A request/response JSON */
function MessagePreview({ label, body }: { label: string; body: Record<string, unknown> }) {
// Try to extract text from A2A message parts
@ -295,6 +316,14 @@ function MessagePreview({ label, body }: { label: string; body: Record<string, u
if (body.task && typeof body.task === "string") { text = body.task; }
if (!text && body.result && typeof body.result === "string") { text = body.result; }
if (text) {
// [A2A_ERROR]-prefixed responses get the structured error
// treatment. Bare text fallthrough renders a bland gray block
// — fine for normal replies, terrible for "[A2A_ERROR] " with
// no further context. Detect at the top of the rendering path
// so it short-circuits before the generic preview kicks in.
if (text.trimStart().startsWith(A2A_ERROR_PREFIX)) {
return <A2AErrorPreview label={label} raw={text.trimStart()} />;
}
return (
<div>
<div className="text-[8px] text-zinc-500 uppercase tracking-wider mb-1">{label}</div>

View File

@ -7,9 +7,12 @@ import { api } from "@/lib/api";
import { useCanvasStore, type WorkspaceNodeData } from "@/store/canvas";
import { WS_URL } from "@/store/socket";
import { closeWebSocketGracefully } from "@/lib/ws-close";
import { type ChatMessage, createMessage, appendMessageDeduped } from "./chat/types";
import { extractResponseText, extractRequestText } from "./chat/message-parser";
import { type ChatMessage, type ChatAttachment, createMessage, appendMessageDeduped } from "./chat/types";
import { uploadChatFiles, downloadChatFile } from "./chat/uploads";
import { AttachmentChip, PendingAttachmentPill } from "./chat/AttachmentViews";
import { extractResponseText, extractRequestText, extractFilesFromTask } from "./chat/message-parser";
import { AgentCommsPanel } from "./chat/AgentCommsPanel";
import { appendActivityLine } from "./chat/activityLog";
import { runtimeDisplayName } from "@/lib/runtime-names";
import { ConfirmDialog } from "@/components/ConfirmDialog";
@ -21,10 +24,18 @@ interface Props {
type ChatSubTab = "my-chat" | "agent-comms";
// A2A response shape (subset). The full schema is in @a2a-js/sdk but we only
// need parts/artifacts text extraction for the synchronous fallback path.
// need parts/artifacts text + file extraction for the synchronous fallback.
interface A2AFileRef {
name?: string;
mimeType?: string;
uri?: string;
bytes?: string;
size?: number;
}
interface A2APart {
kind: string;
text: string;
text?: string;
file?: A2AFileRef;
}
interface A2AResponse {
result?: {
@ -33,25 +44,81 @@ interface A2AResponse {
};
}
/** Detect activity-log rows that the workspace's own runtime fired
* against itself but were misclassified as canvas-source. The proper
* fix is the X-Workspace-ID header from `self_source_headers()` in
* workspace/platform_auth.py, which makes the platform record
* source_id = workspace_id. But three failure modes still leak a
* self-message into "My Chat":
*
* 1. Historical rows already in the DB with source_id=NULL.
* 2. Workspace containers running pre-fix heartbeat.py / main.py
* (the fix only takes effect after an image rebuild + redeploy).
* 3. Future internal triggers added without the helper.
*
* This client-side filter recognises the heartbeat trigger by its
* exact prefix the heartbeat assembles
*
* "Delegation results are ready. Review them and take appropriate
* action:\n" + summary_lines + report_instruction
*
* in workspace/heartbeat.py. The prefix is template-fixed so a
* string match is reliable. If the heartbeat copy ever changes,
* update this constant in the same commit.
*
* This is a backstop, not the primary defence; the X-Workspace-ID
* header is. Filtering content is fragile to copy edits, so keep
* the list narrow. */
const INTERNAL_SELF_MESSAGE_PREFIXES = [
"Delegation results are ready. Review them and take appropriate action",
];
function isInternalSelfMessage(text: string): boolean {
return INTERNAL_SELF_MESSAGE_PREFIXES.some((p) => text.startsWith(p));
}
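// e.g. a heartbeat-assembled trigger (summary line is illustrative):
//   "Delegation results are ready. Review them and take appropriate
//    action:\n- wave-1 research complete"
// startsWith-matches the prefix and is filtered out of "My Chat"; an
// ordinary user message ("Review the delegation results") does not.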
// extractReplyText pulls the agent's text reply out of an A2A response.
// Mirrors the Go-side extractReplyText in workspace-server/internal/channels/manager.go.
// Concatenates ALL text parts (joined with "\n") rather than returning
// just the first. Claude Code and other runtimes commonly emit multi-
// part text replies for long content (markdown tables, code blocks),
// and the prior "first part wins" implementation silently truncated
// the rest — observed on a 15k-char Wave 1 brief that rendered only
// the table header. Mirrors extractTextsFromParts in message-parser.ts.
//
// Server-side counterpart in workspace-server/internal/channels/
// manager.go has the same single-part bug; fix that too if/when a
// channel-delivered reply (Slack, Lark, etc.) gets truncated.
function extractReplyText(resp: A2AResponse): string {
const collect = (parts: A2APart[] | undefined): string => {
if (!parts) return "";
return parts
.filter((p) => p.kind === "text")
.map((p) => p.text ?? "")
.filter(Boolean)
.join("\n");
};
const result = resp?.result;
if (result?.parts) {
for (const p of result.parts) {
if (p.kind === "text") return p.text;
}
}
const collected: string[] = [];
const fromParts = collect(result?.parts);
if (fromParts) collected.push(fromParts);
// Walk artifacts even if parts had text — some producers (Hermes
// tool calls) emit a summary in parts AND details in artifacts.
// Returning early on parts dropped the artifact body silently.
if (result?.artifacts) {
for (const a of result.artifacts) {
for (const p of a.parts || []) {
if (p.kind === "text") return p.text;
}
const t = collect(a.parts);
if (t) collected.push(t);
}
}
return "";
return collected.join("\n");
}
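// Shape sketch of the concatenation fix (payload is illustrative): a
// reply carrying two text parts plus an artifact body now yields all
// three segments, where the old first-part-wins version returned only
// "Header".
//
//   extractReplyText({
//     result: {
//       parts: [
//         { kind: "text", text: "Header" },
//         { kind: "text", text: "| col | col |" },
//       ],
//       artifacts: [{ parts: [{ kind: "text", text: "Details" }] }],
//     },
//   }) === "Header\n| col | col |\nDetails"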
// Agent-returned files live on the same response shape as text —
// delegated to extractFilesFromTask in message-parser.ts, which also
// walks status.message.parts (which ChatTab's legacy text extractor
// doesn't). Single source of truth for file-part parsing across
// live chat, activity log replay, and any future consumers.
/**
* Load chat history from the activity_logs database via the platform API.
* Uses source=canvas to only get user-initiated messages (not agent-to-agent).
@ -71,16 +138,23 @@ async function loadMessagesFromDB(workspaceId: string): Promise<{ messages: Chat
for (const a of [...activities].reverse()) {
// Extract user message from request_body
const userText = extractRequestText(a.request_body);
if (userText) {
if (userText && !isInternalSelfMessage(userText)) {
messages.push(createMessage("user", userText));
}
// Extract agent response
// Extract agent response — text AND any file attachments so a
// chat reload surfaces historical download chips, not just plain
// text. `result` is nested on successful A2A responses; some
// older rows stored the raw `result` payload at the top level,
// so fall back to the body itself when `.result` is absent.
if (a.response_body) {
const text = extractResponseText(a.response_body);
if (text) {
const attachments = extractFilesFromTask(
(a.response_body.result ?? a.response_body) as Record<string, unknown>,
);
if (text || attachments.length > 0) {
const role = a.status === "error" || text.toLowerCase().startsWith("agent error") ? "system" : "agent";
messages.push({ ...createMessage(role, text), timestamp: a.created_at });
messages.push({ ...createMessage(role, text, attachments), timestamp: a.created_at });
}
}
}
@ -178,7 +252,16 @@ export function ChatTab({ workspaceId, data }: Props) {
function MyChatPanel({ workspaceId, data }: Props) {
const [messages, setMessages] = useState<ChatMessage[]>([]);
const [input, setInput] = useState("");
const [sending, setSending] = useState(!!data.currentTask);
// `sending` is strictly the "this tab kicked off a send and hasn't
// seen the reply yet" signal. Previously this was initialized from
// data.currentTask to pick up in-flight agent work on mount, but
// that conflated agent-busy (workspace heartbeat) with user-
// in-flight (local send): when the WS dropped a TASK_COMPLETE event,
// currentTask lingered, the component re-mounted with sending=true,
// and the Send button stayed disabled forever even though nothing
// local was in flight. For the "agent is busy, show spinner" UX,
// use data.currentTask directly in the render path.
const [sending, setSending] = useState(false);
const [thinkingElapsed, setThinkingElapsed] = useState(0);
const [activityLog, setActivityLog] = useState<string[]>([]);
const [loading, setLoading] = useState(true);
@ -189,6 +272,17 @@ function MyChatPanel({ workspaceId, data }: Props) {
const [error, setError] = useState<string | null>(null);
const [confirmRestart, setConfirmRestart] = useState(false);
const bottomRef = useRef<HTMLDivElement>(null);
// Files the user has picked but not yet sent. Cleared on send
// (upload success) or by the × on each pill.
const [pendingFiles, setPendingFiles] = useState<File[]>([]);
const [uploading, setUploading] = useState(false);
const fileInputRef = useRef<HTMLInputElement>(null);
// Guard against a double-click during the upload phase: React
// state updates from the click that started the upload haven't
// flushed yet, so the disabled-button logic sees `uploading=false`
// from the closure and lets a second `sendMessage` enter. A ref
// observes the latest value synchronously.
const sendInFlightRef = useRef(false);
// Load chat history from database on mount
useEffect(() => {
@ -231,8 +325,10 @@ function MyChatPanel({ workspaceId, data }: Props) {
// Dedupe in case the agent proactively pushed the same text the
// HTTP /a2a response already delivered (observed with the Hermes
// runtime, which emits both a reply body and a send_message_to_user
// push for the same content).
setMessages((prev) => appendMessageDeduped(prev, createMessage("agent", m.content)));
// push for the same content). Attachments ride along with the
// message so files returned by the A2A_RESPONSE WS path render
// their download chips.
setMessages((prev) => appendMessageDeduped(prev, createMessage("agent", m.content, m.attachments)));
}
if (sendingFromAPIRef.current && msgs.length > 0) {
setSending(false);
@ -277,12 +373,21 @@ function MyChatPanel({ workspaceId, data }: Props) {
try {
const msg = JSON.parse(event.data);
if (msg.event === "ACTIVITY_LOGGED") {
// Filter to events for THIS workspace. The platform's
// BroadcastOnly fires to every connected client, and
// without this guard a sibling workspace's a2a_send would
// surface as "→ Delegating to X..." inside the wrong
// chat panel. (workspace_id on the WS envelope is the
// workspace whose activity_log row we just wrote.)
if (msg.workspace_id !== workspaceId) return;
const p = msg.payload || {};
const type = p.activity_type as string;
const method = (p.method as string) || "";
const status = (p.status as string) || "";
const targetId = (p.target_id as string) || "";
const durationMs = p.duration_ms as number | undefined;
const summary = (p.summary as string) || "";
let line = "";
if (type === "a2a_receive" && method === "message/send") {
@ -313,17 +418,23 @@ function MyChatPanel({ workspaceId, data }: Props) {
const targetName = resolveWorkspaceName(targetId);
line = `→ Delegating to ${targetName}...`;
} else if (type === "task_update") {
const summary = (p.summary as string) || "";
if (summary) line = `${summary}`;
} else if (type === "agent_log") {
// Per-tool-use telemetry from claude_sdk_executor's
// _report_tool_use. The summary already carries an icon
// + human-readable args (📄 Read /path, ⚡ Bash: …)
// so we render it verbatim. No icon prefix here — the
// emoji at the start of summary is the visual marker.
if (summary) line = summary;
}
if (line) {
setActivityLog((prev) => [...prev.slice(-8), line]);
setActivityLog((prev) => appendActivityLine(prev, line));
}
} else if (msg.event === "TASK_UPDATED" && msg.workspace_id === workspaceId) {
const task = (msg.payload?.current_task as string) || "";
if (task) {
setActivityLog((prev) => [...prev.slice(-8), `${task}`]);
setActivityLog((prev) => appendActivityLine(prev, `${task}`));
}
}
// A2A_RESPONSE is already consumed by the store and its text is
@ -339,10 +450,35 @@ function MyChatPanel({ workspaceId, data }: Props) {
const sendMessage = async () => {
const text = input.trim();
if (!text || !agentReachable || sending) return;
const filesToSend = pendingFiles;
// Allow sending if EITHER text OR attachments are present — a user
// can drop a file with no text and the agent still receives it.
if ((!text && filesToSend.length === 0) || !agentReachable || sending || uploading) return;
// Synchronous re-entry guard — see sendInFlightRef comment.
if (sendInFlightRef.current) return;
sendInFlightRef.current = true;
// Upload attachments first so we can include URIs in the A2A
// message parts. Sequential-before-send: a message with references
// to files not yet staged would fail agent-side; staging happens
// synchronously via /chat/uploads before message/send dispatch.
let uploaded: ChatAttachment[] = [];
if (filesToSend.length > 0) {
setUploading(true);
try {
uploaded = await uploadChatFiles(workspaceId, filesToSend);
} catch (e) {
setUploading(false);
sendInFlightRef.current = false;
setError(e instanceof Error ? `Upload failed: ${e.message}` : "Upload failed");
return;
}
setUploading(false);
}
setInput("");
setMessages((prev) => [...prev, createMessage("user", text)]);
setPendingFiles([]);
setMessages((prev) => [...prev, createMessage("user", text, uploaded)]);
setSending(true);
sendingFromAPIRef.current = true;
setError(null);
@ -356,40 +492,228 @@ function MyChatPanel({ workspaceId, data }: Props) {
parts: [{ kind: "text", text: m.content }],
}));
// A2A parts: text part (if any) + file parts (per attachment). The
// agent sees both in a single turn, matching the A2A spec shape.
const parts: A2APart[] = [];
if (text) parts.push({ kind: "text", text });
for (const att of uploaded) {
parts.push({
kind: "file",
file: {
name: att.name,
mimeType: att.mimeType,
uri: att.uri,
size: att.size,
},
});
}
// A2A calls can legitimately take minutes — LLM latency +
// multi-turn tool use is common on slower providers (Hermes+minimax,
// Claude Code invoking bash/file tools, etc.). The 15s default
// would silently abort the fetch here, leaving the server to
// complete the reply and the user staring at
// "agent may be unreachable". Match the upload timeout (60s × 2)
// for the happy-path ceiling; anything longer is genuinely stuck.
api.post<A2AResponse>(`/workspaces/${workspaceId}/a2a`, {
method: "message/send",
params: {
message: {
role: "user",
messageId: crypto.randomUUID(),
parts: [{ kind: "text", text }],
parts,
},
metadata: { history },
},
})
}, { timeoutMs: 120_000 })
.then((resp) => {
// Skip if the WS A2A_RESPONSE event already handled this response.
// Both paths (WS + HTTP) check sendingFromAPIRef — whichever clears
// it first wins, the other becomes a no-op (no duplicate messages).
if (!sendingFromAPIRef.current) return;
const replyText = extractReplyText(resp);
if (replyText) {
setMessages((prev) => appendMessageDeduped(prev, createMessage("agent", replyText)));
const replyFiles = extractFilesFromTask((resp?.result ?? {}) as Record<string, unknown>);
if (replyText || replyFiles.length > 0) {
setMessages((prev) =>
appendMessageDeduped(prev, createMessage("agent", replyText, replyFiles)),
);
}
setSending(false);
sendingFromAPIRef.current = false;
sendInFlightRef.current = false;
})
.catch(() => {
// Same dedup guard as .then(): if a WS path (pendingAgentMsgs
// or ACTIVITY_LOGGED a2a_receive ok) already delivered the
// reply, sendingFromAPIRef is already false and there's
// nothing to roll back. Surfacing "Failed to send" here would
// contradict the agent reply the user is currently reading —
// exactly the false-positive observed when the HTTP request
// hung up (proxy idle / 502) after WS already won.
if (!sendingFromAPIRef.current) {
sendInFlightRef.current = false;
return;
}
setSending(false);
sendingFromAPIRef.current = false;
sendInFlightRef.current = false;
setError("Failed to send message — agent may be unreachable");
});
};
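// Race matrix for the dual WS/HTTP delivery above (sendingFromAPIRef
// is the arbiter; see the .then/.catch comments):
//   WS wins   → ref cleared by the WS path; HTTP .then/.catch no-op.
//   HTTP wins → ref cleared in .then; a later WS push of the same
//               text is dropped by appendMessageDeduped.
//   HTTP fails after WS won → .catch sees the ref already cleared
//               and skips the "Failed to send" banner.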
const onFilesPicked = (fileList: FileList | null) => {
if (!fileList) return;
const picked = Array.from(fileList);
// Deduplicate against current pending set by name+size — user
// picking the same file twice shouldn't append it.
setPendingFiles((prev) => {
const keyed = new Set(prev.map((f) => `${f.name}:${f.size}`));
return [...prev, ...picked.filter((f) => !keyed.has(`${f.name}:${f.size}`))];
});
if (fileInputRef.current) fileInputRef.current.value = "";
};
const removePendingFile = (index: number) =>
setPendingFiles((prev) => prev.filter((_, i) => i !== index));
// Monotonic counter so two paste events within the same wall-clock
// second still produce distinct filenames. Without this, on
// Firefox (where pasted images have an empty `file.name`), two
// pastes ~100ms apart could yield identical synthetic names AND
// identical sizes, collapsing into one attachment via the
// `name:size` dedup in onFilesPicked.
const pasteCounterRef = useRef(0);
/** Paste-from-clipboard image attachment.
*
* Browser clipboard image items arrive as `File`s whose `name` is
* often a generic "image.png" (Chrome) or empty (Firefox/Safari),
* so two consecutive screenshot pastes collide on the name+size
* dedup the file-picker uses. Re-tag each pasted image with a
* per-paste unique name so dedup keeps them apart and the upload
* pipeline (which expects a non-empty filename) is happy.
*
* Falls through to addPastedFiles with a direct File[] (NOT through
* the DataTransfer constructor that throws on Safari < 14.1
* and old Edge, silently aborting the paste).
*
* Only intercepts the paste when the clipboard has at least one
* image; text-only pastes fall through to the textarea's default
* behaviour. */
const mimeToExt = (mime: string): string => {
// Avoid raw `mime.split("/")[1]` — that yields `"svg+xml"`,
// `"jpeg"`, `"webp"` etc. which produce ugly filenames and may
// trip server-side extension allowlists. Map known types
// explicitly; unknown falls back to a safe default.
if (mime === "image/svg+xml") return "svg";
if (mime === "image/jpeg") return "jpg";
if (mime === "image/png") return "png";
if (mime === "image/gif") return "gif";
if (mime === "image/webp") return "webp";
if (mime === "image/heic") return "heic";
return "png";
};
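// e.g. two Firefox screenshot pastes within the same second (empty
// file.name, type image/png) become, via the stamp + counter + item
// index below (timestamps illustrative):
//   pasted-2026-04-26T16-34-17-0-0.png
//   pasted-2026-04-26T16-34-17-1-0.png
// which stay distinct under the name:size dedup even when the byte
// sizes happen to match.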
const onPasteIntoComposer = (e: React.ClipboardEvent<HTMLTextAreaElement>) => {
if (!dropEnabled) return;
const items = e.clipboardData?.items;
if (!items || items.length === 0) return;
const imageFiles: File[] = [];
for (let i = 0; i < items.length; i++) {
const item = items[i];
if (!item.type.startsWith("image/")) continue;
const file = item.getAsFile();
if (!file) continue;
const ext = mimeToExt(file.type);
const stamp = new Date()
.toISOString()
.replace(/[:.]/g, "-")
.slice(0, 19);
const seq = pasteCounterRef.current++;
const fname = `pasted-${stamp}-${seq}-${i}.${ext}`;
imageFiles.push(new File([file], fname, { type: file.type }));
}
if (imageFiles.length === 0) return;
e.preventDefault();
// Reuse the picker's dedup + pending-list semantics via
// addPastedFiles, which takes a File[] directly and so avoids the
// DataTransfer constructor entirely; that constructor is missing on
// Safari < 14.1 / old Edge and would silently throw, leaving the
// paste a no-op.
addPastedFiles(imageFiles);
};
// Variant of onFilesPicked that accepts a File[] directly, sidestepping
// the DataTransfer-FileList round-trip. Same dedup + state shape.
const addPastedFiles = (files: File[]) => {
setPendingFiles((prev) => {
const keyed = new Set(prev.map((f) => `${f.name}:${f.size}`));
return [...prev, ...files.filter((f) => !keyed.has(`${f.name}:${f.size}`))];
});
};
// Drag-and-drop staging. dragDepthRef counts enter vs leave events so
// the overlay doesn't flicker when the cursor crosses nested children
// (textarea, buttons) — dragenter/dragleave fire for every boundary.
const [dragOver, setDragOver] = useState(false);
const dragDepthRef = useRef(0);
const dropEnabled = agentReachable && !sending && !uploading;
const isFileDrag = (e: React.DragEvent) =>
Array.from(e.dataTransfer.types || []).includes("Files");
const onDragEnter = (e: React.DragEvent) => {
if (!dropEnabled || !isFileDrag(e)) return;
e.preventDefault();
dragDepthRef.current += 1;
setDragOver(true);
};
const onDragOver = (e: React.DragEvent) => {
if (!dropEnabled || !isFileDrag(e)) return;
e.preventDefault();
e.dataTransfer.dropEffect = "copy";
};
const onDragLeave = (e: React.DragEvent) => {
if (!dropEnabled || !isFileDrag(e)) return;
dragDepthRef.current = Math.max(0, dragDepthRef.current - 1);
if (dragDepthRef.current === 0) setDragOver(false);
};
const onDrop = (e: React.DragEvent) => {
if (!dropEnabled || !isFileDrag(e)) return;
e.preventDefault();
dragDepthRef.current = 0;
setDragOver(false);
onFilesPicked(e.dataTransfer.files);
};
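// e.g. a cursor path of panel → textarea → button fires a dragenter
// at each boundary before its matching dragleave: depth runs
// 1 → 2 → 1, never 0, so the overlay holds steady and only drops when
// the file leaves the panel entirely (depth 0) or is dropped.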
const downloadAttachment = (att: ChatAttachment) => {
// Errors here are rare but user-visible (401 on a revoked token,
// 404 if the agent deleted the file). Surface via the inline
// error banner — the message list itself stays untouched.
downloadChatFile(workspaceId, att).catch((e) => {
setError(e instanceof Error ? `Download failed: ${e.message}` : "Download failed");
});
};
const isOnline = data.status === "online" || data.status === "degraded";
return (
<div className="flex flex-col h-full">
<div
className="flex flex-col h-full relative"
onDragEnter={onDragEnter}
onDragOver={onDragOver}
onDragLeave={onDragLeave}
onDrop={onDrop}
>
{dragOver && (
<div
className="absolute inset-0 z-20 flex items-center justify-center bg-blue-500/10 border-2 border-dashed border-blue-400 rounded pointer-events-none"
aria-live="polite"
>
<div className="bg-zinc-900/90 border border-blue-400/50 rounded-lg px-4 py-2 text-xs text-blue-200">
Drop to attach
</div>
</div>
)}
{/* Messages */}
<div className="flex-1 overflow-y-auto p-3 space-y-3">
{loading && (
@ -435,9 +759,23 @@ function MyChatPanel({ workspaceId, data }: Props) {
: "bg-zinc-800/80 text-zinc-200 border border-zinc-700/30"
}`}
>
<div className="prose prose-sm prose-invert max-w-none [&>p]:mb-1 [&>p:last-child]:mb-0">
<ReactMarkdown remarkPlugins={[remarkGfm]}>{msg.content}</ReactMarkdown>
</div>
{msg.content && (
<div className="prose prose-sm prose-invert max-w-none [&>p]:mb-1 [&>p:last-child]:mb-0">
<ReactMarkdown remarkPlugins={[remarkGfm]}>{msg.content}</ReactMarkdown>
</div>
)}
{msg.attachments && msg.attachments.length > 0 && (
<div className={`flex flex-wrap gap-1 ${msg.content ? "mt-1.5" : ""}`}>
{msg.attachments.map((att, i) => (
<AttachmentChip
key={`${msg.id}-${i}`}
attachment={att}
onDownload={downloadAttachment}
tone={msg.role === "user" ? "user" : "agent"}
/>
))}
</div>
)}
<div className="text-[9px] text-zinc-500 mt-1">
{new Date(msg.timestamp).toLocaleTimeString()}
</div>
@ -445,8 +783,11 @@ function MyChatPanel({ workspaceId, data }: Props) {
</div>
))}
{/* Thinking indicator */}
{sending && (
{/* Thinking indicator shows when this tab is awaiting a reply
OR when the workspace heartbeat reports an in-flight task
(covers the "agent is already busy when I open the tab" case
without locking the Send button on a stale currentTask). */}
{(sending || !!data.currentTask) && (
<div className="flex justify-start">
<div className="bg-zinc-800/50 border border-zinc-700/30 rounded-lg px-3 py-2 max-w-[85%]">
<div className="flex items-center gap-2 text-xs text-zinc-400">
@ -490,7 +831,37 @@ function MyChatPanel({ workspaceId, data }: Props) {
{/* Input */}
<div className="p-3 border-t border-zinc-800">
<div className="flex gap-2">
{pendingFiles.length > 0 && (
<div className="flex flex-wrap gap-1.5 mb-2">
{pendingFiles.map((f, i) => (
<PendingAttachmentPill
key={`${f.name}-${f.size}-${i}`}
file={f}
onRemove={() => removePendingFile(i)}
/>
))}
</div>
)}
<div className="flex gap-2 items-end">
<input
ref={fileInputRef}
type="file"
multiple
className="hidden"
onChange={(e) => onFilesPicked(e.target.files)}
aria-hidden="true"
/>
<button
onClick={() => fileInputRef.current?.click()}
disabled={!agentReachable || sending || uploading}
aria-label="Attach file"
title="Attach file"
className="p-2 bg-zinc-800 hover:bg-zinc-700 border border-zinc-700 rounded-lg text-zinc-400 hover:text-zinc-200 transition-colors shrink-0 disabled:opacity-40"
>
<svg width="14" height="14" viewBox="0 0 16 16" fill="none" aria-hidden="true">
<path d="M11 6.5 7 10.5a2 2 0 1 0 2.8 2.8l4-4a3.5 3.5 0 0 0-5-5l-4.5 4.5a5 5 0 0 0 7 7l4-4" stroke="currentColor" strokeWidth="1.4" strokeLinecap="round" strokeLinejoin="round" />
</svg>
</button>
<textarea
aria-label="Message to agent"
value={input}
@ -501,17 +872,18 @@ function MyChatPanel({ workspaceId, data }: Props) {
sendMessage();
}
}}
placeholder={agentReachable ? "Send a message... (Shift+Enter for new line)" : `Agent is ${data.status}`}
onPaste={onPasteIntoComposer}
placeholder={agentReachable ? "Send a message... (Shift+Enter for new line, paste images to attach)" : `Agent is ${data.status}`}
disabled={!agentReachable || sending}
rows={1}
className="flex-1 bg-zinc-800 border border-zinc-700 rounded-lg px-3 py-2 text-xs text-zinc-200 placeholder-zinc-500 focus:outline-none focus:border-blue-500 resize-none disabled:opacity-50"
/>
<button
onClick={sendMessage}
disabled={!input.trim() || !agentReachable || sending}
disabled={(!input.trim() && pendingFiles.length === 0) || !agentReachable || sending || uploading}
className="px-4 py-2 bg-blue-600 hover:bg-blue-500 text-xs font-medium rounded-lg text-white disabled:opacity-30 transition-colors shrink-0"
>
Send
{uploading ? "Uploading…" : "Send"}
</button>
</div>
</div>

View File

@ -105,12 +105,17 @@ interface RuntimeOption {
// Fallback used when /templates can't be fetched (offline, older backend).
// Keep in sync with manifest.json workspace_templates as a defensive default.
// Model + env suggestions only flow when the backend is reachable.
//
// Runtimes that manage their own config outside the platform's config.yaml
// template. For these, a missing config.yaml is expected — the user manages
// config via the runtime's own mechanism (e.g. hermes edits
// ~/.hermes/config.yaml on the workspace EC2 via the Terminal tab or its
// own CLI). Showing a "No config.yaml found" error for these is misleading.
const RUNTIMES_WITH_OWN_CONFIG = new Set<string>(["hermes", "external"]);
// template. For these, a missing config.yaml is expected and the form
// genuinely can't edit the runtime's settings (there's no platform file
// to write). Hermes is NOT on this list: it DOES ship a platform
// config.yaml via workspace-configs-templates/hermes that controls model,
// runtime_config, required_env, etc. Editing it through this form is
// exactly the point of the platform adaptor. The deep `~/.hermes/
// config.yaml` on the container is a separate runtime-internal file,
// not this one.
const RUNTIMES_WITH_OWN_CONFIG = new Set<string>(["external"]);
const FALLBACK_RUNTIME_OPTIONS: RuntimeOption[] = [
{ value: "", label: "LangGraph (default)", models: [] },
@ -152,9 +157,11 @@ export function ConfigTab({ workspaceId }: Props) {
// default `LangGraph`. See GH #1894.
let wsMetadataRuntime = "";
let wsMetadataModel = "";
let wsMetadataTier: number | null = null;
try {
const ws = await api.get<{ runtime?: string }>(`/workspaces/${workspaceId}`);
const ws = await api.get<{ runtime?: string; tier?: number }>(`/workspaces/${workspaceId}`);
wsMetadataRuntime = (ws.runtime || "").trim();
if (typeof ws.tier === "number") wsMetadataTier = ws.tier;
} catch { /* fall back to config.yaml */ }
try {
const m = await api.get<{ model?: string }>(`/workspaces/${workspaceId}/model`);
@ -166,11 +173,15 @@ export function ConfigTab({ workspaceId }: Props) {
const parsed = parseYaml(res.content);
setOriginalYaml(res.content);
setRawDraft(res.content);
// Merge: config.yaml wins for fields it declares, but workspace metadata
// wins for runtime + model when config.yaml doesn't set them.
// Merge: workspace-row metadata is authoritative for the DB-backed
// fields (tier, runtime, model). config.yaml often lags — handleSave
// PATCHes tier/runtime directly and a template snapshot in the
// container can differ from the live row. Show the DB value so the
// form doesn't contradict the node badge (issue: badge=T3, form=T2).
const merged = { ...DEFAULT_CONFIG, ...parsed } as ConfigData;
if (!merged.runtime && wsMetadataRuntime) merged.runtime = wsMetadataRuntime;
if (!merged.model && wsMetadataModel) merged.model = wsMetadataModel;
if (wsMetadataRuntime) merged.runtime = wsMetadataRuntime;
if (wsMetadataModel) merged.model = wsMetadataModel;
if (wsMetadataTier !== null) merged.tier = wsMetadataTier;
setConfig(merged);
} catch {
// No platform-managed config.yaml. Some runtimes (hermes, external)
@ -185,6 +196,7 @@ export function ConfigTab({ workspaceId }: Props) {
...DEFAULT_CONFIG,
runtime: wsMetadataRuntime,
model: wsMetadataModel,
...(wsMetadataTier !== null ? { tier: wsMetadataTier } : {}),
} as ConfigData);
} finally {
setLoading(false);

View File

@ -36,7 +36,7 @@ export function DetailsTab({ workspaceId, data }: Props) {
const [restartError, setRestartError] = useState<string | null>(null);
const [consoleOpen, setConsoleOpen] = useState(false);
const updateNodeData = useCanvasStore((s) => s.updateNodeData);
const removeNode = useCanvasStore((s) => s.removeNode);
const removeSubtree = useCanvasStore((s) => s.removeSubtree);
const selectNode = useCanvasStore((s) => s.selectNode);
// Ref for the "Delete Workspace" trigger — Cancel returns focus here
const deleteButtonRef = useRef<HTMLButtonElement>(null);
@ -94,7 +94,11 @@ export function DetailsTab({ workspaceId, data }: Props) {
setDeleteError(null);
try {
await api.del(`/workspaces/${workspaceId}?confirm=true`);
removeNode(workspaceId);
// Mirror the server-side cascade — drop the row + every
// descendant locally so the canvas reflects the deletion
// immediately, even when the WS is dead and the per-descendant
// WORKSPACE_REMOVED events never arrive.
removeSubtree(workspaceId);
selectNode(null);
} catch (e) {
setDeleteError(e instanceof Error ? e.message : "Failed to delete");

View File

@ -6,6 +6,14 @@ import { useCanvasStore, summarizeWorkspaceCapabilities, type WorkspaceNodeData
import { showToast } from "../Toaster";
interface Props {
// The workspace's id is NOT a field on WorkspaceNodeData — that
// interface is the React Flow `node.data` blob, while the id lives
// on `node.id`. Pass it explicitly (matches every other tab in
// SidePanel) so the install/uninstall API calls don't end up
// POSTing to /workspaces/undefined/plugins. The interface extending
// Record<string, unknown> meant TypeScript silently typed
// `data.id` as `unknown` instead of erroring — easy to miss.
workspaceId: string;
data: WorkspaceNodeData;
}
@ -40,7 +48,7 @@ interface SourceSchemesResponse {
// Delay before reloading installed plugins after install/uninstall (workspace restarts)
const PLUGIN_RELOAD_DELAY_MS = 15_000;
export function SkillsTab({ data }: Props) {
export function SkillsTab({ workspaceId, data }: Props) {
const capability = summarizeWorkspaceCapabilities(data);
const skills = useMemo(() => extractSkills(data.agentCard), [data.agentCard]);
const setPanelTab = useCanvasStore((s) => s.setPanelTab);
@ -57,32 +65,115 @@ export function SkillsTab({ data }: Props) {
const reloadTimerRef = useRef<ReturnType<typeof setTimeout>>(undefined);
useEffect(() => {
// Re-init `mountedRef.current = true` on every mount. React 18
// StrictMode (Next.js dev) double-invokes effects: mount →
// cleanup → mount. Without this re-init, the first cleanup sets
// mountedRef.current = false, the re-mount runs the effect body
// again but never restores the flag, so every subsequent
// `if (mountedRef.current) setX(...)` guard skips and the
// component appears wedged: fetches complete, state never
// updates, "Loading…" sits forever. Production doesn't double-
// invoke so the bug only surfaces in dev — but dev is where we
// see it, and the cost of being explicit is one assignment.
mountedRef.current = true;
return () => {
mountedRef.current = false;
clearTimeout(reloadTimerRef.current);
};
}, []);
const workspaceId = data.id;
// Tracks whether loadInstalled has completed at least once (success
// or empty-array success — NOT failure). Without this the auto-
// expand effect below would fire on the initial render where
// `installed.length === 0` simply because the fetch hasn't returned
// yet, and worse, would also fire if the fetch throws (network
// blip, auth failure) — both cases falsely look like "no plugins
// installed". Gating on a separate "loaded" flag avoids the false
// positive.
const [installedLoaded, setInstalledLoaded] = useState(false);
const loadInstalled = useCallback(async () => {
try {
const result = await api.get<PluginInfo[]>(`/workspaces/${workspaceId}/plugins`);
if (mountedRef.current) setInstalled(Array.isArray(result) ? result : []);
if (mountedRef.current) {
setInstalled(Array.isArray(result) ? result : []);
setInstalledLoaded(true);
}
} catch (e) {
console.warn("SkillsTab: installed plugins load failed", e);
}
}, [workspaceId]);
const loadRegistry = useCallback(async () => {
// Track the registry-load lifecycle so the UI can show "Loading…" / error /
// retry instead of an indistinguishable "No plugins in registry"
// banner whether the fetch is in-flight, errored, or genuinely
// returned []. The previous silent console.warn-only path made
// an auth failure or CORS blip look identical to an empty
// registry — exactly the diagnosis dead-end observed when the
// server returned 20 plugins via curl but the canvas showed 0.
const [registryLoading, setRegistryLoading] = useState(false);
const [registryError, setRegistryError] = useState<string | null>(null);
// Synchronous gate against concurrent loadRegistry runs. Refs survive
// Fast Refresh re-renders (ref objects persist across re-runs of
// the function body), so a previously-stranded fetch can pin this
// ref at true and block every subsequent loadRegistry call. The
// `force` parameter on loadRegistry below provides the user-driven
// escape hatch for that wedge.
const registryFetchInFlight = useRef(false);
// Reset the in-flight gate on unmount so a Fast Refresh that
// tears down + recreates the component without a full page reload
// doesn't carry the stuck-true value into the new instance via
// dev-server-preserved module state.
useEffect(() => {
return () => {
registryFetchInFlight.current = false;
};
}, []);
const loadRegistry = useCallback(async (force = false) => {
// Default callers (mount effect, button while not loading) honour
// the gate. Explicit force=true callers (Retry button) bypass it
// — the user is signalling "forget whatever you thought was in
// flight, fetch again now".
if (!force && registryFetchInFlight.current) return;
registryFetchInFlight.current = true;
setRegistryLoading(true);
setRegistryError(null);
try {
const result = await api.get<PluginInfo[]>("/plugins");
// 10s timeout — tighter than the 15s default. Plugin registry
// is local-disk-backed on the platform host (server reads
// pluginsDir entries) so a 10s budget is generous. Without
// an explicit timeout the UI's "Loading registry…" can sit
// for the full 15s + any browser hop time when a Fast
// Refresh strands an in-flight promise.
const result = await api.get<PluginInfo[]>("/plugins", { timeoutMs: 10_000 });
if (mountedRef.current) setRegistry(Array.isArray(result) ? result : []);
} catch (e) {
// Registry is the AVAILABLE PLUGINS list. Silent failure here
// left the user seeing "No plugins in registry" with no clue
// it was a fetch error — log it so devtools shows the cause.
console.warn("SkillsTab: registry load failed", e);
if (mountedRef.current) {
// Detect timeout/abort by DOMException.name first — that's
// the canonical signal across browsers. Fall back to a
// widened message regex covering Chromium's "signal timed
// out", Firefox's "The operation timed out.", Safari's
// "Aborted". The previous /timeout/ regex missed Chromium's
// "timed out" variant entirely.
const name = (e as { name?: string })?.name ?? "";
const msg = e instanceof Error ? e.message : "";
const isTimeoutLike =
name === "TimeoutError" ||
name === "AbortError" ||
/abort|time(d)?\s*out/i.test(msg);
setRegistryError(
isTimeoutLike
? "Registry fetch timed out (10s). The platform server may be slow or unreachable."
: msg || "Failed to load registry",
);
}
} finally {
registryFetchInFlight.current = false;
if (mountedRef.current) setRegistryLoading(false);
}
}, []);
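// Retry wiring sketch (assumed markup; the real button lives in the
// JSX below): the error banner's Retry calls loadRegistry(true) to
// bypass a wedged in-flight gate, while the mount effect and any
// passive refresh keep calling loadRegistry() and honour it.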
@ -102,17 +193,73 @@ export function SkillsTab({ data }: Props) {
loadSourceSchemes();
}, [loadInstalled, loadRegistry, loadSourceSchemes]);
// First-time experience: if the workspace has zero plugins
// installed but the platform's registry has options to choose
// from, expand the registry by default so the user sees what's
// available without an extra click. Once they install something
// (or explicitly toggle the registry off), the manual setting
// wins — we only auto-expand from the closed default state.
const hasAutoExpandedRef = useRef(false);
useEffect(() => {
if (hasAutoExpandedRef.current) return;
if (installedLoaded && installed.length === 0 && registry.length > 0) {
setShowRegistry(true);
hasAutoExpandedRef.current = true;
}
}, [installedLoaded, installed.length, registry.length]);
const installedNames = useMemo(() => new Set(installed.map((p) => p.name)), [installed]);
// Install always goes through the source-based API. For registry
// plugins we build the local:// source on the fly; custom sources
// (github://, clawhub://, …) are typed into the input below.
const installFromSource = async (source: string, labelOverride?: string) => {
//
// Optional `optimistic` parameter mirrors the uninstall flow's local
// state mutation. Without it, the user sees the button revert from
// "Installing..." → "Install" the instant the POST returns, and the
// green "Installed" tag doesn't appear for ~15s while we wait out
// PLUGIN_RELOAD_DELAY_MS for the workspace restart before refetching.
// 15s of staring at the same button feels broken. Pushing the
// registry entry into `installed` immediately makes the UI reflect
// the install instantly; the delayed loadInstalled() reconciles
// anything we got wrong (or any server-side filtering we don't
// know about locally).
const installFromSource = async (
source: string,
labelOverride?: string,
optimistic?: PluginInfo,
) => {
const label = labelOverride ?? source;
setInstalling(label);
try {
await api.post(`/workspaces/${workspaceId}/plugins`, { source });
showToast(`Installed ${label} — restarting workspace`, "success");
if (optimistic && mountedRef.current) {
// Push with `supported_on_runtime` left undefined — the
// server's ListInstalled annotates the real value (true /
// false) at refetch time. Forcing `true` here would hide the
// "inert on this runtime" badge for 15s if the user
// installed a plugin that doesn't actually support the
// workspace's runtime; the badge only renders on `=== false`,
// so undefined keeps it neutral until reconciliation arrives.
setInstalled((prev) =>
prev.some((p) => p.name === optimistic.name)
? prev
: [...prev, { ...optimistic, supported_on_runtime: undefined }],
);
// Note: we intentionally do NOT set `installedLoaded` here.
// That flag means "the initial GET has succeeded at least
// once" and gates the auto-expand-registry effect. A fast
// optimistic install BEFORE the initial fetch returns must
// not flip the gate, or the auto-expand never fires and a
// followup loadInstalled racing with the optimistic write
// could overwrite our entry with [] mid-restart.
}
// Drop any prior reload timer before scheduling a new one —
// back-to-back installs within PLUGIN_RELOAD_DELAY_MS would
// otherwise queue multiple loadInstalled() calls and the
// unmount cleanup only clears the latest handle.
clearTimeout(reloadTimerRef.current);
reloadTimerRef.current = setTimeout(() => loadInstalled(), PLUGIN_RELOAD_DELAY_MS);
} catch (e) {
showToast(e instanceof Error ? e.message : "Install failed", "error");
@ -121,7 +268,10 @@ export function SkillsTab({ data }: Props) {
}
};
const handleInstall = (pluginName: string) => installFromSource(`local://${pluginName}`, pluginName);
const handleInstall = (pluginName: string) => {
const entry = registry.find((p) => p.name === pluginName);
return installFromSource(`local://${pluginName}`, pluginName, entry);
};
const handleInstallCustom = async () => {
const source = customSource.trim();
@ -133,9 +283,12 @@ export function SkillsTab({ data }: Props) {
const handleUninstall = async (pluginName: string) => {
setUninstalling(pluginName);
try {
await api.del(`/workspaces/${data.id}/plugins/${pluginName}`);
await api.del(`/workspaces/${workspaceId}/plugins/${pluginName}`);
showToast(`Removed ${pluginName} — restarting workspace`, "success");
setInstalled((prev) => prev.filter((p) => p.name !== pluginName));
// Drop any prior reload timer (see installFromSource for the
// back-to-back-action leak rationale).
clearTimeout(reloadTimerRef.current);
reloadTimerRef.current = setTimeout(() => loadInstalled(), PLUGIN_RELOAD_DELAY_MS);
} catch (e) {
showToast(e instanceof Error ? e.message : "Uninstall failed", "error");
@ -264,9 +417,53 @@ export function SkillsTab({ data }: Props) {
Local registry plugins below; paste any scheme URL above for GitHub or other sources.
</div>
</div>
<div className="text-[10px] uppercase tracking-[0.2em] text-zinc-600 mb-2">Available plugins</div>
{registry.length === 0 ? (
<div className="text-[10px] text-zinc-600">No plugins in registry</div>
<div className="flex items-center justify-between mb-2">
<div className="text-[10px] uppercase tracking-[0.2em] text-zinc-600">Available plugins</div>
{/* Retry is visible whenever the registry is empty, including
    during the loading state, so a stuck fetch (Fast Refresh
    stranded promise, slow server, browser quirk) has a
    user-driven escape hatch. The button stays enabled even
    while a fetch is in flight; the label flips to
    "Loading… click to retry" so genuine in-flight activity
    stays visible while the escape hatch stays usable. */}
{registry.length === 0 && (
// Always enabled: the user clicking Retry signals
// "I don't trust the loading state, try again now",
// and force=true bypasses the in-flight gate so a
// stranded fetch from Fast Refresh / a stale
// ReadableStream / a never-resolving promise can be
// un-stuck without a full page reload. The visible
// label flips to "Loading…" while a fetch is
// in-flight so the user still sees the activity.
<button
type="button"
onClick={() => loadRegistry(true)}
className="text-[10px] text-violet-300 hover:text-violet-200 underline-offset-2 hover:underline"
>
{registryLoading ? "Loading… click to retry" : "Retry"}
</button>
)}
</div>
{registryLoading && registry.length === 0 ? (
<div className="text-[10px] text-zinc-500">Loading registry</div>
) : registryError ? (
<div className="rounded-lg border border-red-800/40 bg-red-950/20 px-2 py-1.5">
<div className="text-[10px] text-red-300 font-semibold mb-0.5">
Couldn't load the plugin registry
</div>
<div className="text-[10px] text-red-400/80">{registryError}</div>
<div className="mt-1 text-[10px] text-zinc-500">
Check the platform server is reachable at /plugins. The Retry button is in the header above.
</div>
</div>
) : registry.length === 0 ? (
<div className="rounded-lg border border-zinc-800/40 bg-zinc-950/40 px-2 py-1.5">
<div className="text-[10px] text-zinc-400 mb-0.5">Registry returned 0 plugins.</div>
<div className="text-[10px] text-zinc-600">
This usually means the platform's plugins/ directory is empty.
Run scripts/clone-manifest.sh to populate it from the standalone repos.
</div>
</div>
) : (
<div className="space-y-1.5">
{registry.map((p) => {

View File

@ -128,7 +128,13 @@ describe("ConfigTab — hermes workspace", () => {
});
});
it("shows hermes-specific info banner pointing to Terminal tab (#1894)", async () => {
it("does NOT show the hermes-specific info banner (removed in #2061)", async () => {
// Banner-text inversion: the multilevel-layout-UX PR drops "hermes"
// from RUNTIMES_WITH_OWN_CONFIG (now {"external"} only). Hermes now
// shows the normal Config form — the banner "Hermes manages its own
// config" is reserved for the "external" runtime, not hermes itself.
// If this ever flips back, revisit the banner/error UX before
// unpinning this assertion.
wireApi({
workspaceRuntime: "hermes",
configYamlContent: null,
@ -137,9 +143,11 @@ describe("ConfigTab — hermes workspace", () => {
render(<ConfigTab workspaceId="ws-test" />);
await waitFor(() => {
expect(screen.getByText(/Hermes manages its own config/i)).toBeTruthy();
});
// Wait for the render+loads to settle (template list drives the runtime combobox).
await waitFor(() =>
screen.getByRole("combobox", { name: /runtime/i }),
);
expect(screen.queryByText(/Hermes manages its own config/i)).toBeNull();
});
it("DOES show 'No config.yaml found' error for langgraph workspace (default runtime)", async () => {
@ -161,14 +169,28 @@ describe("ConfigTab — hermes workspace", () => {
});
describe("ConfigTab — config.yaml on disk", () => {
it("config.yaml runtime/model wins when present, workspace metadata is fallback", async () => {
// If the workspace DB has runtime=langgraph but config.yaml declares
// runtime: crewai, the form should show crewai (config.yaml wins).
// Prevents silent runtime drift across reads.
it("workspace metadata (DB) wins over config.yaml when both are present (#2061)", async () => {
// Priority inversion in #2061: previously config.yaml overrode DB, so
// the tier-on-node badge and runtime-in-form could drift when the
// user edited config.yaml on disk. The multilevel-layout-UX PR made
// the DB authoritative — config.yaml is read for non-DB keys (tools,
// MCP server list, etc.) but runtime/model/tier come from the
// workspace row so the node badge matches the form.
//
// Scenario: DB says "hermes", config.yaml says "crewai". The form
// must show hermes (DB wins).
//
// We pick hermes (not langgraph) on the DB side because "langgraph"
// is collapsed to the empty-string "LangGraph (default)" option in
// the runtime dropdown — so a "langgraph" DB value would render as
// the empty-valued option and obscure whether the DB-wins logic
// actually fired. Hermes has its own non-empty option value and
// gives the assertion a clean signal.
wireApi({
workspaceRuntime: "langgraph", // DB
workspaceRuntime: "hermes", // DB — authoritative
configYamlContent: 'runtime: crewai\nmodel: "claude-opus"\n',
templates: [
{ id: "t-hermes", name: "Hermes", runtime: "hermes", models: [] },
{ id: "t-crewai", name: "CrewAI", runtime: "crewai", models: [] },
],
});
@ -176,6 +198,6 @@ describe("ConfigTab — config.yaml on disk", () => {
render(<ConfigTab workspaceId="ws-test" />);
const select = await waitFor(() => screen.getByRole("combobox", { name: /runtime/i }));
expect((select as HTMLSelectElement).value).toBe("crewai");
expect((select as HTMLSelectElement).value).toBe("hermes");
});
});

View File

@ -1,13 +1,17 @@
"use client";
import { useState, useEffect, useRef } from "react";
import ReactMarkdown from "react-markdown";
import remarkGfm from "remark-gfm";
import { api } from "@/lib/api";
import { useCanvasStore, type WorkspaceNodeData } from "@/store/canvas";
import { WS_URL } from "@/store/socket";
import { closeWebSocketGracefully } from "@/lib/ws-close";
import { showToast } from "../../Toaster";
import { extractResponseText, extractRequestText } from "./message-parser";
import { inferA2AErrorHint } from "./a2aErrorHint";
interface ActivityEntry {
export interface ActivityEntry {
id: string;
activity_type: string;
source_id: string | null;
@ -22,11 +26,29 @@ interface ActivityEntry {
interface CommMessage {
id: string;
direction: "in" | "out";
/** UI-facing flow from THIS workspace's point of view:
*
* "out" this workspace either initiated the call (a2a_send)
* OR self-logged the reply from a peer it had called
* (a2a_receive with source_id == workspaceId).
* "in" a peer initiated the call to us (a2a_receive with
* source_id != workspaceId).
*
* Distinct from activity_type because the agent runtime self-
* logs its outbound calls' replies as `a2a_receive` rows; without
* this normalisation the UI labels would render those as
* incoming ("← From X") and right-justify them on the wrong
* side, even though from the user's perspective the call WAS
* outgoing. See toCommMessage for the resolution rules. */
flow: "in" | "out";
peerName: string;
peerId: string;
text: string;
responseText: string | null;
/** "ok" | "error" surfaces failed deliveries with their own
* visual treatment + recovery actions instead of an opaque
* "[A2A_ERROR]" body the user can't act on. */
status: string;
timestamp: string;
}
@ -36,9 +58,59 @@ function resolveName(id: string): string {
return (node?.data as WorkspaceNodeData)?.name || id.slice(0, 8);
}
function toCommMessage(entry: ActivityEntry, workspaceId: string): CommMessage | null {
const isOutgoing = entry.activity_type === "a2a_send";
const peerId = isOutgoing ? (entry.target_id || "") : (entry.source_id || "");
export function toCommMessage(entry: ActivityEntry, workspaceId: string): CommMessage | null {
// delegation activity rows are written by the platform's /delegate
// handler. They're always outbound from this workspace's POV (the
// platform proxies the A2A on our behalf). Two methods:
// - "delegate" — the initial outbound; status pending/dispatched
// - "delegate_result" — the eventual reply; status completed/queued/failed
// We surface them in Agent Comms because they ARE agent-to-agent
// calls; without this branch they'd be dropped by the activity_type
// filter and the user would see "No agent-to-agent communications yet"
// even when the director made delegations.
if (entry.activity_type === "delegation") {
const peerId = entry.target_id || "";
if (!peerId) return null;
return {
id: entry.id,
flow: "out",
peerName: resolveName(peerId),
peerId,
// Prefer summary (set by the platform with a human-readable
// string like "Delegating to X" or "Delegation queued — target
// at capacity"). Fall back to request body for older rows that
// pre-date the summary column being populated.
text: entry.summary || extractRequestText(entry.request_body) || "(delegation)",
responseText: entry.response_body ? extractResponseText(entry.response_body) : null,
status: entry.status || "ok",
timestamp: entry.created_at,
};
}
// a2a_receive activity rows come in two shapes:
//
// 1. Real incoming call (a peer called us): source_id = the peer,
// target_id = us. peerId is source_id, flow is "in".
//
// 2. Self-logged response to an outbound call (the workspace's own
// runtime calls report_activity("a2a_receive", ...) after
// delegating; see workspace/a2a_tools.py:181). source_id =
// our own workspace_id, target_id = the peer that replied.
// peerId must come from target_id (otherwise the peer-name
// resolves to "us" and Restart would target THIS workspace),
// and flow is "out" — from the user's perspective this row
// belongs to the outbound thread, not an incoming one.
//
// a2a_send rows are always outbound from us: source_id = us,
// target_id = the peer.
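//
// Compact mapping of the shapes above:
//   a2a_send     source=us    target=peer  →  flow=out, peer=target
//   a2a_receive  source=peer  target=us    →  flow=in,  peer=source
//   a2a_receive  source=us    target=peer  →  flow=out, peer=target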
const isSendActivity = entry.activity_type === "a2a_send";
const isSelfLoggedReceive =
entry.activity_type === "a2a_receive" && entry.source_id === workspaceId;
const flow: "in" | "out" = isSendActivity || isSelfLoggedReceive ? "out" : "in";
const peerId =
isSendActivity || isSelfLoggedReceive
? entry.target_id || ""
: entry.source_id || "";
if (!peerId) return null;
const text = extractRequestText(entry.request_body) || entry.summary || "";
@ -46,15 +118,35 @@ function toCommMessage(entry: ActivityEntry, workspaceId: string): CommMessage |
return {
id: entry.id,
direction: isOutgoing ? "out" : "in",
flow,
peerName: resolveName(peerId),
peerId,
text,
responseText,
status: entry.status || "ok",
timestamp: entry.created_at,
};
}
/** Strip the [A2A_ERROR] sentinel prefix the workspace runtime adds
* to failed delegation responses, so the UI can render the underlying
* message (or fall back to a generic explanation when the inner text
 * is empty, which is currently common because httpx exceptions often
* stringify as ""). */
const A2A_ERROR_PREFIX = "[A2A_ERROR]";
function unwrapErrorText(raw: string | null): string {
if (!raw) return "";
const trimmed = raw.trim();
if (trimmed.startsWith(A2A_ERROR_PREFIX)) {
return trimmed.slice(A2A_ERROR_PREFIX.length).trim();
}
return trimmed;
}
// inferA2AErrorHint moved to ./a2aErrorHint so the Activity tab and
// this panel render identical hints for the same symptom.
export function AgentCommsPanel({ workspaceId }: { workspaceId: string }) {
const [messages, setMessages] = useState<CommMessage[]>([]);
const [loading, setLoading] = useState(true);
@ -67,22 +159,49 @@ export function AgentCommsPanel({ workspaceId }: { workspaceId: string }) {
setLoading(true);
api.get<ActivityEntry[]>(`/workspaces/${workspaceId}/activity?source=agent&limit=50`)
.then((entries) => {
const filtered = entries
.filter((e) => e.activity_type === "a2a_send" || e.activity_type === "a2a_receive")
const filtered = (entries ?? [])
.filter((e) =>
e.activity_type === "a2a_send" ||
e.activity_type === "a2a_receive" ||
e.activity_type === "delegation",
)
.reverse();
const msgs: CommMessage[] = [];
for (const e of filtered) {
const m = toCommMessage(e, workspaceId);
if (m) {
const key = `${m.timestamp}:${m.direction}:${m.peerId}`;
msgs.push(m);
seenKeys.current.add(key);
// Per-row try/catch so a single malformed activity row
// (e.g. unexpected request_body shape) doesn't kill the
// batch — the previous code threw out of the for-loop and
// setMessages([3 items]) never ran, leaving the panel
// stuck on the empty state with no diagnostic in the
// console because the outer .catch silently swallowed
// everything.
try {
const m = toCommMessage(e, workspaceId);
if (m) {
const key = `${m.timestamp}:${m.flow}:${m.peerId}`;
msgs.push(m);
seenKeys.current.add(key);
}
} catch (rowErr) {
console.warn(
"AgentCommsPanel: failed to map activity row",
{ id: e.id, type: e.activity_type, err: rowErr },
);
}
}
setMessages(msgs);
setLoading(false);
})
.catch(() => setLoading(false));
.catch((err) => {
// Surface the failure in the console so a stuck panel is
// diagnosable without a debugger. Previous bare
// `.catch(() => setLoading(false))` swallowed every load
// failure (network errors, JSON parse errors, throws inside
// the .then body) — the panel just sat on the empty state
// with zero signal.
console.warn("AgentCommsPanel: load activity failed", err);
setLoading(false);
});
}, [workspaceId]);
// Live updates via WebSocket
@ -94,14 +213,34 @@ export function AgentCommsPanel({ workspaceId }: { workspaceId: string }) {
ws.onmessage = (event) => {
try {
const msg = JSON.parse(event.data);
if (msg.event === "ACTIVITY_LOGGED" && msg.workspace_id === workspaceId) {
if (msg.workspace_id !== workspaceId) return;
// Two live-update paths:
// 1. ACTIVITY_LOGGED — fired by the LogActivity helper for
// a2a_send / a2a_receive (and delegation rows IF the
// delegation handler is ever refactored to use it). Today
// the platform's delegation handlers do direct INSERT
// INTO activity_logs WITHOUT firing ACTIVITY_LOGGED, so
// the delegation branch here is defensive — it'd light
// up automatically the day delegation handlers are
// refactored to call LogActivity.
// 2. DELEGATION_SENT / DELEGATION_STATUS / DELEGATION_COMPLETE
// / DELEGATION_FAILED — fired by the platform's delegation
// handlers directly. These are the ONLY live signals the
// panel currently has for delegation rows; the GET on
// mount (which reads from activity_logs) shows them, but
// without this branch, nothing surfaced live until the
// next remount. Synthesise an ActivityEntry from the
// payload so toCommMessage's existing delegation branch
// handles them identically to the GET path.
let entry: ActivityEntry | null = null;
if (msg.event === "ACTIVITY_LOGGED") {
const p = msg.payload || {};
const type = p.activity_type as string;
const sourceId = p.source_id as string | null;
if (!sourceId) return; // canvas-initiated, not agent comms
if (type !== "a2a_send" && type !== "a2a_receive") return;
const entry: ActivityEntry = {
if (type !== "a2a_send" && type !== "a2a_receive" && type !== "delegation") return;
entry = {
id: p.id as string || crypto.randomUUID(),
activity_type: type,
source_id: sourceId,
@ -113,13 +252,56 @@ export function AgentCommsPanel({ workspaceId }: { workspaceId: string }) {
status: p.status as string || "ok",
created_at: msg.timestamp || new Date().toISOString(),
};
const m = toCommMessage(entry, workspaceId);
if (m) {
const key = `${m.timestamp}:${m.direction}:${m.peerId}`;
if (seenKeys.current.has(key)) return;
seenKeys.current.add(key);
setMessages((prev) => [...prev, m]);
} else if (
msg.event === "DELEGATION_SENT" ||
msg.event === "DELEGATION_STATUS" ||
msg.event === "DELEGATION_COMPLETE" ||
msg.event === "DELEGATION_FAILED"
) {
const p = msg.payload || {};
const targetId = (p.target_id as string) || "";
if (!targetId) return;
// Map event → status. DELEGATION_STATUS payload includes its
// own `status` field (queued / dispatched). Other events have
// implicit status: SENT → pending, COMPLETE → completed,
// FAILED → failed.
let status: string;
let summary: string;
if (msg.event === "DELEGATION_STATUS") {
status = (p.status as string) || "queued";
summary = `Delegation ${status}`;
} else if (msg.event === "DELEGATION_COMPLETE") {
status = "completed";
summary = `Delegation completed (${(p.response_preview as string)?.slice(0, 60) || ""})`;
} else if (msg.event === "DELEGATION_FAILED") {
status = "failed";
summary = `Delegation failed: ${(p.error as string) || "unknown"}`;
} else {
status = "pending";
summary = `Delegating to ${(p.target_id as string)?.slice(0, 8) || "peer"}`;
}
entry = {
id: (p.delegation_id as string) || crypto.randomUUID(),
activity_type: "delegation",
source_id: workspaceId,
target_id: targetId,
method: msg.event === "DELEGATION_SENT" ? "delegate" : "delegate_result",
summary,
request_body: null,
response_body: null,
status,
created_at: msg.timestamp || new Date().toISOString(),
};
} else {
return;
}
const m = toCommMessage(entry, workspaceId);
if (m) {
const key = `${m.timestamp}:${m.flow}:${m.peerId}`;
if (seenKeys.current.has(key)) return;
seenKeys.current.add(key);
setMessages((prev) => [...prev, m]);
}
} catch { /* ignore */ }
};
@ -148,31 +330,177 @@ export function AgentCommsPanel({ workspaceId }: { workspaceId: string }) {
return (
<div className="flex-1 overflow-y-auto p-3 space-y-2">
{messages.map((msg) => (
<div key={msg.id} className={`flex ${msg.direction === "out" ? "justify-end" : "justify-start"}`}>
<div
className={`max-w-[85%] rounded-lg px-3 py-2 text-xs ${
msg.direction === "out"
? "bg-cyan-900/30 text-cyan-100 border border-cyan-700/20"
: "bg-zinc-800/80 text-zinc-200 border border-zinc-700/30"
}`}
>
<div className="text-[9px] text-zinc-500 mb-1">
{msg.direction === "out" ? `→ To ${msg.peerName}` : `← From ${msg.peerName}`}
</div>
<div className="text-zinc-300">{msg.text || "(no message text)"}</div>
{msg.responseText && (
<div className="mt-1.5 pt-1.5 border-t border-zinc-700/30 text-zinc-400">
{msg.responseText}
</div>
)}
<div className="text-[9px] text-zinc-500 mt-1">
{new Date(msg.timestamp).toLocaleTimeString()}
</div>
</div>
</div>
))}
{messages.map((msg) =>
msg.status === "error" ? (
<ErrorMessage key={msg.id} msg={msg} />
) : (
<NormalMessage key={msg.id} msg={msg} />
),
)}
<div ref={bottomRef} />
</div>
);
}
function NormalMessage({ msg }: { msg: CommMessage }) {
return (
<div className={`flex ${msg.flow === "out" ? "justify-end" : "justify-start"}`}>
<div
className={`max-w-[85%] rounded-lg px-3 py-2 text-xs ${
msg.flow === "out"
? "bg-cyan-900/30 text-cyan-100 border border-cyan-700/20"
: "bg-zinc-800/80 text-zinc-200 border border-zinc-700/30"
}`}
>
<div className="text-[9px] text-zinc-500 mb-1">
{msg.flow === "out" ? `→ To ${msg.peerName}` : `← From ${msg.peerName}`}
</div>
{msg.text ? (
<MarkdownBody className="text-zinc-300">{msg.text}</MarkdownBody>
) : (
<div className="text-zinc-300">(no message text)</div>
)}
{msg.responseText && (
<MarkdownBody className="mt-1.5 pt-1.5 border-t border-zinc-700/30 text-zinc-400">
{msg.responseText}
</MarkdownBody>
)}
<div className="text-[9px] text-zinc-500 mt-1">
{new Date(msg.timestamp).toLocaleTimeString()}
</div>
</div>
</div>
);
}
/** Failure-state row. Replaces the unactionable "X failed [A2A_ERROR]"
* bubble with: a clear banner naming the peer, the underlying
* error text (if any), an inferred cause hint, and recovery
 * actions: Restart workspace and Open workspace.
 *
 * Recovery actions show for BOTH directions because both target the
* same peer (toCommMessage now resolves peerId to the peer in
* either case): an outbound delivery failure ("we called X and it
* errored"), an inbound runtime failure ("X called us and our
* reply errored" — rare), or the agent-self-logged "I called X and
* got an error back" pattern that is the most common shape. The
* user always wants to restart or inspect the failing peer. */
function ErrorMessage({ msg }: { msg: CommMessage }) {
const selectNode = useCanvasStore((s) => s.selectNode);
const [restarting, setRestarting] = useState(false);
const errorText = unwrapErrorText(msg.responseText);
const hint = inferA2AErrorHint(errorText);
// Guard against acting on a peer whose workspace has been deleted
// since this row was logged. Without the guard, restart 404s
// surface as a generic toast and Open silently sets a dangling
// selection that renders nothing in the side panel.
const peerExists = (): boolean => {
return useCanvasStore.getState().nodes.some((n) => n.id === msg.peerId);
};
const handleRestart = async () => {
if (restarting) return;
if (!peerExists()) {
showToast(`${msg.peerName} no longer exists`, "error");
return;
}
setRestarting(true);
try {
await api.post(`/workspaces/${msg.peerId}/restart`, {});
showToast(`Restarting ${msg.peerName}`, "success");
} catch (e) {
showToast(
`Restart failed: ${e instanceof Error ? e.message : "unknown error"}`,
"error",
);
} finally {
setRestarting(false);
}
};
const handleOpen = () => {
if (!peerExists()) {
showToast(`${msg.peerName} no longer exists`, "error");
return;
}
selectNode(msg.peerId);
};
return (
<div className={`flex ${msg.flow === "out" ? "justify-end" : "justify-start"}`}>
<div className="max-w-[85%] rounded-lg border border-red-800/50 bg-red-950/30 px-3 py-2 text-xs">
<div className="flex items-center gap-1.5 text-[10px] text-red-300 font-semibold uppercase tracking-wide mb-1.5">
<span aria-hidden="true">⚠</span>
{msg.flow === "out"
? `Failed to deliver to ${msg.peerName}`
: `${msg.peerName} returned an error`}
</div>
{msg.text && (
<div className="text-[10px] text-zinc-500 mb-1.5">
<span className="uppercase tracking-wide">Task</span>
<MarkdownBody className="text-zinc-400">{msg.text}</MarkdownBody>
</div>
)}
<div className="rounded bg-zinc-950/60 border border-red-900/40 px-2 py-1.5 mb-1.5">
<div className="text-[9px] uppercase tracking-wide text-red-400 mb-0.5">
Underlying error
</div>
<code className="text-[11px] font-mono text-red-200 whitespace-pre-wrap break-words">
{errorText || "(no detail returned)"}
</code>
</div>
<p className="text-[10px] text-zinc-400 leading-snug mb-2">{hint}</p>
{msg.peerId && (
<div className="flex flex-wrap items-center gap-1.5">
<button
type="button"
onClick={handleRestart}
disabled={restarting}
className="px-2 py-0.5 rounded bg-red-900/50 hover:bg-red-800/60 border border-red-700/40 text-[10px] text-red-200 disabled:opacity-50 transition-colors"
>
{restarting ? "Restarting…" : `Restart ${msg.peerName}`}
</button>
<button
type="button"
onClick={handleOpen}
className="px-2 py-0.5 rounded bg-zinc-800 hover:bg-zinc-700 border border-zinc-700/50 text-[10px] text-zinc-300 transition-colors"
>
Open {msg.peerName}
</button>
</div>
)}
<div className="text-[9px] text-zinc-500 mt-1.5">
{new Date(msg.timestamp).toLocaleTimeString()}
</div>
</div>
</div>
);
}
/** Tiny markdown wrapper matching ChatTab's My Chat styling. Same
* remark-gfm pipeline (tables, strikethrough, task lists) plus the
* prose tweaks that keep paragraphs tight inside a small bubble.
* Code blocks get an `overflow-x-auto` so a long line of code doesn't
 * blow out the bubble's max-width; agent-to-agent replies routinely
* ship code samples and JSON. */
function MarkdownBody({
children,
className,
}: {
children: string;
className?: string;
}) {
return (
<div
className={`prose prose-sm prose-invert max-w-none [&>p]:mb-1 [&>p:last-child]:mb-0 [&_pre]:overflow-x-auto [&_table]:block [&_table]:overflow-x-auto ${className ?? ""}`}
>
<ReactMarkdown remarkPlugins={[remarkGfm]}>{children}</ReactMarkdown>
</div>
);
}

View File

@ -0,0 +1,94 @@
"use client";
// Small presentational components for chat attachments. Kept in a
// separate file so ChatTab.tsx stays focused on state + send/receive
// orchestration. Both variants share the file-icon + name + size
// layout; the only difference is the trailing action (remove for
// pending, download for completed).
import type { ChatAttachment } from "./types";
function formatSize(bytes: number | undefined): string {
if (bytes == null) return "";
if (bytes < 1024) return `${bytes} B`;
if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(0)} KB`;
return `${(bytes / (1024 * 1024)).toFixed(1)} MB`;
}
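// Worked examples: formatSize(512) → "512 B", formatSize(2048) → "2 KB",
// formatSize(1_572_864) → "1.5 MB"; undefined/null renders as "".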
/** Inline pill for a file that the user has picked but not yet sent.
* Renders above the textarea; clicking × pops it from the pending
* list without uploading. */
export function PendingAttachmentPill({
file,
onRemove,
}: {
file: File;
onRemove: () => void;
}) {
return (
<div className="flex items-center gap-1.5 rounded-md border border-zinc-700/60 bg-zinc-800/80 px-2 py-1 text-[10px] text-zinc-300 max-w-[200px]">
<FileGlyph className="text-zinc-400 shrink-0" />
<span className="truncate" title={file.name}>{file.name}</span>
<span className="text-zinc-500 shrink-0 tabular-nums">{formatSize(file.size)}</span>
<button
onClick={onRemove}
aria-label={`Remove ${file.name}`}
className="ml-0.5 text-zinc-500 hover:text-zinc-200 transition-colors shrink-0"
>
<svg width="10" height="10" viewBox="0 0 16 16" fill="none" aria-hidden="true">
<path d="M4 4l8 8M12 4l-8 8" stroke="currentColor" strokeWidth="1.6" strokeLinecap="round" />
</svg>
</button>
</div>
);
}
/** Chip rendered inside a message bubble for a sent/received file.
* Clicking triggers the download via the passed onDownload callback
* so the parent controls workspace-scoped URL resolution. */
export function AttachmentChip({
attachment,
onDownload,
tone,
}: {
attachment: ChatAttachment;
onDownload: (a: ChatAttachment) => void;
tone: "user" | "agent";
}) {
const toneClasses =
tone === "user"
? "border-blue-400/30 bg-blue-600/20 hover:bg-blue-600/30 text-blue-100"
: "border-zinc-600/50 bg-zinc-700/40 hover:bg-zinc-600/50 text-zinc-100";
return (
<button
onClick={() => onDownload(attachment)}
title={`Download ${attachment.name}`}
className={`flex items-center gap-1.5 rounded-md border px-2 py-1 text-[10px] transition-colors max-w-full ${toneClasses}`}
>
<FileGlyph className="shrink-0 opacity-70" />
<span className="truncate">{attachment.name}</span>
{attachment.size != null && (
<span className="opacity-60 shrink-0 tabular-nums">{formatSize(attachment.size)}</span>
)}
<DownloadGlyph className="opacity-70 shrink-0" />
</button>
);
}
function FileGlyph({ className }: { className?: string }) {
return (
<svg width="10" height="10" viewBox="0 0 16 16" fill="none" className={className} aria-hidden="true">
<path d="M4 2h5l3 3v9a1 1 0 0 1-1 1H4a1 1 0 0 1-1-1V3a1 1 0 0 1 1-1Z" stroke="currentColor" strokeWidth="1.3" strokeLinejoin="round" />
<path d="M9 2v3h3" stroke="currentColor" strokeWidth="1.3" strokeLinejoin="round" />
</svg>
);
}
function DownloadGlyph({ className }: { className?: string }) {
return (
<svg width="10" height="10" viewBox="0 0 16 16" fill="none" className={className} aria-hidden="true">
<path d="M8 2v9M4 7l4 4 4-4" stroke="currentColor" strokeWidth="1.4" strokeLinecap="round" strokeLinejoin="round" />
<path d="M3 13h10" stroke="currentColor" strokeWidth="1.4" strokeLinecap="round" />
</svg>
);
}

View File

@ -0,0 +1,174 @@
// @vitest-environment jsdom
import { describe, it, expect, vi } from "vitest";
// Stub the canvas store before importing the SUT — toCommMessage calls
// useCanvasStore.getState() inside resolveName to look up peer names,
// which would otherwise hit the real Zustand store.
vi.mock("@/store/canvas", () => ({
useCanvasStore: {
getState: () => ({
nodes: [
{ id: "ws-self", data: { name: "Self" } },
{ id: "ws-peer", data: { name: "Peer Agent" } },
],
}),
},
}));
import { toCommMessage, type ActivityEntry } from "../AgentCommsPanel";
const SELF = "ws-self";
const PEER = "ws-peer";
function makeEntry(overrides: Partial<ActivityEntry> = {}): ActivityEntry {
return {
id: "act-1",
activity_type: "a2a_send",
source_id: SELF,
target_id: PEER,
method: "message/send",
summary: "Delegating to Peer Agent",
request_body: null,
response_body: null,
status: "ok",
created_at: "2026-04-25T18:00:00Z",
...overrides,
};
}
describe("toCommMessage — flow derivation", () => {
it("a2a_send is always outbound (flow=out, peer=target)", () => {
const m = toCommMessage(
makeEntry({ activity_type: "a2a_send", source_id: SELF, target_id: PEER }),
SELF,
);
expect(m).toBeTruthy();
expect(m!.flow).toBe("out");
expect(m!.peerId).toBe(PEER);
expect(m!.peerName).toBe("Peer Agent");
});
it("a2a_receive from a peer (peer-initiated call) is inbound", () => {
// Real incoming call: source = peer, target = us.
const m = toCommMessage(
makeEntry({
activity_type: "a2a_receive",
source_id: PEER,
target_id: SELF,
}),
SELF,
);
expect(m!.flow).toBe("in");
expect(m!.peerId).toBe(PEER);
expect(m!.peerName).toBe("Peer Agent");
});
it("a2a_receive self-logged by our runtime AFTER an outbound call is OUTBOUND from the user's POV", () => {
// workspace/a2a_tools.py:181 self-logs an a2a_receive on the
// CALLER's workspace_id with source_id=us, target_id=peer.
// From the user's perspective this row belongs to the outbound
// delegation thread — render flow=out + peer=target so the
// bubble right-justifies under "Delegating to peer" and the
// Restart button targets the actual peer (NOT us). Regression
// for the bug where these rows rendered as "← From Self" with
// a Restart button that would have restarted the user's own
// workspace.
const m = toCommMessage(
makeEntry({
activity_type: "a2a_receive",
source_id: SELF,
target_id: PEER,
summary: "Peer Agent failed",
status: "error",
}),
SELF,
);
expect(m!.flow).toBe("out");
expect(m!.peerId).toBe(PEER);
expect(m!.peerName).toBe("Peer Agent");
expect(m!.status).toBe("error");
});
it("returns null when no peer can be resolved", () => {
// a2a_receive with both ids null — discard rather than render a
// ghost bubble pointing at "Unknown".
const m = toCommMessage(
makeEntry({
activity_type: "a2a_receive",
source_id: null,
target_id: null,
}),
SELF,
);
expect(m).toBeNull();
});
it("propagates status through to the message (drives error rendering)", () => {
const m = toCommMessage(
makeEntry({ status: "error", activity_type: "a2a_send" }),
SELF,
);
expect(m!.status).toBe("error");
});
// --- delegation rows ---
// The platform's /delegate handler writes activity_type='delegation'
// for both the initial outbound (method='delegate') and the eventual
// reply (method='delegate_result', status=queued|completed|failed).
// Pre-fix the panel filtered these out and showed "no agent comms"
// even when 6+ delegations existed in the DB.
it("delegation 'delegate' row maps as outbound to target", () => {
const m = toCommMessage(
makeEntry({
activity_type: "delegation",
method: "delegate",
source_id: SELF,
target_id: PEER,
summary: "Delegating to ws-peer",
status: "pending",
}),
SELF,
);
expect(m).toBeTruthy();
expect(m!.flow).toBe("out");
expect(m!.peerId).toBe(PEER);
expect(m!.peerName).toBe("Peer Agent");
expect(m!.text).toBe("Delegating to ws-peer");
expect(m!.status).toBe("pending");
});
it("delegation 'delegate_result' queued row preserves status='queued'", () => {
// The "queued" status is the load-bearing signal the LLM uses to
// decide whether to wait or fall back. If toCommMessage drops or
// rewrites it, the UI loses the ability to show the "peer busy,
// will reply" affordance.
const m = toCommMessage(
makeEntry({
activity_type: "delegation",
method: "delegate_result",
source_id: SELF,
target_id: PEER,
summary: "Delegation queued — target at capacity",
status: "queued",
}),
SELF,
);
expect(m!.status).toBe("queued");
expect(m!.text).toContain("queued");
});
it("delegation row with no target_id returns null", () => {
// Defensive: a delegation row missing target_id can't be rendered
// (we wouldn't know which peer to attribute it to). Drop instead
// of rendering a ghost.
const m = toCommMessage(
makeEntry({
activity_type: "delegation",
target_id: null,
}),
SELF,
);
expect(m).toBeNull();
});
});

View File

@ -0,0 +1,67 @@
import { describe, it, expect } from "vitest";
import { inferA2AErrorHint } from "../a2aErrorHint";
// Pure logic. Pin every named pattern so a future contributor adding a
// new symptom doesn't accidentally collapse the buckets — and so the
// "most specific first" ordering can't drift without a test failing.
describe("inferA2AErrorHint", () => {
it("matches the Claude Code SDK init wedge specifically", () => {
const hint = inferA2AErrorHint("Control request timeout: initialize");
expect(hint).toMatch(/Claude Code SDK is wedged/);
});
it("does NOT misfire on user tasks containing 'initialize' generally", () => {
// Regression: an earlier bare-`initialize` pattern would have
// false-positived "failed to initialize database" into the SDK
// wedge hint. Confirm the full-phrase guard holds.
const hint = inferA2AErrorHint("failed to initialize database connection");
expect(hint).not.toMatch(/Claude Code SDK/);
});
it("recognises httpx ReadTimeout / ConnectTimeout class names", () => {
expect(inferA2AErrorHint("ReadTimeout: timeout")).toMatch(/proxy timeout/);
expect(inferA2AErrorHint("ConnectTimeout: ...")).toMatch(/proxy timeout/);
});
it("recognises generic timeout / deadline-exceeded language", () => {
expect(inferA2AErrorHint("deadline exceeded after 300s")).toMatch(/proxy timeout/);
expect(inferA2AErrorHint("Operation timeout")).toMatch(/proxy timeout/);
});
it("handles connection-reset family (RemoteProtocolError, ConnectionReset, no-message)", () => {
expect(inferA2AErrorHint("RemoteProtocolError: ...")).toMatch(/connection.*dropped/);
expect(inferA2AErrorHint("ConnectionResetError")).toMatch(/connection.*dropped/);
expect(inferA2AErrorHint("connection reset by peer")).toMatch(/connection.*dropped/);
expect(inferA2AErrorHint("RemoteProtocolError (no message — likely connection reset)")).toMatch(/connection.*dropped/);
});
it("recognises agent-runtime exceptions", () => {
expect(inferA2AErrorHint("Agent error: ValueError raised")).toMatch(/runtime threw an exception/);
expect(inferA2AErrorHint("RuntimeException in tool call")).toMatch(/runtime threw an exception/);
});
it("recognises peer-unreachable cases (Activity-tab originals)", () => {
expect(inferA2AErrorHint("workspace not found")).toMatch(/can't be reached/);
expect(inferA2AErrorHint("not accessible")).toMatch(/can't be reached/);
expect(inferA2AErrorHint("workspace is offline")).toMatch(/can't be reached/);
});
it("returns the empty-detail-specific hint when input is exactly empty", () => {
expect(inferA2AErrorHint("")).toMatch(/no error detail/);
});
it("returns a generic fallback for unrecognised text", () => {
const hint = inferA2AErrorHint("some completely novel error nobody has matched yet");
expect(hint).toMatch(/Check the workspace logs|delivery failure/);
});
it("Claude SDK wedge wins over the more general timeout pattern", () => {
// Both 'control request timeout' and 'timeout' match the same
// input. The SDK wedge hint is more actionable; the ordering in
// the function must keep it first. Lock that priority in.
const hint = inferA2AErrorHint("Control request timeout: initialize");
expect(hint).toMatch(/Claude Code SDK/);
expect(hint).not.toMatch(/proxy timeout/);
});
});

View File

@ -0,0 +1,41 @@
import { describe, it, expect } from "vitest";
import { ACTIVITY_LOG_WINDOW, appendActivityLine } from "../activityLog";
describe("appendActivityLine", () => {
it("appends a fresh line", () => {
expect(appendActivityLine([], "📄 Read /a")).toEqual(["📄 Read /a"]);
});
it("collapses an immediate duplicate", () => {
const prev = ["📄 Read /a"];
// Same exact string twice in a row is noise — the helper should
// return the original array reference, not a new one.
expect(appendActivityLine(prev, "📄 Read /a")).toBe(prev);
});
it("keeps non-adjacent duplicates", () => {
const prev = ["📄 Read /a", "⚡ Bash: ls"];
expect(appendActivityLine(prev, "📄 Read /a")).toEqual([
"📄 Read /a",
"⚡ Bash: ls",
"📄 Read /a",
]);
});
it("rolls off the oldest line when the window fills", () => {
const seed = Array.from({ length: ACTIVITY_LOG_WINDOW }, (_, i) => `line-${i}`);
const next = appendActivityLine(seed, "newest");
expect(next.length).toBe(ACTIVITY_LOG_WINDOW);
expect(next[next.length - 1]).toBe("newest");
// Oldest entry is dropped — line-0 is gone.
expect(next[0]).toBe("line-1");
});
it("keeps the original array reference when below the window cap", () => {
const prev = ["a", "b"];
const next = appendActivityLine(prev, "c");
// Returned a new array (we appended); must NOT mutate prev.
expect(prev).toEqual(["a", "b"]);
expect(next).toEqual(["a", "b", "c"]);
});
});

View File

@ -4,6 +4,7 @@ import {
extractResponseText,
extractAgentText,
extractTextsFromParts,
extractFilesFromTask,
} from "../message-parser";
describe("extractRequestText", () => {
@ -99,6 +100,67 @@ describe("extractResponseText", () => {
it("returns empty when result has no parts", () => {
expect(extractResponseText({ result: { other: true } })).toBe("");
});
// Regression: Claude Code (and other long-reply runtimes) emits
// multi-part text replies. The previous implementation returned
// only the first part, silently truncating the rest. Observed
// 2026-04-25 on a 15k-char Wave 1 brief that rendered as just the
// markdown table header.
it("joins all text parts when result.parts has multiple", () => {
const body = {
result: {
parts: [
{ kind: "text", text: "# Header" },
{ kind: "text", text: "| Col |" },
{ kind: "text", text: "| --- |" },
{ kind: "text", text: "| Row |" },
],
},
};
expect(extractResponseText(body)).toBe("# Header\n| Col |\n| --- |\n| Row |");
});
it("joins all text parts across multiple artifacts", () => {
const body = {
result: {
artifacts: [
{ parts: [{ kind: "text", text: "First artifact" }] },
{ parts: [{ kind: "text", text: "Second artifact" }] },
],
},
};
expect(extractResponseText(body)).toBe("First artifact\nSecond artifact");
});
it("joins all .root.text variants when present", () => {
const body = {
result: {
parts: [
{ root: { text: "alpha" } },
{ root: { text: "beta" } },
],
},
};
expect(extractResponseText(body)).toBe("alpha\nbeta");
});
// Regression: when a response carries BOTH parts and artifacts
// (Hermes tool-call replies do this — summary in parts, detail in
// artifacts), the early-return-on-parts implementation silently
// dropped the artifacts body. The collected-from-every-source
// implementation must surface both.
it("collects text from BOTH result.parts AND result.artifacts when both present", () => {
const body = {
result: {
parts: [{ kind: "text", text: "Summary" }],
artifacts: [
{ parts: [{ kind: "text", text: "Detail block one" }] },
{ parts: [{ kind: "text", text: "Detail block two" }] },
],
},
};
expect(extractResponseText(body)).toBe("Summary\nDetail block one\nDetail block two");
});
});
describe("extractTextsFromParts", () => {
@ -133,3 +195,71 @@ describe("extractTextsFromParts", () => {
expect(extractTextsFromParts(parts)).toBe("Only text");
});
});
describe("extractFilesFromTask", () => {
it("pulls A2A file parts out of a result", () => {
const task = {
parts: [
{ kind: "text", text: "here's the report" },
{
kind: "file",
file: { name: "report.pdf", mimeType: "application/pdf", uri: "workspace:/reports/report.pdf", size: 4096 },
},
],
};
const files = extractFilesFromTask(task);
expect(files).toEqual([
{ name: "report.pdf", mimeType: "application/pdf", uri: "workspace:/reports/report.pdf", size: 4096 },
]);
});
it("recovers a filename from the URI when `name` is absent", () => {
const task = {
parts: [
{ kind: "file", file: { uri: "workspace:/workspace/out/graph.png" } },
],
};
const files = extractFilesFromTask(task);
expect(files[0].name).toBe("graph.png");
});
it("skips file parts without a URI (inline bytes are not supported yet)", () => {
const task = {
parts: [
{ kind: "file", file: { name: "inline.bin", bytes: "AAA=" } },
],
};
expect(extractFilesFromTask(task)).toEqual([]);
});
it("walks artifacts[] so file parts nested inside artifact envelopes are found", () => {
const task = {
artifacts: [
{
parts: [
{ kind: "file", file: { name: "trace.log", uri: "workspace:/logs/trace.log" } },
],
},
],
};
const files = extractFilesFromTask(task);
expect(files[0]).toMatchObject({ name: "trace.log", uri: "workspace:/logs/trace.log" });
});
it("returns [] on malformed input rather than throwing", () => {
expect(extractFilesFromTask({})).toEqual([]);
expect(extractFilesFromTask({ parts: "not-an-array" } as unknown as Record<string, unknown>)).toEqual([]);
});
it("walks result.message.parts — the non-task reply shape some A2A servers use", () => {
const task = {
message: {
parts: [
{ kind: "file", file: { name: "out.txt", uri: "workspace:/workspace/out.txt" } },
],
},
};
const files = extractFilesFromTask(task);
expect(files[0]).toMatchObject({ name: "out.txt", uri: "workspace:/workspace/out.txt" });
});
});

View File

@ -0,0 +1,41 @@
import { describe, it, expect } from "vitest";
import { resolveAttachmentHref } from "../uploads";
describe("resolveAttachmentHref — URI scheme normalisation", () => {
const wsId = "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee";
it("rewrites the canonical workspace:<path> scheme to /chat/download", () => {
const url = resolveAttachmentHref(wsId, "workspace:/workspace/report.pdf");
expect(url).toContain(`/workspaces/${wsId}/chat/download`);
expect(url).toContain(encodeURIComponent("/workspace/report.pdf"));
});
it("accepts bare absolute container paths (some agents omit the scheme)", () => {
const url = resolveAttachmentHref(wsId, "/workspace/report.pdf");
expect(url).toContain(`/workspaces/${wsId}/chat/download`);
expect(url).toContain(encodeURIComponent("/workspace/report.pdf"));
});
it("accepts file:/// URIs pointing into an allowed root", () => {
const url = resolveAttachmentHref(wsId, "file:///workspace/report.pdf");
expect(url).toContain(`/workspaces/${wsId}/chat/download`);
expect(url).toContain(encodeURIComponent("/workspace/report.pdf"));
});
it("passes through HTTP(S) URIs unchanged so off-platform artefacts still render", () => {
const external = "https://example.com/static/report.pdf";
expect(resolveAttachmentHref(wsId, external)).toBe(external);
});
it("passes through container paths that are not under any allowed root", () => {
// /etc/passwd looks like a path but isn't one of the allowed
// roots — falling back to raw passthrough forces the caller into
// the external-URL branch, which opens a new tab and lets the
// browser refuse. Rewriting would 400 anyway server-side.
expect(resolveAttachmentHref(wsId, "/etc/passwd")).toBe("/etc/passwd");
});
it("passes through unknown schemes unchanged", () => {
expect(resolveAttachmentHref(wsId, "s3://bucket/key")).toBe("s3://bucket/key");
});
});

View File

@ -0,0 +1,54 @@
/**
* Maps an A2A delivery-failure detail string (the bit AFTER stripping
* the [A2A_ERROR] sentinel prefix) to a one-line operator-actionable
* hint. Pattern matches are lowercase substring checks, ordered most-
* specific first so the right hint wins when multiple patterns
* overlap (e.g. "control request timeout" wins over generic "timeout").
*
* Used by both the chat Agent Comms panel and the Activity tab so the
* same symptom reads identically across surfaces. Two prior copies
* had already drifted (Activity tab gained `not found`/`offline`
 * cases AgentCommsPanel never picked up); this module is the merged
* superset and the only place hint text should change.
*/
export function inferA2AErrorHint(detail: string): string {
const t = detail.toLowerCase();
// "control request timeout" is the specific Claude Code SDK init
// wedge symptom. Pattern on the full phrase, not bare "initialize"
// — a user task containing "failed to initialize database" would
// false-positive into the SDK-wedge hint.
if (t.includes("control request timeout")) {
return "The remote agent's Claude Code SDK is wedged on initialization (often after a long idle period or OAuth refresh). A workspace restart usually clears it.";
}
if (
t.includes("readtimeout") ||
t.includes("connecttimeout") ||
t.includes("deadline exceeded") ||
t.includes("timeout")
) {
return "The remote agent didn't respond within the proxy timeout. It may be busy with a long task, or the runtime is stuck — restart the workspace if this repeats.";
}
if (
t.includes("connectionreset") ||
t.includes("remoteprotocolerror") ||
t.includes("connection reset") ||
t.includes("no message")
) {
return "The connection to the remote agent dropped before a reply arrived. Usually a transient network blip — retry once. If it repeats, the remote container may have crashed mid-request; check its logs.";
}
if (t.includes("agent error") || t.includes("exception")) {
return "The remote agent's runtime threw an exception. Check the workspace's container logs for the traceback. Restart usually clears transient runtime crashes.";
}
if (
t.includes("not found") ||
t.includes("not accessible") ||
t.includes("offline")
) {
return "The remote workspace can't be reached — it may be stopped, removed, or outside the access control list. Verify the peer is online before retrying.";
}
if (detail === "") {
return "The remote agent returned no error detail (the underlying httpx exception had an empty message — typically a connection-reset or silent timeout). A workspace restart is the safe first move.";
}
return "The remote agent reported a delivery failure. Check the workspace logs or try restarting.";
}

View File

@ -0,0 +1,23 @@
/**
* Sliding-window log for the in-chat activity feed (the live progress
* lines under the spinner while a chat reply is in flight).
*
* Sized to fit the spinner area without forcing a scroll; per-tool-use
* rows from the workspace's _report_tool_use can fire dozens per turn
* (Read 5 files + Grep + Bash + Edits + delegations), so a too-small
* window flushes useful early context before the user can read it.
*
 * Consecutive identical lines collapse to a single entry: the same
* tool repeated on the same target (e.g. Read of the same file twice
* within a turn) is noise, not new progress.
*/
export const ACTIVITY_LOG_WINDOW = 20;
export function appendActivityLine(prev: string[], line: string): string[] {
if (prev[prev.length - 1] === line) return prev; // collapse duplicates
const next =
prev.length >= ACTIVITY_LOG_WINDOW
? prev.slice(-(ACTIVITY_LOG_WINDOW - 1))
: prev;
return [...next, line];
}

View File

@ -32,6 +32,64 @@ export function extractTextsFromParts(parts: unknown): string | null {
return texts.length > 0 ? texts.join("\n") : null;
}
export interface ParsedFilePart {
name: string;
uri: string;
mimeType?: string;
size?: number;
}
/** Extract file parts from an A2A response. Walks parts[] + artifacts[].
* Per the A2A spec a file part looks like:
* { kind: "file", file: { name, mimeType, uri | bytes } }
 * We only surface parts that carry a `uri`; inline bytes would
* require a different renderer (data URL) and are out of scope for
* MVP. Names fall back to the URI's basename when absent. */
export function extractFilesFromTask(task: Record<string, unknown>): ParsedFilePart[] {
const out: ParsedFilePart[] = [];
const pushFromParts = (parts: unknown) => {
if (!Array.isArray(parts)) return;
for (const raw of parts as Array<Record<string, unknown>>) {
if (raw.kind !== "file" && raw.type !== "file") continue;
const file = (raw.file ?? raw) as Record<string, unknown>;
const uri = typeof file.uri === "string" ? file.uri : "";
if (!uri) continue;
const name = (typeof file.name === "string" && file.name) || basename(uri);
out.push({
name,
uri,
mimeType: typeof file.mimeType === "string" ? file.mimeType : undefined,
size: typeof file.size === "number" ? file.size : undefined,
});
}
};
try {
pushFromParts(task.parts);
const artifacts = task.artifacts as Array<Record<string, unknown>> | undefined;
if (artifacts) for (const a of artifacts) pushFromParts(a.parts);
const status = task.status as Record<string, unknown> | undefined;
if (status?.message) {
const msg = status.message as Record<string, unknown>;
pushFromParts(msg.parts);
}
// Some A2A servers wrap a non-task reply as
// {result: {message: {parts: [...]}}} rather than {result: {parts}}.
// Without this branch we'd silently drop file parts returned by
// third-party implementations.
const message = task.message as Record<string, unknown> | undefined;
if (message) pushFromParts(message.parts);
} catch {
/* tolerate malformed shapes — chat falls through to text-only */
}
return out;
}
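// e.g. basename("workspace:/workspace/out/graph.png") → "graph.png";
// degrades to "file" when nothing path-like survives the stripping.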
function basename(uri: string): string {
const cleaned = uri.replace(/^workspace:/, "").replace(/^https?:\/\//, "");
const slash = cleaned.lastIndexOf("/");
return slash >= 0 ? cleaned.slice(slash + 1) : cleaned || "file";
}
/** Extract user message text from an activity log request_body */
export function extractRequestText(body: Record<string, unknown> | null): string {
if (!body) return "";
@ -41,22 +99,54 @@ export function extractRequestText(body: Record<string, unknown> | null): string
return (parts?.[0]?.text as string) || "";
}
/** Extract text from an activity log response_body (multiple possible formats) */
/** Extract text from an activity log response_body (multiple possible formats).
*
 * Collects from EVERY source: top-level `parts[].text`, `parts[].root.text`
* (older nested shape), and `artifacts[].parts[].text` (task-shaped
* replies) and joins them with "\n". Two reasons to collect rather
* than early-return:
*
* 1. Claude Code and other long-reply runtimes emit multiple text
* parts in a single `parts` array. Returning just the first
* silently truncates 15k-char briefs to their leading line
* (observed UX A/B Lab Wave 1, 2026-04-25).
*
* 2. Some producers emit a summary in `parts[].text` AND details in
* `artifacts[].parts[].text` (Hermes does this for tool calls).
* The previous "first source wins" returned only the summary;
* artifacts dropped silently. */
export function extractResponseText(body: Record<string, unknown>): string {
try {
// {result: "text"} — from MCP server delegation logs
if (typeof body.result === "string") return body.result;
// A2A JSON-RPC response: {result: {parts: [{kind: "text", text: "..."}]}}
const result = body.result as Record<string, unknown> | undefined;
if (result) {
const collected: string[] = [];
// A2A JSON-RPC: {result: {parts: [{kind: "text", text: "..."}]}}
const fromParts = extractTextsFromParts(result.parts);
if (fromParts) collected.push(fromParts);
// Older nested shape: {parts: [{root: {text: "..."}}]}
const parts = (result.parts || []) as Array<Record<string, unknown>>;
const rootTexts: string[] = [];
for (const p of parts) {
const t = (p.text as string) || "";
if (t) return t;
const root = p.root as Record<string, unknown> | undefined;
if (root?.text) return root.text as string;
if (root?.text) rootTexts.push(root.text as string);
}
if (rootTexts.length > 0) collected.push(rootTexts.join("\n"));
// Task shape: {result: {artifacts: [{parts: [...]}]}}
const artifacts = result.artifacts as Array<Record<string, unknown>> | undefined;
if (artifacts) {
for (const a of artifacts) {
const t = extractTextsFromParts(a.parts);
if (t) collected.push(t);
}
}
if (collected.length > 0) return collected.join("\n");
}
// {task: "text"} — request body format, shouldn't be in response but handle it

View File

@ -1,12 +1,38 @@
/** One file attached to a chat message. Shared shape for both
* directions: when a user attaches a file the UI uploads it and
* stashes the returned metadata here; when an agent returns a
* `kind: file` part in an A2A response, the parser populates the
* same fields. `uri` uses the `workspace:<abs-path>` scheme the
 * server returns; the renderer translates that to a download
* request against GET /workspaces/:id/chat/download. */
export interface ChatAttachment {
name: string;
uri: string;
mimeType?: string;
size?: number;
}
export interface ChatMessage {
id: string;
role: "user" | "agent" | "system";
content: string;
/** Attachments sent with or returned alongside this message. */
attachments?: ChatAttachment[];
timestamp: string; // ISO string for serialization
}
export function createMessage(role: ChatMessage["role"], content: string): ChatMessage {
return { id: crypto.randomUUID(), role, content, timestamp: new Date().toISOString() };
export function createMessage(
role: ChatMessage["role"],
content: string,
attachments?: ChatAttachment[],
): ChatMessage {
return {
id: crypto.randomUUID(),
role,
content,
attachments: attachments && attachments.length > 0 ? attachments : undefined,
timestamp: new Date().toISOString(),
};
}
// appendMessageDeduped adds a ChatMessage to `prev` unless the tail
@ -25,11 +51,23 @@ export function createMessage(role: ChatMessage["role"], content: string): ChatM
// messages ("hi", "hi") from a real user/agent still render.
export function appendMessageDeduped(prev: ChatMessage[], msg: ChatMessage, dedupeWindowMs = 3000): ChatMessage[] {
const cutoff = Date.now() - dedupeWindowMs;
const sig = attachmentSignature(msg.attachments);
const alreadyThere = prev.some((m) => {
if (m.role !== msg.role || m.content !== msg.content) return false;
// Attachments participate in the dedupe key so a text-only push
// doesn't shadow the file-carrying HTTP response (and vice versa).
// When both carry the same text AND the same files, collapse.
if (attachmentSignature(m.attachments) !== sig) return false;
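// Date.parse yields NaN for malformed timestamps; those rows fail
// the window check below and never suppress a new message.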
const t = Date.parse(m.timestamp);
return !Number.isNaN(t) && t >= cutoff;
});
if (alreadyThere) return prev;
return [...prev, msg];
}
function attachmentSignature(atts: ChatAttachment[] | undefined): string {
if (!atts || atts.length === 0) return "";
// URI is the stable identity — name can differ across delivery
// paths (agent vs our parser's basename fallback).
return atts.map((a) => a.uri).sort().join("|");
}
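// A minimal dedupe sketch (illustrative only, not part of this module's
// exports): same text with different attachment sets produces different
// signatures, so both messages survive; an exact twin arriving inside
// the 3s window is dropped.
function dedupeSketch(): ChatMessage[] {
  const withFile = createMessage("agent", "done", [
    { name: "report.pdf", uri: "workspace:/workspace/report.pdf" },
  ]);
  const textOnly = createMessage("agent", "done");
  let log: ChatMessage[] = [];
  log = appendMessageDeduped(log, withFile);
  log = appendMessageDeduped(log, textOnly); // kept: signature differs
  log = appendMessageDeduped(log, textOnly); // dropped: same signature, inside window
  return log; // length 2
}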

View File

@ -0,0 +1,135 @@
import { PLATFORM_URL } from "@/lib/api";
import { getTenantSlug } from "@/lib/tenant";
import type { ChatAttachment } from "./types";
/** Chat attachments are intentionally uploaded via a direct fetch()
 * instead of the `api.post` helper; `api.post` JSON-stringifies the
* body, which would 500 on a Blob. Mirrors the header plumbing
* (tenant slug, admin token, credentials) so SaaS + self-hosted
* callers work the same way. */
export async function uploadChatFiles(
workspaceId: string,
files: File[],
): Promise<ChatAttachment[]> {
if (files.length === 0) return [];
const form = new FormData();
for (const f of files) form.append("files", f, f.name);
const headers: Record<string, string> = {};
const slug = getTenantSlug();
if (slug) headers["X-Molecule-Org-Slug"] = slug;
const adminToken = process.env.NEXT_PUBLIC_ADMIN_TOKEN;
if (adminToken) headers["Authorization"] = `Bearer ${adminToken}`;
// Uploads legitimately take a while on cold cache (tar write +
// docker cp into the container). 60s is comfortable for the 25MB/
// 50MB caps the server enforces.
const res = await fetch(`${PLATFORM_URL}/workspaces/${workspaceId}/chat/uploads`, {
method: "POST",
headers,
body: form,
credentials: "include",
signal: AbortSignal.timeout(60_000),
});
if (!res.ok) {
const text = await res.text().catch(() => "");
throw new Error(`upload failed: ${res.status} ${text}`);
}
const json = (await res.json()) as { files: ChatAttachment[] };
return json.files ?? [];
}
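// Hedged usage sketch (the surrounding chat composer is an assumption;
// only uploadChatFiles and the ChatAttachment shape are real): collect
// files from an <input type="file">, upload them, and hand the returned
// metadata to whatever builds the outgoing message.
async function attachFromInputSketch(
  workspaceId: string,
  input: HTMLInputElement,
): Promise<ChatAttachment[]> {
  const files = Array.from(input.files ?? []);
  if (files.length === 0) return [];
  // The server rejects oversize files anyway; an early client-side
  // size check here would only improve the error message.
  return uploadChatFiles(workspaceId, files);
}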
/** Resolve a file URI into a browser-downloadable URL. Accepts:
* - `workspace:<abs-path>` (our canonical form)
* - `file:///workspace/...` (some agents emit this)
* - `/workspace/...` (bare absolute path inside the container)
* Everything that looks like an allowed-root container path is
* rewritten to the authenticated /chat/download endpoint. HTTP(S)
* URIs pass through unchanged so we can also render links to
* artefacts hosted off-platform. Unknown schemes fall back to the
 * raw URI; the caller gets to decide how to render it. */
export function resolveAttachmentHref(
workspaceId: string,
uri: string,
): string {
const containerPath = normalizeWorkspaceUri(uri);
if (containerPath) {
return `${PLATFORM_URL}/workspaces/${workspaceId}/chat/download?path=${encodeURIComponent(containerPath)}`;
}
return uri;
}
/** Extracts the absolute container path from a workspace-scoped URI,
* or null if the URI isn't a container path. The matching roots
* mirror the server's `allowedRoots` allowlist. */
const ALLOWED_CONTAINER_ROOTS = ["/configs", "/workspace", "/home", "/plugins"];
function normalizeWorkspaceUri(uri: string): string | null {
let path: string | null = null;
if (uri.startsWith("workspace:")) {
path = uri.slice("workspace:".length);
} else if (uri.startsWith("file:///")) {
path = uri.slice("file://".length); // keep the leading slash
} else if (uri.startsWith("/")) {
path = uri;
}
if (!path) return null;
// Only rewrite when the path lands in an allowed root; otherwise
// return null so the caller falls through to raw-URI handling
// (which will open a new tab for HTTP-ish schemes).
for (const root of ALLOWED_CONTAINER_ROOTS) {
if (path === root || path.startsWith(root + "/")) return path;
}
return null;
}
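// Worked examples (workspace id and PLATFORM_URL values are
// illustrative): all three container-path spellings normalize to the
// same authenticated download URL, while off-platform links pass
// through untouched.
//
//   resolveAttachmentHref("ws-1", "workspace:/workspace/out.csv")
//   resolveAttachmentHref("ws-1", "file:///workspace/out.csv")
//   resolveAttachmentHref("ws-1", "/workspace/out.csv")
//     → "<PLATFORM_URL>/workspaces/ws-1/chat/download?path=%2Fworkspace%2Fout.csv"
//
//   resolveAttachmentHref("ws-1", "https://example.com/report.png")
//     → "https://example.com/report.png" (unchanged)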
/** Trigger a browser download for an attachment. Uses fetch+blob
* rather than an anchor navigation because the download endpoint
* requires workspace auth and the browser won't attach
* `Authorization: Bearer` or `X-Molecule-Org-Slug` to a bare anchor
* click. A 25MB per-file cap server-side keeps the blob buffer
* bounded. HTTP(S) URIs skip the fetch path and open directly
* since they're off-platform artefacts that we don't own auth for. */
export async function downloadChatFile(
workspaceId: string,
attachment: ChatAttachment,
): Promise<void> {
const href = resolveAttachmentHref(workspaceId, attachment.uri);
const isContainerPath = normalizeWorkspaceUri(attachment.uri) !== null;
if (!isContainerPath) {
// External URL — let the browser navigate. Opens in new tab so
// the canvas context survives a navigation. `href` here is the
// raw URI (http(s), or anything else the agent sent back).
window.open(href, "_blank", "noopener,noreferrer");
return;
}
const headers: Record<string, string> = {};
const slug = getTenantSlug();
if (slug) headers["X-Molecule-Org-Slug"] = slug;
const adminToken = process.env.NEXT_PUBLIC_ADMIN_TOKEN;
if (adminToken) headers["Authorization"] = `Bearer ${adminToken}`;
const res = await fetch(href, {
headers,
credentials: "include",
signal: AbortSignal.timeout(60_000),
});
if (!res.ok) {
throw new Error(`download failed: ${res.status}`);
}
const blob = await res.blob();
// Revoke the object URL after the click — browsers hold the blob
// until the URL is either revoked or the document unloads. 30s is
// plenty of headroom for the click → save dialog round-trip.
const url = URL.createObjectURL(blob);
const a = document.createElement("a");
a.href = url;
a.download = attachment.name;
a.rel = "noopener";
document.body.appendChild(a);
a.click();
a.remove();
setTimeout(() => URL.revokeObjectURL(url), 30_000);
}
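// Hedged call-site sketch: an attachment chip's click handler. The
// toast helper is an assumption; only downloadChatFile is real. Catch
// here so a 404/expired-path rejection doesn't escape the handler.
async function onAttachmentClickSketch(
  workspaceId: string,
  att: ChatAttachment,
): Promise<void> {
  try {
    await downloadChatFile(workspaceId, att);
  } catch (e) {
    console.error("attachment download failed", e); // swap for the app's toast
  }
}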

View File

@ -0,0 +1,316 @@
// @vitest-environment jsdom
/**
 * Tests for useTemplateDeploy, the shared preflight + POST + modal
* hook used by TemplatePalette (sidebar) and EmptyState (welcome grid).
*
* Behavioural coverage for the three flows the hook owns:
 * 1. Happy path: preflight ok → POST /workspaces → onDeployed fires
* 2. Preflight errors: network throw vs not-ok-with-missing-keys
 *    (different code paths; the throw must NOT strand `deploying`,
* see the inline comment in the SUT for the prior bug)
* 3. Modal lifecycle: keys-added retries POST without re-running
* preflight; cancel closes without POST
*
* Issue: #2071 (Canvas test gaps follow-up).
*/
import {
describe,
it,
expect,
vi,
beforeEach,
afterEach,
type Mock,
} from "vitest";
import { act, render, cleanup, screen, fireEvent } from "@testing-library/react";
import { renderHook } from "@testing-library/react";
import type { Template } from "@/lib/deploy-preflight";
// ── Hoisted mocks ────────────────────────────────────────────────────────────
const { mockApiPost, mockCheckDeploySecrets, mockResolveRuntime } = vi.hoisted(
() => ({
mockApiPost: vi.fn(),
mockCheckDeploySecrets: vi.fn(),
mockResolveRuntime: vi.fn(),
}),
);
vi.mock("@/lib/api", () => ({
api: { post: mockApiPost },
}));
vi.mock("@/lib/deploy-preflight", async () => {
// Re-export the real types; only swap the runtime functions.
const actual = await vi.importActual<
typeof import("@/lib/deploy-preflight")
>("@/lib/deploy-preflight");
return {
...actual,
checkDeploySecrets: mockCheckDeploySecrets,
resolveRuntime: mockResolveRuntime,
};
});
// MissingKeysModal: render a minimal stand-in that exposes the two
// callbacks the hook wires up. The real modal pulls in radix + the
// secrets store, neither of which is relevant to this hook's behavior.
vi.mock("@/components/MissingKeysModal", () => ({
MissingKeysModal: (props: {
open: boolean;
onKeysAdded: () => void;
onCancel: () => void;
}) =>
props.open ? (
<div data-testid="missing-keys-modal">
<button data-testid="modal-keys-added" onClick={props.onKeysAdded}>
keys added
</button>
<button data-testid="modal-cancel" onClick={props.onCancel}>
cancel
</button>
</div>
) : null,
}));
// Import the hook AFTER the mocks are declared.
import { useTemplateDeploy } from "../useTemplateDeploy";
// ── Helpers ──────────────────────────────────────────────────────────────────
function makeTemplate(over: Partial<Template> = {}): Template {
return {
id: "claude-code-default",
name: "Claude Code",
description: "",
tier: 1,
model: "claude-sonnet-4-5",
skills: [],
skill_count: 0,
runtime: "claude-code",
models: [],
required_env: [],
...over,
};
}
beforeEach(() => {
mockApiPost.mockReset();
mockCheckDeploySecrets.mockReset();
mockResolveRuntime.mockReset();
// Default: identity-mapped runtime, preflight passes.
mockResolveRuntime.mockImplementation((id: string) => id);
mockCheckDeploySecrets.mockResolvedValue({
ok: true,
missingKeys: [],
providers: [],
runtime: "claude-code",
});
mockApiPost.mockResolvedValue({ id: "ws-new" });
});
afterEach(() => {
cleanup();
});
// ── Tests ────────────────────────────────────────────────────────────────────
describe("useTemplateDeploy — happy path", () => {
it("preflight ok → POST /workspaces → onDeployed fires with new id", async () => {
const onDeployed = vi.fn();
const { result } = renderHook(() => useTemplateDeploy({ onDeployed }));
await act(async () => {
await result.current.deploy(makeTemplate());
});
expect(mockCheckDeploySecrets).toHaveBeenCalledTimes(1);
expect(mockApiPost).toHaveBeenCalledWith(
"/workspaces",
expect.objectContaining({
name: "Claude Code",
template: "claude-code-default",
tier: 1,
}),
);
expect(onDeployed).toHaveBeenCalledWith("ws-new");
expect(result.current.deploying).toBeNull();
expect(result.current.error).toBeNull();
});
it("uses caller-supplied canvasCoords when provided", async () => {
const canvasCoords = vi.fn(() => ({ x: 42, y: 99 }));
const { result } = renderHook(() => useTemplateDeploy({ canvasCoords }));
await act(async () => {
await result.current.deploy(makeTemplate());
});
expect(canvasCoords).toHaveBeenCalledTimes(1);
expect(mockApiPost).toHaveBeenCalledWith(
"/workspaces",
expect.objectContaining({ canvas: { x: 42, y: 99 } }),
);
});
it("falls back to random coords inside [100,500] × [100,400] when canvasCoords omitted", async () => {
const { result } = renderHook(() => useTemplateDeploy());
await act(async () => {
await result.current.deploy(makeTemplate());
});
const body = (mockApiPost as Mock).mock.calls[0]?.[1] as {
canvas: { x: number; y: number };
};
expect(body.canvas.x).toBeGreaterThanOrEqual(100);
expect(body.canvas.x).toBeLessThan(500);
expect(body.canvas.y).toBeGreaterThanOrEqual(100);
expect(body.canvas.y).toBeLessThan(400);
});
it("prefers template.runtime over resolveRuntime fallback", async () => {
const { result } = renderHook(() => useTemplateDeploy());
await act(async () => {
await result.current.deploy(
makeTemplate({ runtime: "hermes", id: "some-id" }),
);
});
expect(mockResolveRuntime).not.toHaveBeenCalled();
expect(mockCheckDeploySecrets).toHaveBeenCalledWith(
expect.objectContaining({ runtime: "hermes" }),
);
});
});
describe("useTemplateDeploy — preflight failure modes", () => {
it("preflight throw sets error and clears deploying (no stranded button)", async () => {
mockCheckDeploySecrets.mockRejectedValueOnce(new Error("network down"));
const { result } = renderHook(() => useTemplateDeploy());
await act(async () => {
await result.current.deploy(makeTemplate());
});
expect(result.current.error).toBe("network down");
expect(result.current.deploying).toBeNull();
expect(mockApiPost).not.toHaveBeenCalled();
});
it("preflight not-ok opens the modal without firing POST", async () => {
mockCheckDeploySecrets.mockResolvedValueOnce({
ok: false,
missingKeys: ["ANTHROPIC_API_KEY"],
providers: [],
runtime: "claude-code",
});
const onDeployed = vi.fn();
const { result, rerender } = renderHook(() =>
useTemplateDeploy({ onDeployed }),
);
await act(async () => {
await result.current.deploy(makeTemplate());
});
rerender();
render(<>{result.current.modal}</>);
expect(screen.getByTestId("missing-keys-modal")).toBeTruthy();
expect(mockApiPost).not.toHaveBeenCalled();
expect(onDeployed).not.toHaveBeenCalled();
expect(result.current.deploying).toBeNull();
});
});
describe("useTemplateDeploy — modal lifecycle", () => {
it("'keys added' retries POST without re-running preflight", async () => {
mockCheckDeploySecrets.mockResolvedValueOnce({
ok: false,
missingKeys: ["ANTHROPIC_API_KEY"],
providers: [],
runtime: "claude-code",
});
const onDeployed = vi.fn();
const { result, rerender } = renderHook(() =>
useTemplateDeploy({ onDeployed }),
);
await act(async () => {
await result.current.deploy(makeTemplate());
});
expect(mockCheckDeploySecrets).toHaveBeenCalledTimes(1);
rerender();
render(<>{result.current.modal}</>);
// Click "keys added" — the hook should retry via executeDeploy
// (which does NOT call preflight again).
await act(async () => {
fireEvent.click(screen.getByTestId("modal-keys-added"));
// Let the fire-and-forget executeDeploy promise resolve.
await Promise.resolve();
await Promise.resolve();
});
expect(mockCheckDeploySecrets).toHaveBeenCalledTimes(1); // still 1, not 2
expect(mockApiPost).toHaveBeenCalledTimes(1);
expect(onDeployed).toHaveBeenCalledWith("ws-new");
});
it("'cancel' closes the modal without firing POST", async () => {
mockCheckDeploySecrets.mockResolvedValueOnce({
ok: false,
missingKeys: ["ANTHROPIC_API_KEY"],
providers: [],
runtime: "claude-code",
});
const { result, rerender } = renderHook(() => useTemplateDeploy());
await act(async () => {
await result.current.deploy(makeTemplate());
});
rerender();
const { rerender: renderRerender } = render(<>{result.current.modal}</>);
expect(screen.getByTestId("missing-keys-modal")).toBeTruthy();
await act(async () => {
fireEvent.click(screen.getByTestId("modal-cancel"));
});
rerender();
renderRerender(<>{result.current.modal}</>);
expect(screen.queryByTestId("missing-keys-modal")).toBeNull();
expect(mockApiPost).not.toHaveBeenCalled();
});
});
describe("useTemplateDeploy — POST failure", () => {
it("POST rejection sets error and clears deploying", async () => {
mockApiPost.mockRejectedValueOnce(new Error("server 500"));
const onDeployed = vi.fn();
const { result } = renderHook(() => useTemplateDeploy({ onDeployed }));
await act(async () => {
await result.current.deploy(makeTemplate());
});
expect(result.current.error).toBe("server 500");
expect(result.current.deploying).toBeNull();
expect(onDeployed).not.toHaveBeenCalled();
});
it("non-Error rejection still surfaces a message (defensive)", async () => {
mockApiPost.mockRejectedValueOnce("plain string");
const { result } = renderHook(() => useTemplateDeploy());
await act(async () => {
await result.current.deploy(makeTemplate());
});
expect(result.current.error).toBe("Deploy failed");
expect(result.current.deploying).toBeNull();
});
});

View File

@ -0,0 +1,170 @@
"use client";
import { useCallback, useState, type ReactNode } from "react";
import { api } from "@/lib/api";
import {
checkDeploySecrets,
resolveRuntime,
type PreflightResult,
type Template,
} from "@/lib/deploy-preflight";
import { MissingKeysModal } from "@/components/MissingKeysModal";
/**
 * useTemplateDeploy: shared preflight + POST + modal wiring for
* every surface that deploys a workspace from a template.
*
* Owns: `checkDeploySecrets` call, `MissingKeysModal` render, the
* `POST /workspaces` that follows, and per-template `deploying`
* state. Returns `modal` as a `ReactNode` ready to place inline.
*
* Why a hook rather than two copies: the runtime-fallback table
* (`resolveRuntime`) and the preflight wiring were previously
* copy-pasted between TemplatePalette and EmptyState. When the
* copies drifted (palette had the full id-to-runtime map,
* empty-state had only the `-default` strip), the two surfaces
* could silently disagree on future templates that need a
* non-identity mapping. Single owner closes the drift surface.
*/
export interface UseTemplateDeployOptions {
/** Compute canvas coords for the new workspace. Called once per
* successful deploy. Defaults to random coords in the [100, 500] ×
* [100, 400] band, matching the sidebar palette's historical
* placement. Override for surfaces that want deterministic
* placement (e.g. EmptyState's first-deploy "center-ish" target). */
canvasCoords?: () => { x: number; y: number };
/** Optional post-deploy side effect passed the id of the new
* workspace. EmptyState uses this to auto-select the node and
* flip the side panel to Chat so a fresh tenant sees something
* useful. */
onDeployed?: (workspaceId: string) => void;
}
/** Paired template + preflight result carried through the "user
 * clicked deploy → modal opens → keys saved → retry" loop. Named
* so the `useState` generic and any future signature change have
* a single place to track. */
interface MissingKeysInfo {
template: Template;
preflight: PreflightResult;
}
export interface UseTemplateDeployResult {
/** Template id currently being deployed (incl. the preflight
* network call), or null when idle. Callers pass this to disable
* the relevant button and show a spinner. */
deploying: string | null;
/** Last deploy error message, or null. Cleared on next `deploy`
* call. */
error: string | null;
/** Kick off a deploy. Opens the missing-keys modal if preflight
* returns not-ok; otherwise fires POST /workspaces directly. */
deploy: (template: Template) => Promise<void>;
/** The missing-keys modal, ready to place inline. Always non-null
* (the underlying component self-gates on `open`), so the caller
* can drop `{modal}` anywhere without conditionals. */
modal: ReactNode;
}
export function useTemplateDeploy(
options: UseTemplateDeployOptions = {},
): UseTemplateDeployResult {
const [deploying, setDeploying] = useState<string | null>(null);
const [error, setError] = useState<string | null>(null);
const [missingKeysInfo, setMissingKeysInfo] = useState<MissingKeysInfo | null>(null);
const { canvasCoords, onDeployed } = options;
/** Actually execute the POST /workspaces call. Split from `deploy`
* so the "modal → keys added → retry" path can reuse it without
* re-running preflight (the user just proved the keys are now set). */
const executeDeploy = useCallback(
async (template: Template) => {
setDeploying(template.id);
setError(null);
try {
const coords = canvasCoords
? canvasCoords()
: {
x: Math.random() * 400 + 100,
y: Math.random() * 300 + 100,
};
const ws = await api.post<{ id: string }>("/workspaces", {
name: template.name,
template: template.id,
tier: template.tier,
canvas: coords,
});
onDeployed?.(ws.id);
} catch (e) {
setError(e instanceof Error ? e.message : "Deploy failed");
} finally {
setDeploying(null);
}
},
[canvasCoords, onDeployed],
);
const deploy = useCallback(
async (template: Template) => {
setDeploying(template.id);
setError(null);
let preflight: PreflightResult;
try {
const runtime = template.runtime ?? resolveRuntime(template.id);
preflight = await checkDeploySecrets({
runtime,
models: template.models,
required_env: template.required_env,
});
} catch (e) {
// Preflight network failure used to strand `deploying` — the
// button stayed disabled forever because the throw bypassed
// the setDeploying(null) in the non-ok branch below. Any
// future refactor that drops this try block will regress the
// same way; keep it narrow around just the preflight call
// so a successful preflight still lets executeDeploy own
// its own error path.
setError(e instanceof Error ? e.message : "Preflight check failed");
setDeploying(null);
return;
}
if (!preflight.ok) {
setMissingKeysInfo({ template, preflight });
setDeploying(null);
return;
}
await executeDeploy(template);
},
[executeDeploy],
);
  // No useMemo here: the node is rebuilt on every render anyway
  // (it's placed inline in JSX), and memo deps would invalidate on
  // every state change, making the memoisation a wash. A plain
  // ReactNode is simpler and equally performant.
const modal: ReactNode = (
<MissingKeysModal
open={!!missingKeysInfo}
missingKeys={missingKeysInfo?.preflight.missingKeys ?? []}
providers={missingKeysInfo?.preflight.providers ?? []}
runtime={missingKeysInfo?.preflight.runtime ?? ""}
onKeysAdded={() => {
if (missingKeysInfo) {
const template = missingKeysInfo.template;
setMissingKeysInfo(null);
// Intentional fire-and-forget — executeDeploy manages
// its own error state via setError.
void executeDeploy(template);
}
}}
onCancel={() => setMissingKeysInfo(null)}
/>
);
return { deploying, error, deploy, modal };
}
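// Hedged consumer sketch (markup is illustrative; deploy/deploying/
// error/modal is the real surface above): what a palette-style caller
// looks like once the hook owns preflight + POST + modal.
function PaletteSketch({ templates }: { templates: Template[] }) {
  const { deploy, deploying, error, modal } = useTemplateDeploy({
    onDeployed: (id) => console.log("deployed", id),
  });
  return (
    <div>
      {templates.map((t) => (
        <button
          key={t.id}
          disabled={deploying === t.id}
          onClick={() => void deploy(t)}
        >
          {deploying === t.id ? "Deploying…" : t.name}
        </button>
      ))}
      {error ? <p role="alert">{error}</p> : null}
      {modal}
    </div>
  );
}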

View File

@ -7,7 +7,7 @@ import { describe, it, expect, beforeEach, afterEach, vi } from "vitest";
const mockFetch = vi.fn();
globalThis.fetch = mockFetch;
import { api } from "../api";
import { api, PlatformUnavailableError } from "../api";
// ---------------------------------------------------------------------------
// Helpers
@ -380,3 +380,99 @@ describe("api request timeout signal", () => {
expect(sigA).not.toBe(sigB);
});
});
// ---------------------------------------------------------------------------
// PlatformUnavailableError classification
// ---------------------------------------------------------------------------
//
// When the platform's wsauth middleware can't reach Postgres/Redis to
// validate a token, it returns 503 + {error, code:"platform_unavailable"}.
// api.ts must surface that as a typed error so the page-level renderer
// can show a dedicated diagnostic instead of a generic 5xx toast.
describe("PlatformUnavailableError classification", () => {
beforeEach(() => {
mockFetch.mockReset();
});
function mock503Platform(detail = "platform datastore unavailable — retry shortly") {
const body = JSON.stringify({ error: detail, code: "platform_unavailable" });
mockFetch.mockResolvedValueOnce({
ok: false,
status: 503,
json: () => Promise.reject(new Error("not used")),
text: () => Promise.resolve(body),
} as unknown as Response);
}
it("throws PlatformUnavailableError on 503 + code=platform_unavailable", async () => {
mock503Platform();
let thrown: unknown;
try {
await api.get("/workspaces");
} catch (e) {
thrown = e;
}
expect(thrown).toBeInstanceOf(PlatformUnavailableError);
expect((thrown as PlatformUnavailableError).code).toBe("platform_unavailable");
});
it("preserves the server-provided error string as the Error message", async () => {
mock503Platform("Postgres unreachable");
try {
await api.get("/workspaces");
} catch (e) {
expect(e).toBeInstanceOf(PlatformUnavailableError);
expect((e as Error).message).toBe("Postgres unreachable");
return;
}
throw new Error("expected to throw");
});
it("does NOT classify a generic 503 (no platform_unavailable code) as PlatformUnavailableError", async () => {
// Generic upstream-busy 503 — should keep the legacy generic-Error
// path so existing busy-retry UX isn't disrupted.
mockFetch.mockResolvedValueOnce({
ok: false,
status: 503,
json: () => Promise.reject(new Error("not used")),
text: () => Promise.resolve(JSON.stringify({ error: "upstream busy" })),
} as unknown as Response);
try {
await api.get("/workspaces/x/a2a");
} catch (e) {
expect(e).not.toBeInstanceOf(PlatformUnavailableError);
expect((e as Error).message).toContain("503");
return;
}
throw new Error("expected to throw");
});
it("does NOT classify on 500 (server kept legacy 500 for true internal errors)", async () => {
mockFailure(500, "boom");
try {
await api.get("/workspaces");
} catch (e) {
expect(e).not.toBeInstanceOf(PlatformUnavailableError);
return;
}
throw new Error("expected to throw");
});
it("falls back to generic Error when 503 body isn't JSON", async () => {
mockFetch.mockResolvedValueOnce({
ok: false,
status: 503,
json: () => Promise.reject(new Error("not used")),
text: () => Promise.resolve("Service Unavailable"),
} as unknown as Response);
try {
await api.get("/workspaces");
} catch (e) {
expect(e).not.toBeInstanceOf(PlatformUnavailableError);
expect((e as Error).message).toContain("503");
return;
}
throw new Error("expected to throw");
});
});

View File

@ -107,11 +107,39 @@ async function request<T>(
}
if (!res.ok) {
const text = await res.text();
// Recognise the platform's structured "datastore unreachable"
// shape (returned by wsauth_middleware.abortAuthLookupError when
// Postgres/Redis is down). Surface as a typed error so callers
// can render a dedicated diagnostic instead of a generic toast.
if (res.status === 503 && text) {
try {
const parsed = JSON.parse(text) as { code?: string; error?: string };
if (parsed.code === "platform_unavailable") {
throw new PlatformUnavailableError(parsed.error || "platform datastore unavailable");
}
} catch (err) {
// Re-throw the typed error if that's what we just constructed.
// JSON.parse failures fall through to the generic Error below.
if (err instanceof PlatformUnavailableError) throw err;
}
}
throw new Error(`API ${method} ${path}: ${res.status} ${text}`);
}
return res.json();
}
/** Thrown when the platform reports its datastore (Postgres/Redis) is
* unreachable. Surface with a dedicated diagnostic UI rather than a
 * generic API-error toast; the user's next action is to check local
* services, not to retry the API call. */
export class PlatformUnavailableError extends Error {
readonly code = "platform_unavailable" as const;
constructor(message: string) {
super(message);
this.name = "PlatformUnavailableError";
}
}
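// Hedged caller sketch: branch on the typed error at the page level.
// showDatastoreDiagnostic / showToast stand in for whatever UI the
// page actually owns.
async function loadWorkspacesSketch(): Promise<unknown> {
  try {
    return await api.get("/workspaces");
  } catch (e) {
    if (e instanceof PlatformUnavailableError) {
      // showDatastoreDiagnostic(e.message); // "check local services" UI
    } else {
      // showToast(e instanceof Error ? e.message : String(e)); // generic path
    }
    return [];
  }
}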
export const api = {
get: <T>(path: string, options?: RequestOptions) => request<T>("GET", path, undefined, 0, options),
post: <T>(path: string, body?: unknown, options?: RequestOptions) => request<T>("POST", path, body, 0, options),

View File

@ -33,6 +33,46 @@ export interface TemplateLike {
required_env?: string[];
}
/** Full /templates response shape shared by TemplatePalette (sidebar)
* and EmptyState (welcome grid). Was previously re-declared in each
 * with subtly different fields; EmptyState's narrower shape silently
* dropped `runtime`, `models`, and `required_env`, so the preflight
* couldn't see provider alternatives the template declared. Keep this
* the single source of truth. */
export interface Template extends TemplateLike {
id: string;
name: string;
description: string;
tier: number;
model: string;
skills: string[];
skill_count: number;
}
/** Map from a template id to the runtime name the per-workspace
* preflight expects. Used only when the server's `/templates`
* response predates the `runtime` field on the summary (legacy
 * installs); modern responses carry it verbatim. Strip `-default`
* for the claude-code template and identity-map everything else
* that matches our current runtime registry.
*
* Lives in the preflight module (not TemplatePalette) so EmptyState
* uses the SAME fallback table. A previous duplication in both call
* sites left EmptyState with only the `-default` suffix strip, which
* would silently disagree with TemplatePalette on templates whose
* id needs a non-identity mapping. */
export function resolveRuntime(templateId: string): string {
const runtimeMap: Record<string, string> = {
langgraph: "langgraph",
"claude-code-default": "claude-code",
openclaw: "openclaw",
deepagents: "deepagents",
crewai: "crewai",
autogen: "autogen",
};
return runtimeMap[templateId] ?? templateId.replace(/-default$/, "");
}
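// Worked examples of the fallback table:
//   resolveRuntime("claude-code-default") → "claude-code"      (map hit)
//   resolveRuntime("langgraph")           → "langgraph"        (identity)
//   resolveRuntime("hermes-default")      → "hermes"           (suffix strip)
//   resolveRuntime("some-new-runtime")    → "some-new-runtime" (pass-through)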
export interface SecretEntry {
key: string;
has_value: boolean;

View File

@ -60,21 +60,23 @@ export const DEFAULT_RUNTIME_PROFILE: Required<
/**
 * Named per-runtime overrides. Keep this map small and explicit;
* each entry is a deliberate statement that this runtime's cold-boot
* behavior differs materially from the default.
* behavior differs materially from the default AND that the runtime's
* template manifest hasn't yet declared a server-side
* `provision_timeout_seconds` (the preferred path post-#2054).
*
* Each override must also ship with a comment explaining WHY the default
* is wrong for this runtime. Unexplained numbers rot.
*
 * Empty today: `hermes` previously lived here at 720_000ms, but
* Molecule-AI/molecule-ai-workspace-template-hermes now declares the
* value in its config.yaml manifest, so the value flows through the
* server (workspace API WorkspaceData.provision_timeout_ms resolver
* overrides) instead of being canvas-hardcoded. New runtimes that need
* a non-default cold-boot threshold should follow the same pattern:
* declare `runtime_config.provision_timeout_seconds` in their template
* manifest, NOT add an entry here.
*/
export const RUNTIME_PROFILES: Record<string, RuntimeProfile> = {
hermes: {
// 12 min. Installs ripgrep + ffmpeg + node22 + builds hermes-agent
// from source + Playwright + Chromium (~300MB download). Measured
// cold boots on staging EC2 routinely land at 8-13 min. Aligns
// with SaaS E2E's PROVISION_TIMEOUT_SECS=900 (15 min) so the UI
// warning lands shortly before the backend itself gives up.
provisionTimeoutMs: 720_000,
},
};
export const RUNTIME_PROFILES: Record<string, RuntimeProfile> = {};
/**
* Data fields the canvas can consult for per-workspace overrides. These

View File

@ -143,7 +143,10 @@ describe('inferGroup', () => {
describe('maskSecretValue', () => {
it('masks ghp_ prefixed values showing prefix and last 4', () => {
const value = 'ghp_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx';
// Built via concatenation, not as a literal continuous string —
// a literal `ghp_` + 36+ alphanumerics matches the secret-scan
// workflow's own regex and false-positives merge_group / push runs.
const value = 'ghp_' + 'x'.repeat(40);
const masked = maskSecretValue(value);
expect(masked.startsWith('ghp_')).toBe(true);
expect(masked.endsWith(value.slice(-4))).toBe(true);
@ -151,7 +154,7 @@ describe('maskSecretValue', () => {
});
it('masks github_pat_ prefixed values', () => {
const value = 'github_pat_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx';
const value = 'github_pat_' + 'x'.repeat(82);
const masked = maskSecretValue(value);
expect(masked.startsWith('github_pat_')).toBe(true);
expect(masked.endsWith(value.slice(-4))).toBe(true);

View File

@ -5,27 +5,34 @@ import { describe, it, expect, beforeEach, vi } from "vitest";
global.fetch = vi.fn();
import { useCanvasStore } from "../canvas";
import type { WorkspaceData } from "../socket";
import type { WorkspaceNodeData } from "../canvas";
function makeWS(overrides: Partial<WorkspaceData> & { id: string }): WorkspaceData {
function makeWS(
overrides: Partial<WorkspaceNodeData> & { id: string },
): WorkspaceNodeData {
// makeWS builds a minimal WorkspaceNodeData for tests that set state
// directly on the store (bypassing hydrate). The `id` override is
// ignored — node IDs live on the outer Node<> wrapper, not inside
// `data`. It's accepted here only so existing `makeWS({ id: "ws-foo" })`
// call sites keep compiling unchanged.
void overrides.id;
return {
name: "WS",
role: "agent",
tier: 1,
status: "online",
agent_card: null,
agentCard: null,
url: "http://localhost:9000",
parent_id: null,
active_tasks: 0,
last_error_rate: 0,
last_sample_error: "",
uptime_seconds: 60,
current_task: "",
x: 0,
y: 0,
parentId: null,
activeTasks: 0,
lastErrorRate: 0,
lastSampleError: "",
currentTask: "",
collapsed: false,
runtime: "",
budget_limit: null,
needsRestart: false,
budgetLimit: null,
...overrides,
};
}
@ -148,13 +155,13 @@ describe("batchRestart — partial failure", () => {
id: "ws-ok",
type: "workspace",
position: { x: 0, y: 0 },
data: { ...makeWS({ id: "ws-ok" }), needsRestart: true } as WorkspaceData & { needsRestart: boolean },
data: { ...makeWS({ id: "ws-ok" }), needsRestart: true } as WorkspaceNodeData,
},
{
id: "ws-fail",
type: "workspace",
position: { x: 0, y: 0 },
data: { ...makeWS({ id: "ws-fail" }), needsRestart: true } as WorkspaceData & { needsRestart: boolean },
data: { ...makeWS({ id: "ws-fail" }), needsRestart: true } as WorkspaceNodeData,
},
],
selectedNodeIds: new Set(["ws-ok", "ws-fail"]),
@ -166,7 +173,7 @@ describe("batchRestart — partial failure", () => {
});
const byId = Object.fromEntries(
useCanvasStore.getState().nodes.map((n) => [n.id, n.data as WorkspaceData & { needsRestart?: boolean }])
useCanvasStore.getState().nodes.map((n) => [n.id, n.data as WorkspaceNodeData])
);
expect(byId["ws-ok"].needsRestart).toBe(false);
expect(byId["ws-fail"].needsRestart).toBe(true);
@ -179,7 +186,7 @@ describe("batchRestart — partial failure", () => {
id: "ws-fail",
type: "workspace",
position: { x: 0, y: 0 },
data: { ...makeWS({ id: "ws-fail" }), needsRestart: true } as WorkspaceData & { needsRestart: boolean },
data: { ...makeWS({ id: "ws-fail" }), needsRestart: true } as WorkspaceNodeData,
},
],
selectedNodeIds: new Set(["ws-fail"]),

View File

@ -67,7 +67,19 @@ describe("canvas-events molecule:pan-to-node dispatch", () => {
vi.restoreAllMocks();
});
it("dispatches molecule:pan-to-node with the new nodeId for a NEW provision", () => {
it("dispatches both molecule:pan-to-node AND molecule:fit-deploying-org for a NEW root-level provision", () => {
// Two custom events are dispatched on NEW root-level provision:
// 1. molecule:fit-deploying-org — tells useCanvasViewport to
// frame the whole deploying subtree. Fires for root nodes
// too (commit 5adc8a74) so the canvas centers the just-
// landed root immediately instead of waiting for the
// first child to arrive.
// 2. molecule:pan-to-node — pans/zooms to the single node;
// only for standalone creates (no parent), so org-import
// children don't chase the spawn animation.
// A previous version of this test expected only #2 and failed
// when #1 was added for roots. If only one of these ever fires
// again, this test should flag the regression.
const { get, set } = makeStore([]);
const dispatched: Event[] = [];
const spy = vi.spyOn(window, "dispatchEvent").mockImplementation((e) => {
@ -81,9 +93,15 @@ describe("canvas-events molecule:pan-to-node dispatch", () => {
set
);
expect(dispatched).toHaveLength(1);
expect(dispatched[0].type).toBe("molecule:pan-to-node");
expect((dispatched[0] as CustomEvent).detail?.nodeId).toBe("ws-new");
expect(dispatched).toHaveLength(2);
const panEvent = dispatched.find((e) => e.type === "molecule:pan-to-node");
const fitEvent = dispatched.find((e) => e.type === "molecule:fit-deploying-org");
expect(panEvent, "molecule:pan-to-node should fire for standalone create").toBeDefined();
expect(fitEvent, "molecule:fit-deploying-org should fire so the viewport frames the root").toBeDefined();
expect((panEvent as CustomEvent).detail?.nodeId).toBe("ws-new");
expect((fitEvent as CustomEvent).detail?.rootId).toBe("ws-new");
spy.mockRestore();
});
it("does NOT dispatch molecule:pan-to-node when restarting an existing node", () => {

View File

@ -149,6 +149,75 @@ describe("buildNodesAndEdges parent + child workspaces", () => {
});
});
describe("buildNodesAndEdges auto-rescue respects live grown parent size", () => {
// Regression: child the user dragged into a user-grown area was
// false-rescued by every periodic rehydrate (socket health check
// every 30s) because the rescue heuristic used the initial
// grid-derived parent bbox, not the currently-grown size. Result:
// child snapped to a stale grid slot, then settled back ~1 frame
// later when growParentsToFitChildren re-ran. Observed 2026-04-25
// as "child jumps to weird location, then 30s later it's fine".
it("does NOT rescue a child placed inside the user-grown parent area", () => {
// Parent's initial grid-derived size is small; user has since grown it
// to 800×600. Child sits at relative (700, 400) — inside the grown
// bbox but outside the initial bbox. Without currentParentSizes,
// the rescue would re-place the child into a default grid slot.
const parentAbs = { x: 100, y: 100 };
const childAbs = { x: parentAbs.x + 700, y: parentAbs.y + 400 };
const workspaces = [
makeWS({ id: "parent", x: parentAbs.x, y: parentAbs.y }),
makeWS({ id: "child", parent_id: "parent", x: childAbs.x, y: childAbs.y }),
];
const grownDims = new Map([
["parent", { width: 800, height: 600 }],
]);
const { nodes } = buildNodesAndEdges(workspaces, new Map(), grownDims);
const child = nodes.find((n) => n.id === "child")!;
// Child's relative position should match what we passed in.
expect(child.position).toEqual({ x: 700, y: 400 });
});
it("DOES rescue a child whose stored position is outside even the grown parent", () => {
// Same parent but child is way outside (relative 5000, 5000).
// The rescue must still fire — the heuristic isn't "always trust
// the user", it's "trust the user up to the current parent bbox".
const parentAbs = { x: 100, y: 100 };
const childAbs = { x: parentAbs.x + 5000, y: parentAbs.y + 5000 };
const workspaces = [
makeWS({ id: "parent", x: parentAbs.x, y: parentAbs.y }),
makeWS({ id: "child", parent_id: "parent", x: childAbs.x, y: childAbs.y }),
];
const grownDims = new Map([
["parent", { width: 800, height: 600 }],
]);
const { nodes } = buildNodesAndEdges(workspaces, new Map(), grownDims);
const child = nodes.find((n) => n.id === "child")!;
// Rescued: NOT the original (5000, 5000); some grid slot instead.
expect(child.position.x).toBeLessThan(5000);
expect(child.position.y).toBeLessThan(5000);
});
it("falls back to initial-min bbox when no live size is provided (preserves legacy behavior)", () => {
// Empty currentParentSizes — first hydrate or test without store
// priming. Child outside the initial bbox should still be rescued.
const parentAbs = { x: 100, y: 100 };
const childAbs = { x: parentAbs.x + 700, y: parentAbs.y + 400 };
const workspaces = [
makeWS({ id: "parent", x: parentAbs.x, y: parentAbs.y }),
makeWS({ id: "child", parent_id: "parent", x: childAbs.x, y: childAbs.y }),
];
const { nodes } = buildNodesAndEdges(workspaces);
const child = nodes.find((n) => n.id === "child")!;
// Without a live size hint, the initial bbox applies — rescue
// fires, child gets a fresh slot, NOT the user-supplied (700,400).
expect(child.position).not.toEqual({ x: 700, y: 400 });
});
});
describe("buildNodesAndEdges deeply nested hierarchy", () => {
it("handles three levels of nesting", () => {
const workspaces = [

View File

@ -6,6 +6,7 @@ global.fetch = vi.fn(() =>
);
import { useCanvasStore, summarizeWorkspaceCapabilities } from "../canvas";
import { __resetTombstonesForTest } from "../deleteTombstones";
import type { WorkspaceData, WSMessage } from "../socket";
// Helper to build a WorkspaceData object with sensible defaults
@ -52,6 +53,10 @@ beforeEach(() => {
searchOpen: false,
viewport: { x: 0, y: 0, zoom: 1 },
});
// Tombstones leak across tests because the module-level map is
// process-lifetime by design. Reset between tests so a delete in one
// test doesn't shadow a hydrate in the next.
__resetTombstonesForTest();
vi.clearAllMocks();
});
@ -484,6 +489,92 @@ describe("removeNode", () => {
});
});
// ---------- removeSubtree ----------
describe("removeSubtree", () => {
beforeEach(() => {
useCanvasStore.getState().hydrate([
makeWS({ id: "root" }),
makeWS({ id: "mid", parent_id: "root" }),
makeWS({ id: "leaf", parent_id: "mid" }),
makeWS({ id: "sibling", parent_id: "root" }),
makeWS({ id: "unrelated" }), // separate root
]);
});
it("removes the root and every descendant in one shot", () => {
useCanvasStore.getState().removeSubtree("root");
const ids = useCanvasStore
.getState()
.nodes.map((n) => n.id)
.sort();
expect(ids).toEqual(["unrelated"]);
});
it("removes a mid-level node and its descendants but leaves siblings + ancestors", () => {
useCanvasStore.getState().removeSubtree("mid");
const ids = useCanvasStore
.getState()
.nodes.map((n) => n.id)
.sort();
expect(ids).toEqual(["root", "sibling", "unrelated"]);
});
it("removing a leaf is a no-op cascade (just drops the leaf)", () => {
useCanvasStore.getState().removeSubtree("leaf");
const ids = useCanvasStore
.getState()
.nodes.map((n) => n.id)
.sort();
expect(ids).toEqual(["mid", "root", "sibling", "unrelated"]);
});
it("clears selection when the selected node is anywhere in the removed subtree", () => {
useCanvasStore.getState().selectNode("leaf");
useCanvasStore.getState().removeSubtree("root");
expect(useCanvasStore.getState().selectedNodeId).toBeNull();
});
it("preserves selection when the selected node is outside the removed subtree", () => {
useCanvasStore.getState().selectNode("unrelated");
useCanvasStore.getState().removeSubtree("root");
expect(useCanvasStore.getState().selectedNodeId).toBe("unrelated");
});
it("drops edges incident to any removed node", () => {
// The hydrate-built edges connect parent → child. After removing
// `root`, no edge involving root/mid/leaf/sibling should remain.
useCanvasStore.getState().removeSubtree("root");
const remaining = useCanvasStore.getState().edges;
for (const e of remaining) {
expect(["root", "mid", "leaf", "sibling"]).not.toContain(e.source);
expect(["root", "mid", "leaf", "sibling"]).not.toContain(e.target);
}
});
// #2069: lock the tombstone path end-to-end at the store level.
it("hydrate cannot resurrect ids that removeSubtree just dropped (#2069)", () => {
useCanvasStore.getState().removeSubtree("root");
expect(useCanvasStore.getState().nodes.map((n) => n.id).sort())
.toEqual(["unrelated"]);
// Simulate the in-flight GET response landing AFTER the delete:
// the snapshot still contains every original workspace, including
// the just-removed subtree.
useCanvasStore.getState().hydrate([
makeWS({ id: "root" }),
makeWS({ id: "mid", parent_id: "root" }),
makeWS({ id: "leaf", parent_id: "mid" }),
makeWS({ id: "sibling", parent_id: "root" }),
makeWS({ id: "unrelated" }),
]);
// root/mid/leaf/sibling MUST stay deleted; only `unrelated` survives.
const ids = useCanvasStore.getState().nodes.map((n) => n.id).sort();
expect(ids).toEqual(["unrelated"]);
});
});
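// Hedged sketch of the hydrate-side gate the #2069 test pins down
// (hydrate internals are an assumption; markDeleted/wasRecentlyDeleted
// are the real tombstone API): filter the snapshot before building
// nodes so a stale in-flight GET can't resurrect a deleted subtree.
//
//   const live = workspaces.filter((ws) => !wasRecentlyDeleted(ws.id));
//   // ...build nodes/edges from `live` only...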
// ---------- isDescendant ----------
describe("isDescendant", () => {

View File

@ -0,0 +1,82 @@
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
import {
__resetTombstonesForTest,
__tombstoneCountForTest,
markDeleted,
wasRecentlyDeleted,
} from "../deleteTombstones";
// Tombstone TTL is hardcoded at 10s in the module — these tests freeze
// time so the GC + read-time expiry can be exercised deterministically
// without sleeping.
describe("deleteTombstones (#2069)", () => {
beforeEach(() => {
__resetTombstonesForTest();
vi.useFakeTimers();
vi.setSystemTime(new Date("2026-04-26T20:00:00Z"));
});
afterEach(() => {
vi.useRealTimers();
__resetTombstonesForTest();
});
it("flags ids as recently deleted immediately after markDeleted", () => {
markDeleted(["root-1", "child-a"]);
expect(wasRecentlyDeleted("root-1")).toBe(true);
expect(wasRecentlyDeleted("child-a")).toBe(true);
});
it("returns false for ids that were never marked", () => {
markDeleted(["root-1"]);
expect(wasRecentlyDeleted("never-deleted")).toBe(false);
});
it("expires tombstones after the 10s TTL", () => {
markDeleted(["root-1"]);
expect(wasRecentlyDeleted("root-1")).toBe(true);
vi.advanceTimersByTime(9_999);
expect(wasRecentlyDeleted("root-1")).toBe(true);
vi.advanceTimersByTime(2);
expect(wasRecentlyDeleted("root-1")).toBe(false);
});
it("evicts expired entries on read so the map stays bounded", () => {
markDeleted(["root-1"]);
expect(__tombstoneCountForTest()).toBe(1);
vi.advanceTimersByTime(11_000);
// The read itself triggers eviction — no separate GC pass needed.
wasRecentlyDeleted("root-1");
expect(__tombstoneCountForTest()).toBe(0);
});
it("evicts expired entries on write so the map stays bounded across long sessions", () => {
markDeleted(["root-1"]);
expect(__tombstoneCountForTest()).toBe(1);
vi.advanceTimersByTime(11_000);
// markDeleted GCs before inserting, so the second write should
// evict root-1 (now stale) AND insert root-2 — net size 1, not 2.
markDeleted(["root-2"]);
expect(__tombstoneCountForTest()).toBe(1);
expect(wasRecentlyDeleted("root-1")).toBe(false);
expect(wasRecentlyDeleted("root-2")).toBe(true);
});
it("resets the deletedAt timestamp when the same id is marked again", () => {
markDeleted(["root-1"]);
vi.advanceTimersByTime(8_000);
// Same id re-deleted (rare, but legal) — TTL restarts from now.
markDeleted(["root-1"]);
vi.advanceTimersByTime(8_000);
// 16s after the FIRST mark; would have expired without the re-mark.
expect(wasRecentlyDeleted("root-1")).toBe(true);
});
it("accepts any iterable, not just arrays", () => {
const ids = new Set(["root-1", "root-2"]);
markDeleted(ids);
expect(wasRecentlyDeleted("root-1")).toBe(true);
expect(wasRecentlyDeleted("root-2")).toBe(true);
});
});
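// A minimal implementation consistent with these tests (the real
// module may differ in detail; the 10s TTL plus GC on both read and
// write are the load-bearing parts):
//
//   const TTL_MS = 10_000;
//   const tombstones = new Map<string, number>(); // id → deletedAt
//   function gc(now: number): void {
//     for (const [id, at] of tombstones) {
//       if (now - at > TTL_MS) tombstones.delete(id);
//     }
//   }
//   export function markDeleted(ids: Iterable<string>): void {
//     const now = Date.now();
//     gc(now);
//     for (const id of ids) tombstones.set(id, now);
//   }
//   export function wasRecentlyDeleted(id: string): boolean {
//     gc(Date.now());
//     return tombstones.has(id);
//   }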

View File

@ -1,7 +1,7 @@
import { describe, it, expect, beforeEach, afterEach, vi } from "vitest";
// ---------------------------------------------------------------------------
// Mock the canvas store before importing socket.ts
// Mock the canvas store and api before importing socket.ts
// ---------------------------------------------------------------------------
vi.mock("../canvas", () => ({
useCanvasStore: {
@ -13,6 +13,7 @@ vi.mock("../canvas", () => ({
},
}));
// ---------------------------------------------------------------------------
// Mock WebSocket
// ---------------------------------------------------------------------------
@ -76,7 +77,6 @@ function getLastWS(): MockWebSocket {
beforeEach(() => {
MockWebSocket.instances = [];
vi.useFakeTimers();
// Reset mocked store state
vi.mocked(useCanvasStore.getState).mockReturnValue({
applyEvent: vi.fn(),
@ -263,13 +263,59 @@ describe("WebSocket onclose auto-reconnect", () => {
const ws = getLastWS();
ws.triggerClose();
// Fast-forward timers to trigger the reconnect
vi.runAllTimers();
// First reconnect attempt is scheduled at 1s (Math.min(1000 * 2^0,
// 30000)). Advance just past that — vi.runAllTimers() would
// additionally re-fire the fallback poll setInterval forever and
// hit the 10000-timer abort.
vi.advanceTimersByTime(1100);
expect(MockWebSocket.instances.length).toBeGreaterThan(1);
});
});
describe("HTTP fallback poll while WS unhealthy", () => {
it("starts a setInterval after onclose so /workspaces stays fresh", () => {
const setIntervalSpy = vi.spyOn(globalThis, "setInterval");
connectSocket();
const ws = getLastWS();
ws.triggerClose();
// The fallback poll runs at 10s; the reconnect uses setTimeout, so
// any setInterval registered between connect and close must be the
// fallback poll.
const fallbackCalls = setIntervalSpy.mock.calls.filter(
([, delay]) => delay === 10_000,
);
expect(fallbackCalls.length).toBeGreaterThan(0);
setIntervalSpy.mockRestore();
});
it("clears the fallback poll once the WS reconnects (onopen)", () => {
const clearIntervalSpy = vi.spyOn(globalThis, "clearInterval");
connectSocket();
const ws = getLastWS();
ws.triggerClose(); // starts fallback poll
clearIntervalSpy.mockClear();
// Advance past the first reconnect delay so a fresh ws exists,
// then trigger its open.
vi.advanceTimersByTime(1100);
const ws2 = getLastWS();
ws2.triggerOpen();
expect(clearIntervalSpy).toHaveBeenCalled();
clearIntervalSpy.mockRestore();
});
it("clears the fallback poll on disconnect", () => {
const clearIntervalSpy = vi.spyOn(globalThis, "clearInterval");
connectSocket();
const ws = getLastWS();
ws.triggerClose(); // starts fallback poll
clearIntervalSpy.mockClear();
disconnectSocket();
expect(clearIntervalSpy).toHaveBeenCalled();
clearIntervalSpy.mockRestore();
});
});
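// Hedged sketch of the wiring these three tests pin down (socket.ts
// internals are an assumption; the 10s cadence and the clear-on-open /
// clear-on-disconnect behaviour are what the assertions check):
//
//   let fallbackPoll: ReturnType<typeof setInterval> | null = null;
//   ws.onclose = () => {
//     if (!fallbackPoll) {
//       fallbackPoll = setInterval(() => void refetchWorkspaces(), 10_000);
//     }
//     scheduleReconnect(); // setTimeout, exponential backoff 1s → 30s cap
//   };
//   ws.onopen = () => {
//     if (fallbackPoll) {
//       clearInterval(fallbackPoll);
//       fallbackPoll = null;
//     }
//   };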
// ---------------------------------------------------------------------------
// onerror handler
// ---------------------------------------------------------------------------
@ -328,3 +374,45 @@ describe("health check", () => {
clearIntervalSpy.mockRestore();
});
});
// Rehydrate dedup logic itself is exercised by `RehydrateDedup` unit
// tests in this file (below). End-to-end coupling through the
// dynamic-imported `@/lib/api` was non-trivial under our existing
// fake-timer setup; isolating the gate in a pure helper keeps
// regression coverage without that mocking complexity.
import { RehydrateDedup } from "../socket";
describe("RehydrateDedup", () => {
it("first call passes the gate (no prior fetch)", () => {
const d = new RehydrateDedup(1500);
expect(d.shouldSkip(0)).toBe(false);
});
it("blocks while a fetch is in flight", () => {
const d = new RehydrateDedup(1500);
d.beginFetch();
expect(d.shouldSkip(100)).toBe(true);
});
it("blocks within the post-completion window", () => {
const d = new RehydrateDedup(1500);
d.beginFetch();
d.completeFetch(1_000);
// 1100 - 1000 = 100 < 1500 → skip
expect(d.shouldSkip(1_100)).toBe(true);
// 2600 - 1000 = 1600 > 1500 → allow
expect(d.shouldSkip(2_600)).toBe(false);
});
it("a completed fetch followed by another beginFetch blocks for the new in-flight", () => {
const d = new RehydrateDedup(1500);
d.beginFetch();
d.completeFetch(1_000);
// First wait out the dedup window
expect(d.shouldSkip(2_600)).toBe(false);
d.beginFetch();
// Now a second fetch is in flight; further calls block again
expect(d.shouldSkip(2_700)).toBe(true);
});
});
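// A minimal RehydrateDedup consistent with these tests (assumed shape;
// the real class is exported from ../socket):
//
//   export class RehydrateDedup {
//     private inFlight = false;
//     private lastCompleted = -Infinity;
//     constructor(private readonly windowMs: number) {}
//     beginFetch(): void { this.inFlight = true; }
//     completeFetch(now: number): void {
//       this.inFlight = false;
//       this.lastCompleted = now;
//     }
//     shouldSkip(now: number): boolean {
//       return this.inFlight || now - this.lastCompleted < this.windowMs;
//     }
//   }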

View File

@ -1,7 +1,7 @@
import type { Node, Edge } from "@xyflow/react";
import type { WSMessage } from "./socket";
import type { WorkspaceNodeData } from "./canvas";
import { extractResponseText } from "@/components/tabs/chat/message-parser";
import { extractResponseText, extractFilesFromTask } from "@/components/tabs/chat/message-parser";
// ---------------------------------------------------------------------------
// Monotonically increasing counter used to assign grid positions.
@ -21,13 +21,46 @@ import { extractResponseText } from "@/components/tabs/chat/message-parser";
//
// A monotonic counter is immune to deletions: it only ever increases.
// ---------------------------------------------------------------------------
import { appendClass, removeClass, scheduleNodeClassRemoval } from "./classNames";
let _provisioningSequence = 0;
/** Reset the sequence counter — exposed for test teardown only. */
export function resetProvisioningSequence(): void {
_provisioningSequence = 0;
_pendingOnline.clear();
}
/** WORKSPACE_ONLINE events that arrived BEFORE the matching
 * WORKSPACE_PROVISIONING are buffered here so the late-arriving
* provision event can immediately flip to the correct status
* instead of leaving the node stuck as "provisioning" forever.
* Cleared when applied, or on module reset (tests). */
const _pendingOnline = new Set<string>();
/** Debounced parent-grow. Each child arrival schedules this; the
* timer keeps resetting as more siblings land, so the actual
* width/height update runs ONCE after arrivals go quiet. Avoids
* the visible size-pulse that happened when growParentsToFitChildren
* ran per event. */
let _growTimer: ReturnType<typeof setTimeout> | null = null;
function scheduleParentGrow(): void {
if (typeof window === "undefined") return;
if (_growTimer) clearTimeout(_growTimer);
_growTimer = setTimeout(() => {
_growTimer = null;
import("./canvas").then(({ useCanvasStore }) => {
useCanvasStore.getState().growParentsToFitChildren?.();
});
}, 300);
}
// (absoluteNodePosition was used by an earlier "spawn from parent"
// revision that subtracted parent absolute coords from server-sent
// absolute child coords. The server now ships parent-relative coords
// directly, so the walk is no longer needed. Deleted rather than
// kept as dead code.)
/**
* Standalone event handler extracted from the canvas store.
* Applies a single WebSocket event to the current node/edge state.
@ -38,7 +71,7 @@ export function handleCanvasEvent(
nodes: Node<WorkspaceNodeData>[];
edges: Edge[];
selectedNodeId: string | null;
agentMessages: Record<string, Array<{ id: string; content: string; timestamp: string }>>;
agentMessages: Record<string, Array<{ id: string; content: string; timestamp: string; attachments?: Array<{ name: string; uri: string; mimeType?: string; size?: number }> }>>;
},
set: (partial: Record<string, unknown>) => void,
): void {
@ -47,14 +80,44 @@ export function handleCanvasEvent(
switch (msg.event) {
case "WORKSPACE_ONLINE": {
const existing = nodes.find((n) => n.id === msg.workspace_id);
if (existing) {
set({
nodes: nodes.map((n) =>
n.id === msg.workspace_id
? { ...n, data: { ...n.data, status: "online" } }
: n
),
});
if (!existing) {
// PROVISIONING event hasn't been applied yet (WS reorder or
// this tab joined mid-deploy). Buffer so the later PROVISIONING
// handler can flip status in one pass instead of leaving the
// node stuck in "provisioning" forever.
_pendingOnline.add(msg.workspace_id);
break;
}
// Flip incoming edge from blueprint → laser so the link is
// drawn solid the moment this child is live. The laser class
// plays the stroke-dashoffset keyframe once; after ~500ms the
// edge falls back to the default solid style (see
// org-deploy.css and the follow-up setTimeout below).
const updatedEdges = edges.map((e) =>
e.target === msg.workspace_id && e.className?.includes("mol-deploy-edge-blueprint")
? { ...e, className: "mol-deploy-edge-laser" }
: e,
);
set({
edges: updatedEdges,
nodes: nodes.map((n) =>
n.id === msg.workspace_id
? { ...n, data: { ...n.data, status: "online" } }
: n,
),
});
// Remove the laser class after its keyframe ends so the edge
// settles into the app's default solid styling. Fire-and-forget.
if (typeof window !== "undefined") {
const targetEdgeId = `${existing.data.parentId ?? ""}-${msg.workspace_id}`;
window.setTimeout(() => {
const s = get();
set({
edges: s.edges.map((e) =>
e.id === targetEdgeId ? { ...e, className: undefined } : e,
),
});
}, 600);
}
break;
}
@ -113,25 +176,73 @@ export function handleCanvasEvent(
),
});
} else {
// Spread new nodes in a grid so they don't stack at the viewport origin.
// Use the monotonic _provisioningSequence counter (not nodes.length) so
// deletions never cause two live nodes to share a grid slot.
const GRID_COLS = 4;
const COL_SPACING = 320;
const ROW_SPACING = 160;
const GRID_ORIGIN_X = 100;
const GRID_ORIGIN_Y = 100;
const idx = _provisioningSequence++;
const x = GRID_ORIGIN_X + (idx % GRID_COLS) * COL_SPACING;
const y = GRID_ORIGIN_Y + Math.floor(idx / GRID_COLS) * ROW_SPACING;
// Payload may carry parent_id + final x/y (org import broadcasts
// these so the canvas can animate the "spawn from parent" motion).
// Standalone workspace creates still omit them — fall back to the
// grid-slot behaviour that handled that case historically.
const parentIdRaw = (msg.payload.parent_id as string | undefined) ?? null;
const finalX = msg.payload.x as number | undefined;
const finalY = msg.payload.y as number | undefined;
let spawnX: number;
let spawnY: number;
let targetX: number;
let targetY: number;
let parentId: string | null = null;
// Place the node at its final slot immediately — no
// spring-from-parent motion. The earlier "materialize from
// parent then tween to target" was expensive (two set()
// calls + rAF) and produced wrong offsets because the
// server sent absolute coords computed against the template's
// own coord system while the client had placed the parent at
// a grid slot, so the target math always landed off-grid.
// Now: server coords are parent-relative (see org_import.go),
// we trust them verbatim.
const parentInStore = parentIdRaw
? nodes.find((n) => n.id === parentIdRaw)
: undefined;
if (parentIdRaw && parentInStore && finalX !== undefined && finalY !== undefined) {
targetX = finalX;
targetY = finalY;
parentId = parentIdRaw;
} else {
// Standalone create OR org-child whose parent hasn't arrived
// yet (rare WS reorder) — monotonic-grid placement. The
// follow-up hydrate pass reconciles parent_id + the correct
// nested position if parent lands later.
const GRID_COLS = 4;
const COL_SPACING = 320;
const ROW_SPACING = 160;
const GRID_ORIGIN_X = 100;
const GRID_ORIGIN_Y = 100;
const idx = _provisioningSequence++;
targetX = GRID_ORIGIN_X + (idx % GRID_COLS) * COL_SPACING;
targetY = GRID_ORIGIN_Y + Math.floor(idx / GRID_COLS) * ROW_SPACING;
}
spawnX = targetX;
spawnY = targetY;
// Parent→child relationship is already visible via React
// Flow's nested rendering (the child card sits INSIDE the
// parent container). An explicit edge on top of that was
// visual double-counting and made the canvas look busy;
// removed per demo feedback. A2A edges (showA2AEdges) still
// render when enabled — those represent runtime traffic,
// which nesting doesn't express.
set({
nodes: [
...nodes,
{
id: msg.workspace_id,
type: "workspaceNode",
position: { x, y },
position: { x: spawnX, y: spawnY },
// React Flow's parentId (distinct from data.parentId)
// triggers parent-relative positioning. Set it when the
// server told us this is an org-import child so the
// node renders nested inside the parent container.
...(parentId ? { parentId } : {}),
className: "mol-deploy-spawn",
data: {
name: (msg.payload.name as string) ?? "New Workspace",
status: "provisioning",
@ -143,7 +254,7 @@ export function handleCanvasEvent(
lastErrorRate: 0,
lastSampleError: "",
url: "",
parentId: null,
parentId, // data.parentId mirrors React Flow's parentId
currentTask: "",
runtime: (msg.payload.runtime as string) ?? "",
needsRestart: false,
@ -152,8 +263,76 @@ export function handleCanvasEvent(
],
});
// Pan the canvas to the new node
// Grow the parent to fit the just-landed child. DEBOUNCED
// across rapid sibling arrivals — firing width/height updates
// on every child made the parent card visibly pulse in size
// as each kid landed, which read as the parent "flashing
// around". One grow pass ~300ms after the last arrival
// coalesces the whole burst into a single layout change.
if (parentId && typeof window !== "undefined") {
scheduleParentGrow();
}
// Parent-border pulse removed per demo feedback — the soft
// box-shadow ring on each arrival compounded with the size
// grow to make the whole parent card look unstable. The
// dim-light signal on the provisioning child is sufficient
// acknowledgement that something is happening.
// Remove the one-shot spawn class after the keyframe ends so
// future re-renders don't replay it.
scheduleNodeClassRemoval(msg.workspace_id, "mol-deploy-spawn", 400, get, set);
// Auto-pan+zoom to the whole deploying org after each
// arrival so the user always sees the full picture — unless
// they've panned themselves (handled by the viewport hook,
// which aborts the fit when the user moved after the last
// auto-fit). Event name matches the existing handler in
// useCanvasViewport that knows how to compute subtree bounds.
//
// Fire for roots too (not just children) so the canvas
// centers on the just-landed root immediately instead of
// waiting for the first child to arrive ~2s later. The
// viewport hook walks UP to find the true root, so passing
// the node's own id when there's no parent is equivalent
// to passing the root.
if (typeof window !== "undefined") {
window.dispatchEvent(
new CustomEvent("molecule:fit-deploying-org", {
detail: { rootId: parentIdRaw ?? msg.workspace_id },
}),
);
}
// Race handling: if a WORKSPACE_ONLINE event beat the
// matching PROVISIONING to this tab, the online flag was
// buffered in _pendingOnline. Apply it now so the node
// doesn't stay stuck as "provisioning" forever.
//
// Only flip to "online" if the current status is still
// "provisioning" at drain time. Otherwise a WORKSPACE_DEGRADED
// / FAILED / PAUSED that arrived between the set() above and
// the scheduled drain would be silently clobbered — the
// buffered ONLINE is stale by then.
if (_pendingOnline.has(msg.workspace_id)) {
_pendingOnline.delete(msg.workspace_id);
if (typeof window !== "undefined") {
window.setTimeout(() => {
const s = get();
set({
nodes: s.nodes.map((n) =>
n.id === msg.workspace_id && n.data.status === "provisioning"
? { ...n, data: { ...n.data, status: "online" } }
: n,
),
});
}, 0);
}
}
// Pan the canvas to the new node (standalone create only —
// during an org import, zooming to every child chases the
    // spawn animation around the viewport, which is jarring).
if (!parentIdRaw && typeof window !== "undefined") {
window.dispatchEvent(
new CustomEvent("molecule:pan-to-node", {
detail: { nodeId: msg.workspace_id },
@ -252,12 +431,19 @@ export function handleCanvasEvent(
}
case "A2A_RESPONSE": {
// A2A proxy completed — extract response text AND any `kind: file`
// parts. Without the file extraction, agent-returned attachments
// delivered via this WebSocket path would disappear (the canvas
// would render a text-only message while the HTTP fallback
// rendered the same reply with download chips, depending on
// which delivery path raced to completion first).
const responseBody = msg.payload.response_body as Record<string, unknown> | undefined;
if (responseBody) {
const text = extractResponseText(responseBody);
const attachments = extractFilesFromTask(
(responseBody.result ?? responseBody) as Record<string, unknown>,
);
if (text || attachments.length > 0) {
const { agentMessages } = get();
const existing = agentMessages[msg.workspace_id] || [];
set({
@ -265,7 +451,12 @@ export function handleCanvasEvent(
...agentMessages,
[msg.workspace_id]: [
...existing,
{
id: crypto.randomUUID(),
content: text,
timestamp: new Date().toISOString(),
attachments: attachments.length > 0 ? attachments : undefined,
},
],
},
});

View File

@ -280,6 +280,15 @@ export function computeAutoLayout(
* Accepts an optional layoutOverrides map (from computeAutoLayout) to override
* positions for workspaces that were at 0,0.
*
* `currentParentSizes` carries the LIVE measured/grown dimensions of parent
* nodes from the existing client store. The auto-rescue heuristic below
* (line ~445) compares each child's stored relative position against its
* parent's bbox; without the live size, the bbox is whatever the
* grid-derived initial min-size formula produced. That falsely rescued
* children dragged into the user-grown area on every periodic rehydrate
 * (socket.ts:87 fires every 30s if no WS events are seen); observed
 * 2026-04-25 as "child jumps to a weird location, then settles 30s later".
*
* Parent/child rendering model: every workspace is a first-class React Flow
* node (full card). When a workspace has parent_id set, its RF `parentId` is
* set to the parent's id and its position is stored RELATIVE to the parent
@ -290,7 +299,8 @@ export function computeAutoLayout(
*/
export function buildNodesAndEdges(
workspaces: WorkspaceData[],
layoutOverrides: Map<string, { x: number; y: number }> = new Map(),
currentParentSizes: Map<string, { width: number; height: number }> = new Map(),
): {
nodes: Node<WorkspaceNodeData>[];
edges: Edge[];
@ -439,7 +449,23 @@ export function buildNodesAndEdges(
// child.left = 500 < parent.right = 800 → overlaps → kept
// legacy huge positive (position.x = 50000):
// child.left = 50000 >= parent.right → no overlap → rescued
const initialPsize = parentSize.get(ws.parent_id!)!;
// Use the larger of (initial min, currently grown) for the bbox
// test. Without this, a child the user dragged into the grown
// area appears "outside" the (smaller) initial bbox and the
// rescue below false-fires on every periodic rehydrate, jumping
// the child to a stale grid slot. Live grown dims arrive via
// currentParentSizes from hydrate(); on first load (empty
// store), the map is empty and we fall back to the initial min
// — preserving the original rescue semantics for genuinely
// detached legacy data.
const liveParentSize = currentParentSizes.get(ws.parent_id!);
const psize = liveParentSize
? {
width: Math.max(initialPsize.width, liveParentSize.width),
height: Math.max(initialPsize.height, liveParentSize.height),
}
: initialPsize;
const myW = subtreeSize.get(ws.id)?.width ?? CHILD_DEFAULT_WIDTH;
const myH = subtreeSize.get(ws.id)?.height ?? CHILD_DEFAULT_HEIGHT;
const overlapsX =

View File

@ -9,6 +9,7 @@ import { api } from "@/lib/api";
import { showToast } from "@/components/Toaster";
import type { WorkspaceData, WSMessage } from "./socket";
import { handleCanvasEvent } from "./canvas-events";
import { markDeleted, wasRecentlyDeleted } from "./deleteTombstones";
import {
buildNodesAndEdges,
computeAutoLayout,
@ -138,6 +139,16 @@ interface CanvasState {
updateNodeData: (id: string, data: Partial<WorkspaceNodeData>) => void;
restartWorkspace: (id: string) => Promise<void>;
removeNode: (id: string) => void;
  /** Remove a node AND every descendant in one atomic update. Mirrors
   * the server-side cascade: `DELETE /workspaces/:id?confirm=true`
   * drops the row plus every descendant in one transaction. The
   * caller (Canvas / DetailsTab delete handlers) used to call
   * `removeNode(rootId)` and rely on per-descendant WORKSPACE_REMOVED
   * WS events to clear the rest. When the WS is unhealthy those
   * events never arrive and the children orphan to the root until a
   * manual page refresh. `removeSubtree` makes the cascade
   * WS-independent. */
removeSubtree: (rootId: string) => void;
setDragOverNode: (id: string | null) => void;
nestNode: (draggedId: string, targetId: string | null) => Promise<void>;
isDescendant: (ancestorId: string, nodeId: string) => boolean;
@ -177,6 +188,15 @@ interface CanvasState {
setPendingDelete: (
v: { id: string; name: string; hasChildren: boolean; children: { id: string; name: string }[] } | null
) => void;
/** Node IDs whose DELETE request is in flight. Populated the moment
* the user confirms a cascade delete; drained as WORKSPACE_REMOVED
* events strip the nodes (or all-at-once on request failure). Lets
* the canvas render the "don't touch — something is happening"
* treatment (dim + non-draggable) during the network round trip
* and the server-side cascade, matching the deploy-lock UX. */
deletingIds: Set<string>;
beginDelete: (ids: Iterable<string>) => void;
endDelete: (ids: Iterable<string>) => void;
searchOpen: boolean;
setSearchOpen: (open: boolean) => void;
viewport: { x: number; y: number; zoom: number };
@ -190,8 +210,8 @@ interface CanvasState {
batchPause: () => Promise<void>;
batchDelete: () => Promise<void>;
/** Agent-pushed messages keyed by workspace ID. ChatTab consumes and clears these. */
agentMessages: Record<string, Array<{ id: string; content: string; timestamp: string; attachments?: Array<{ name: string; uri: string; mimeType?: string; size?: number }> }>>;
consumeAgentMessages: (workspaceId: string) => Array<{ id: string; content: string; timestamp: string; attachments?: Array<{ name: string; uri: string; mimeType?: string; size?: number }> }>;
/** WebSocket connection status — drives the live indicator in the Toolbar. */
wsStatus: "connected" | "connecting" | "disconnected";
setWsStatus: (status: "connected" | "connecting" | "disconnected") => void;
@ -309,6 +329,17 @@ export const useCanvasStore = create<CanvasState>((set, get) => ({
closeContextMenu: () => set({ contextMenu: null }),
pendingDelete: null,
setPendingDelete: (v) => set({ pendingDelete: v }),
deletingIds: new Set<string>(),
beginDelete: (ids) => {
const next = new Set(get().deletingIds);
for (const id of ids) next.add(id);
set({ deletingIds: next });
},
endDelete: (ids) => {
const next = new Set(get().deletingIds);
for (const id of ids) next.delete(id);
set({ deletingIds: next });
},
searchOpen: false,
setSearchOpen: (open) => set({ searchOpen: open }),
agentMessages: {},
@ -775,9 +806,75 @@ export const useCanvasStore = create<CanvasState>((set, get) => ({
});
},
removeSubtree: (rootId) => {
const { nodes, edges, selectedNodeId } = get();
// Build a parentId → childIds index once so the descent is O(N),
// not O(N · depth). The store typically holds <500 nodes; even
// doing a linear scan per parent would be fine, but the index
// keeps the cost predictable as orgs grow.
const childrenByParent = new Map<string, string[]>();
for (const n of nodes) {
const p = n.data.parentId ?? null;
if (p === null) continue;
const arr = childrenByParent.get(p);
if (arr) arr.push(n.id);
else childrenByParent.set(p, [n.id]);
}
const removed = new Set<string>([rootId]);
const stack = [rootId];
while (stack.length) {
const cur = stack.pop()!;
const kids = childrenByParent.get(cur);
if (!kids) continue;
for (const k of kids) {
if (!removed.has(k)) {
removed.add(k);
stack.push(k);
}
}
}
// Tombstone removed ids so an in-flight GET /workspaces can't
// resurrect them via hydrate (#2069).
markDeleted(removed);
set({
nodes: nodes.filter((n) => !removed.has(n.id)),
edges: edges.filter((e) => !removed.has(e.source) && !removed.has(e.target)),
selectedNodeId:
selectedNodeId !== null && removed.has(selectedNodeId)
? null
: selectedNodeId,
});
},
hydrate: (workspaces: WorkspaceData[]) => {
// Drop ids tombstoned by a recent removeSubtree (#2069 — stale
// in-flight GET /workspaces).
const live = workspaces.filter((w) => !wasRecentlyDeleted(w.id));
const layoutOverrides = computeAutoLayout(live);
// Carry the live measured/grown parent sizes from the existing
// store into the rebuild. buildNodesAndEdges runs an auto-rescue
// pass on each child to detach orphans whose stored relative
// position falls outside the parent bbox — without the live
// size, the bbox is the initial grid-derived minimum, which
// false-flags any child the user has dragged into the
// user-grown area. Periodic rehydrate (socket.ts health check,
// 30s) was reasserting the rescue against legitimate user
// placements, causing the "child jumps to weird location, then
// settles" symptom.
const current = get().nodes;
const currentParentSizes = new Map<string, { width: number; height: number }>();
for (const n of current) {
const w = (n.measured?.width ?? n.width) as number | undefined;
const h = (n.measured?.height ?? n.height) as number | undefined;
if (typeof w === "number" && typeof h === "number") {
currentParentSizes.set(n.id, { width: w, height: h });
}
}
const { nodes, edges } = buildNodesAndEdges(
live,
layoutOverrides,
currentParentSizes,
);
set({ nodes, edges });
for (const [nodeId, { x, y }] of layoutOverrides) {
api.patch(`/workspaces/${nodeId}`, { x, y }).catch(() => {});

View File

@ -0,0 +1,53 @@
/**
* React Flow className helpers shared across the store and canvas
* hooks. React Flow's Node.className / Edge.className is a single
* space-separated string, so every call site was previously doing
 * the same `.split/.filter/.join` dance; centralise it here so
* any future class manipulation follows one policy.
*/
/** Add `cls` to the existing className, de-duplicating. Returns
 * the (possibly new) string; undefined/empty input returns just `cls`. */
export function appendClass(existing: string | undefined, cls: string): string {
if (!existing) return cls;
const parts = existing.split(/\s+/).filter(Boolean);
if (parts.includes(cls)) return existing;
parts.push(cls);
return parts.join(" ");
}
/** Remove `cls` if present. Returns the (possibly empty) string. */
export function removeClass(existing: string | undefined, cls: string): string {
if (!existing) return "";
return existing
.split(/\s+/)
.filter((c) => c && c !== cls)
.join(" ");
}
/** Schedule `removeClass(nodeId, cls)` on the `nodes` slice after
 * `delayMs`. The callers used to inline this twice (once for
 * parent-pulse cleanup, once for spawn-class cleanup) and now
* share the same impl so future one-shot animation classes land
* consistently.
*
* No-ops when `window` is undefined (SSR). Accepts the store's
* get/set pair directly rather than a store reference so it
* composes with the existing handleCanvasEvent signature. */
export function scheduleNodeClassRemoval(
nodeId: string,
cls: string,
delayMs: number,
get: () => { nodes: Array<{ id: string; className?: string }> },
set: (partial: Record<string, unknown>) => void,
): void {
if (typeof window === "undefined") return;
window.setTimeout(() => {
const state = get();
set({
nodes: state.nodes.map((n) =>
n.id === nodeId ? { ...n, className: removeClass(n.className, cls) } : n,
),
});
}, delayMs);
}
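// Usage sketch, for illustration only (the real call sites live in
// canvas-events.ts; the node shape here is simplified):
//
//   node.className = appendClass(node.className, "mol-deploy-spawn");
//   // ... keyframe plays ...
//   scheduleNodeClassRemoval(node.id, "mol-deploy-spawn", 400, get, set);
//   // removeClass is the manual counterpart when no delay is needed:
//   node.className = removeClass(node.className, "mol-deploy-spawn");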

View File

@ -0,0 +1,55 @@
/**
* Transient "recently deleted" map keyed by workspace id.
*
* `removeSubtree` calls `markDeleted(ids)` on every removal; `hydrate`
* calls `wasRecentlyDeleted(id)` to filter out incoming workspaces
 * whose ids match a fresh tombstone, which prevents an in-flight
* GET /workspaces from resurrecting just-deleted nodes via hydrate.
*
* TTL is shared with the WS-fallback poll cadence so a single
* round-trip is covered. Module-level (not store state) so it doesn't
* trigger React Flow re-renders. (#2069)
*/
import { FALLBACK_POLL_MS } from "./socket";
const TOMBSTONE_TTL_MS = FALLBACK_POLL_MS;
const tombstones = new Map<string, number>();
function gcExpired(now: number): void {
for (const [id, deletedAt] of tombstones) {
if (now - deletedAt >= TOMBSTONE_TTL_MS) {
tombstones.delete(id);
}
}
}
export function markDeleted(ids: Iterable<string>): void {
const now = Date.now();
gcExpired(now);
for (const id of ids) {
tombstones.set(id, now);
}
}
export function wasRecentlyDeleted(id: string): boolean {
const deletedAt = tombstones.get(id);
if (deletedAt === undefined) return false;
if (Date.now() - deletedAt >= TOMBSTONE_TTL_MS) {
tombstones.delete(id);
return false;
}
return true;
}
/** Test-only: clear the module-level map between tests. Production code
 * must not call this; the map is intentionally process-lifetime. */
export function __resetTombstonesForTest(): void {
tombstones.clear();
}
/** Test-only: inspect the current tombstone count. */
export function __tombstoneCountForTest(): number {
return tombstones.size;
}
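// Vitest-style sketch of the TTL behavior (the exported names above are
// real; the fake-timer choreography is illustrative and assumes
// vi.useFakeTimers() so Date.now() advances with the mock clock):
//
//   __resetTombstonesForTest();
//   markDeleted(["ws-1"]);
//   wasRecentlyDeleted("ws-1");               // true inside the TTL
//   vi.advanceTimersByTime(FALLBACK_POLL_MS);
//   wasRecentlyDeleted("ws-1");               // false: tombstone expired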

View File

@ -12,30 +12,129 @@ export interface WSMessage {
payload: Record<string, unknown>;
}
/** Window during which a freshly-completed rehydrate is reused
 * instead of firing a new GET. Picked to absorb the connect →
 * health-check sequence (rehydrate runs once on onopen, then the
 * first health-check tick fires immediately after; both should share
 * the same fetch) without holding back legitimately-spaced rehydrates
* triggered by genuine WS silence later. */
const REHYDRATE_DEDUP_WINDOW_MS = 1_500;
/** Pure dedup gate for rehydrate(). Tracks two states:
*
* - in-flight (between beginFetch and completeFetch): every
* shouldSkip returns true.
* - post-completion window (now < completedAt + windowMs):
* shouldSkip returns true.
*
* Extracted from ReconnectingSocket so the gate is unit-testable
* without mocking dynamic imports or fake timers. The class itself
* is stateful but tiny instances are not shared across sockets. */
export class RehydrateDedup {
private inFlight = false;
// -Infinity so the very first shouldSkip(now) call always passes
// (now - (-Infinity) > windowMs). Initializing to 0 would false-
// trip on test runs where now is also 0 (vi.useFakeTimers default
// clock) AND on real runs in the first 1.5s after epoch on
// clock-skewed systems.
private completedAt = Number.NEGATIVE_INFINITY;
constructor(private readonly windowMs: number) {}
shouldSkip(now: number): boolean {
if (this.inFlight) return true;
if (now - this.completedAt < this.windowMs) return true;
return false;
}
beginFetch(): void {
this.inFlight = true;
}
completeFetch(now: number = Date.now()): void {
this.inFlight = false;
this.completedAt = now;
}
}
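// Illustrative call sequence (rehydrate() below is the real consumer;
// the 1_500 here mirrors REHYDRATE_DEDUP_WINDOW_MS):
//
//   const gate = new RehydrateDedup(1_500);
//   if (!gate.shouldSkip(Date.now())) {
//     gate.beginFetch();             // every shouldSkip() is now true
//     try { /* GET /workspaces */ }
//     finally { gate.completeFetch(Date.now()); } // opens the reuse window
//   }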
/** Cadence for the HTTP fallback rehydrate that runs while the WS is
* in connecting/disconnected limbo. 10s is short enough that the user
 * sees STARTING → ONLINE within one tick after the platform finishes
* provisioning, but long enough to not pound /workspaces if the
* network truly is down. The dedup gate inside rehydrate() collapses
* this against the post-onopen rehydrate, so reconnect doesn't pay
* for a duplicate fetch. */
export const FALLBACK_POLL_MS = 10_000;
class ReconnectingSocket {
private ws: WebSocket | null = null;
private attempt = 0;
private url: string;
private lastEventTime = 0;
private healthCheckTimer: ReturnType<typeof setInterval> | null = null;
private reconnectTimer: ReturnType<typeof setTimeout> | null = null;
// Polls /workspaces while the WS is unhealthy so the canvas reflects
// truth even when realtime events aren't arriving. Without this the
// store can stay frozen for minutes — e.g. workspaces transition
// STARTING → ONLINE on the platform but the canvas keeps showing
// STARTING until the WS finally reconnects, triggering false
// "Provisioning Timeout" banners on already-online workspaces.
private fallbackPollTimer: ReturnType<typeof setInterval> | null = null;
// disposed signals that disconnect() has been called. Any in-flight
// reconnect / handshake must abort early rather than attach to a
// socket the caller no longer owns — otherwise React StrictMode's
// effect double-invoke (and any future intentional disconnect)
// leaves a zombie WebSocket alive forever.
private disposed = false;
// In-flight singleton + dedup window for rehydrate. Two reasons to
// collapse rapid calls:
// 1. connect.onopen fires rehydrate immediately, and the very next
// health-check tick may fire it again before the first GET
// returns — wasted round trip + rebuild churn that resets the
// mid-flight UI state (auto-rescue heuristics, grow passes).
// 2. Future call sites (a manual "Refresh" button, post-import
// hydrate, error-recovery rehydrate) might pile up.
// Keeping rehydrate idempotent at the call-site level means each
// caller can fire-and-forget without coordinating.
private rehydrateInFlight: Promise<void> | null = null;
private rehydrateDedup = new RehydrateDedup(REHYDRATE_DEDUP_WINDOW_MS);
constructor(url: string) {
this.url = url;
}
connect() {
if (this.disposed) return;
useCanvasStore.getState().setWsStatus("connecting");
// Start the HTTP fallback poll up-front, not just on onclose. Two
// scenarios this guards against:
// 1. The very first connect attempt — onclose hasn't fired yet
// because we never had a successful onopen.
// 2. A failed handshake where the browser takes tens of seconds
// to surface as onclose (Chrome can hold a SYN-SENT WebSocket
// open for ~75s before giving up).
// Idempotent — startFallbackPoll early-returns if a timer is
// already running, so calling it from both places is cheap.
this.startFallbackPoll();
const ws = new WebSocket(this.url);
this.ws = ws;
ws.onopen = () => {
if (this.disposed || this.ws !== ws) {
// Late-open on an abandoned socket. Close it cleanly; the
// caller already moved on.
try { ws.close(); } catch { /* noop */ }
return;
}
this.attempt = 0;
this.lastEventTime = Date.now();
useCanvasStore.getState().setWsStatus("connected");
this.stopFallbackPoll();
this.rehydrate();
this.startHealthCheck();
};
ws.onmessage = (event) => {
if (this.disposed || this.ws !== ws) return;
this.lastEventTime = Date.now();
try {
const msg: WSMessage = JSON.parse(event.data);
@ -45,15 +144,21 @@ class ReconnectingSocket {
}
};
ws.onclose = () => {
// Fired on intentional close (disposed) OR server/network drop.
// Only schedule a reconnect when the socket is still live AND
// corresponds to the WS we just tore down (prevents a stale
// onclose from a zombie socket from re-arming the loop).
if (this.disposed || this.ws !== ws) return;
this.stopHealthCheck();
useCanvasStore.getState().setWsStatus("connecting");
this.startFallbackPoll();
const delay = Math.min(1000 * 2 ** this.attempt, 30000);
this.attempt++;
this.reconnectTimer = setTimeout(() => this.connect(), delay);
};
ws.onerror = () => {
// Suppressed — onclose handles reconnection. onerror fires before onclose
// and the Event object doesn't contain useful info (serializes to {}).
};
@ -80,20 +185,78 @@ class ReconnectingSocket {
}
}
/** While the WS is in connecting/disconnected limbo, poll /workspaces
* so the store stays fresh. The reconnect attempts continue in
* parallel; whichever recovers first wins. rehydrate()'s own dedup
* gate prevents this from racing with the open-time rehydrate. */
private startFallbackPoll() {
if (this.fallbackPollTimer) return;
this.fallbackPollTimer = setInterval(() => {
if (this.disposed) {
this.stopFallbackPoll();
return;
}
void this.rehydrate();
}, FALLBACK_POLL_MS);
}
private stopFallbackPoll() {
if (this.fallbackPollTimer) {
clearInterval(this.fallbackPollTimer);
this.fallbackPollTimer = null;
}
}
private rehydrate(): Promise<void> {
// Reuse an in-flight fetch — a second caller during the GET
// shouldn't kick off a parallel one.
if (this.rehydrateInFlight) return this.rehydrateInFlight;
if (this.rehydrateDedup.shouldSkip(Date.now())) {
return Promise.resolve();
}
    // beginFetch lives INSIDE the IIFE so any future code added
    // between gate-check and IIFE construction can't throw and leave
// the gate stuck at inFlight=true forever. Today there's nothing
// that can throw here, but the cost of being defensive is one
// extra microtask of "in flight" status — negligible.
const promise = (async () => {
this.rehydrateDedup.beginFetch();
try {
const { api } = await import("@/lib/api");
const workspaces = await api.get<WorkspaceData[]>("/workspaces");
if (this.disposed) return;
useCanvasStore.getState().hydrate(workspaces);
} catch {
// Rehydration failed — will retry on next health check cycle.
} finally {
this.rehydrateDedup.completeFetch(Date.now());
this.rehydrateInFlight = null;
}
})();
this.rehydrateInFlight = promise;
return promise;
}
disconnect() {
this.disposed = true;
this.stopHealthCheck();
this.stopFallbackPoll();
if (this.reconnectTimer) {
clearTimeout(this.reconnectTimer);
this.reconnectTimer = null;
}
if (this.ws) {
// Detach listeners before close() so we don't route the close
// event through our onclose → scheduleReconnect path. Belt +
// braces on top of the `disposed` check, because StrictMode
// cycles through so fast that an attached onclose can fire
// after disposed=true is set but before this assignment runs.
this.ws.onopen = null;
this.ws.onmessage = null;
this.ws.onclose = null;
this.ws.onerror = null;
try { this.ws.close(); } catch { /* noop */ }
this.ws = null;
}
useCanvasStore.getState().setWsStatus("disconnected");

View File

@ -0,0 +1,151 @@
/**
* Org-deploy animation module.
*
* Loaded globally (see app/globals.css). All values come from
* theme-tokens.css so a theme swap needs zero edits here.
*
 * Component contract: canvas/src/components/canvas code adds
* these classes to the React Flow node / edge wrappers:
*
* .mol-deploy-spawn One-shot entry animation on a
* node that just arrived. Applied
* by canvas-events.ts for 600 ms
* then removed.
* .mol-deploy-shimmer Persistent border shimmer while
* a node's status === "provisioning".
* Removed when status flips to
* "online" / "failed".
* .mol-deploy-parent-pulse One-shot acknowledgement pulse
* on the parent when a child lands.
* Applied for parent-pulse duration
* then removed.
* .mol-deploy-locked Applied to every non-root node
* inside a deploying org so it dims
* and the cursor signals un-
* draggable.
* .mol-deploy-root-complete One-shot pop + glow on the root
* when the last child comes online.
*
 * Edges use React Flow edge data to pick styling; see the
* selectors below the node keyframes.
*
* Reduced motion is handled at the bottom via the same guard
* globals.css already installs for other animations.
*/
/*
Keyframes kept terse; values come from variables so
duplication across themes is nil.
*/
@keyframes mol-deploy-spawn {
/* Gentle fade-in-place. The earlier "spring from parent" motion
collided with the server-computed grid positions (parent and
child used different coord origins once the parent was placed
on the client's grid instead of the template's absolute
coords), which landed children in wrong slots. Keeping the
animation to a simple opacity+scale lets the server's layout
win and reads as "node arrived" without the over-engineered
spring. */
from { opacity: 0; transform: scale(0.85); }
to { opacity: 1; transform: scale(1); }
}
/* mol-deploy-parent-pulse keyframe removed with the effect: the
box-shadow expanding ring made the parent card visibly "flash" on
every child arrival when the grow pass also bumped width/height.
Kept as a deliberate non-class so the theme-tokens vars can drop
with it on the next theme pass. */
@keyframes mol-deploy-root-complete {
0% { transform: scale(1); box-shadow: 0 0 0 0 transparent; }
40% { transform: scale(var(--mol-deploy-root-scale-peak)); box-shadow: var(--mol-deploy-root-glow); }
100% { transform: scale(1); box-shadow: 0 0 0 0 transparent; }
}
/* (mol-deploy-edge-draw keyframe removed with the edge effects.) */
@keyframes mol-deploy-cancel-pulse {
0%, 100% { box-shadow: 0 0 0 0 var(--mol-deploy-cancel-ring); }
50% { box-shadow: 0 0 0 10px transparent; }
}
/*
Node classes
*/
/* Qualify with .react-flow__node so this rule beats the default
`node-appear` animation defined later in globals.css. Without
the qualifier, CSS source-order wins and the standard
   node-appear overrides our scale/opacity keyframe, visually
   dropping the spawn fade-in. */
.react-flow__node.mol-deploy-spawn {
animation:
mol-deploy-spawn var(--mol-duration-spawn) var(--mol-easing-bounce-out) both;
}
/* Provisioning signal: the earlier rotating conic-gradient border
read as distracting "spinner" clutter during a 15-child org
import (dozens of them spinning simultaneously). A static dim
(reduced opacity + saturation) communicates "this one is still
coming online" without the motion noise. The locked-child style
already uses the same pattern we reuse the filter values so
a provisioning ROOT node and a locked CHILD look consistent. */
.mol-deploy-shimmer {
filter: saturate(var(--mol-deploy-locked-saturation)) opacity(var(--mol-deploy-locked-opacity));
transition: filter var(--mol-duration-base) var(--mol-easing-standard);
}
.mol-deploy-locked {
filter: saturate(var(--mol-deploy-locked-saturation)) opacity(var(--mol-deploy-locked-opacity));
cursor: not-allowed !important;
transition: filter var(--mol-duration-base) var(--mol-easing-standard);
}
.react-flow__node.mol-deploy-root-complete {
animation: mol-deploy-root-complete var(--mol-duration-root-complete) var(--mol-easing-emphasize) both;
}
/*
   Edge classes: intentionally inert.
Earlier revisions painted incoming edges with a dashed-blueprint
animated-laser-trace effect as the child landed. User feedback
   on the first demo was "remove connection line effects"; the
moving dashes read as noise during a multi-child deploy. Keeping
the class hooks so canvas-events.ts event handlers can still
apply/strip them without blowing up, but the styling is a no-op
(edges fall through to the default styling in globals.css).
If a future demo wants the effect back, wire the rules below.
*/
/*
   Cancel-deployment pill, rendered by OrgCancelButton.tsx and
attached to the root node during deploy. Class `.mol-deploy-cancel`
is always applied; the pulse is additive.
*/
.mol-deploy-cancel {
background: var(--mol-deploy-cancel-bg);
color: var(--mol-deploy-cancel-text);
transition: background var(--mol-duration-fast) var(--mol-easing-standard);
}
.mol-deploy-cancel:hover {
background: var(--mol-deploy-cancel-bg-hover);
}
.mol-deploy-cancel-pulse {
animation: mol-deploy-cancel-pulse var(--mol-duration-parent-pulse) var(--mol-easing-standard) infinite;
}
/*
   Reduced-motion guard: mirrors globals.css's policy so this
module stays WCAG 2.3.3 compliant without relying on the
global file being loaded first.
*/
@media (prefers-reduced-motion: reduce) {
.react-flow__node.mol-deploy-spawn,
.react-flow__node.mol-deploy-root-complete,
.mol-deploy-cancel-pulse {
animation: none !important;
}
/* Dim-light signal is already static; no override needed. */
}

View File

@ -0,0 +1,69 @@
/**
 * Canvas theme tokens: single source of truth for colors, durations,
* easings, and sizes used by every animated / stateful canvas
* component. Importable from any stylesheet; individual feature
* modules (org-deploy.css, settings-panel.css, ...) only reference
* variables defined here so a future theme swap touches this one
* file.
*
* Adding a theme:
* Put a scoped override block like `[data-theme="light"] { ... }`
* and set only the tokens whose values differ from the default
* dark theme. Unset tokens inherit the default.
*
* Naming convention:
* --mol-<feature>-<semantic-role> values the user sees
* --mol-duration-<name> motion timings
* --mol-easing-<name> motion curves
* Prefix `mol-` avoids collisions with Tailwind / React Flow vars.
*/
:root {
/*
    Motion primitives: pick one of these; don't hardcode ms
values in feature stylesheets. If a new feature genuinely
needs a bespoke duration, add a token here and reference it.
*/
--mol-duration-fast: 150ms;
--mol-duration-base: 300ms;
--mol-duration-spawn: 350ms;
--mol-duration-root-complete: 700ms;
--mol-duration-fit-view: 800ms;
--mol-easing-standard: cubic-bezier(0.2, 0, 0, 1);
--mol-easing-bounce-out: cubic-bezier(0.2, 0.8, 0.2, 1.05);
--mol-easing-emphasize: cubic-bezier(0.3, 0, 0, 1);
/*
Org-deploy animation palette (dark theme defaults)
*/
/* Root-complete moment — one-shot glow when the last child lands. */
--mol-deploy-root-glow: 0 0 36px 6px rgba(59, 130, 246, 0.55);
--mol-deploy-root-scale-peak: 1.05;
  /* Locked-child visual: non-root nodes during deploy cannot be
dragged; this dims them so the user's attention stays on the
active spawn. Saturation + opacity instead of a badge keeps
the card recognisable while signalling "not available". */
--mol-deploy-locked-saturation: 0.55;
--mol-deploy-locked-opacity: 0.78;
/* Cancel-deployment pill attached to the root node. Red, pulsing,
one button that kills the whole tree. */
--mol-deploy-cancel-bg: rgba(220, 38, 38, 0.92); /* red-600/92 */
--mol-deploy-cancel-bg-hover: rgba(239, 68, 68, 1); /* red-500 */
--mol-deploy-cancel-ring: rgba(239, 68, 68, 0.45);
--mol-deploy-cancel-text: #fff;
}
/* Example template for a future light theme. Intentionally empty:
   the product hasn't shipped a light theme yet, but this shows the
override surface any future theme must fill. Uncomment + tune
when the light theme lands.
[data-theme="light"] {
--mol-deploy-shimmer-from: rgba(37, 99, 235, 0.08);
--mol-deploy-shimmer-to: rgba(37, 99, 235, 0.9);
...
}
*/

View File

@ -178,6 +178,15 @@ services:
# public images (current state for all 8 templates).
GHCR_USER: "${GHCR_USER:-}"
GHCR_TOKEN: "${GHCR_TOKEN:-}"
# Auto-refresh workspace-template-* images. The watcher polls GHCR
# every 5 min; when a digest moves, it pulls and force-recreates any
# matching ws-* containers (existing /admin/workspace-images/refresh
# logic). Closes the runtime CD chain: merge → containers running
# new code, no operator step. Default ON for local dev because that's
# where the runtime → ws iteration loop is tightest. Set to "false"
# if you don't want the platform to mutate ws-* containers behind
# your back during a long-running test.
IMAGE_AUTO_REFRESH: "${IMAGE_AUTO_REFRESH:-true}"
volumes:
- ./workspace-configs-templates:/configs
- ./org-templates:/org-templates:ro

View File

@ -77,7 +77,7 @@ CREATE TABLE workspace_secrets (
);
```
Stores API keys, credentials, and other secrets needed by workspace agents. Values are encrypted with AES-256-GCM at the application layer. The encryption key comes from the tenant's `SECRETS_ENCRYPTION_KEY` environment variable, provisioned at tenant boot by the control plane (which holds the master key in AWS KMS — see [secrets-key-custody.md](./secrets-key-custody.md)). The key is never stored in the database.
The provisioner reads secrets from this table, decrypts them, and injects them as environment variables when spinning up workspace containers. Secrets are never included in bundles (see [Constraints — Rule 5](../development/constraints-and-rules.md)).
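For illustration, a minimal decrypt-and-inject sketch in TypeScript (the shipped handler is Go: `workspace-server/internal/crypto/aes.go`). The blob layout assumed here, `nonce(12) || ciphertext || tag(16)`, and the `SecretRow` shape are illustrative, not the documented column format:

```ts
import { createDecipheriv } from "node:crypto";

interface SecretRow { name: string; value: Buffer }

// Assumed layout: 12-byte GCM nonce, ciphertext, 16-byte auth tag.
function decryptSecret(key: Buffer, blob: Buffer): string {
  const nonce = blob.subarray(0, 12);
  const tag = blob.subarray(blob.length - 16);
  const ciphertext = blob.subarray(12, blob.length - 16);
  const d = createDecipheriv("aes-256-gcm", key, nonce);
  d.setAuthTag(tag);
  return Buffer.concat([d.update(ciphertext), d.final()]).toString("utf8");
}

// Provisioner shape: decrypt each row and hand the results to the
// container runtime as NAME=value env pairs.
function buildEnv(tenantKey: Buffer, rows: SecretRow[]): string[] {
  return rows.map((r) => `${r.name}=${decryptSecret(tenantKey, r.value)}`);
}
```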

View File

@ -902,7 +902,7 @@ Postgres + Redis + Langfuse only (for local development without containerized wo
| `REDIS_URL` | `redis://localhost:6379` | Redis connection |
| `PORT` | `8080` | Platform listen port |
| `PLATFORM_URL` | `http://host.docker.internal:8080` | Injected to workspace containers |
| `SECRETS_ENCRYPTION_KEY` | Optional | AES-256-GCM key (32 bytes) for tenant secret encryption. Provisioned at tenant boot by the control plane, which holds the master key in AWS KMS — see [secrets-key-custody.md](./secrets-key-custody.md). |
| `CONFIGS_DIR` | `/configs` | Workspace config template directory |
| `PLUGINS_DIR` | `/plugins` | Shared plugin directory |
| `ACTIVITY_RETENTION_DAYS` | `7` | Activity log retention |

View File

@ -0,0 +1,85 @@
# Secrets Key Custody
How the encryption keys that protect Molecule workspace secrets are managed, where each key lives, and what an attacker who compromises one layer can or cannot read.
This document exists because the platform repo (`workspace-server`) reads `SECRETS_ENCRYPTION_KEY` from its process env, which on its own looks like "encryption-at-rest theater." The full custody chain runs through the control plane (`molecule-controlplane`) where AWS KMS holds the key material at rest. Anyone reading only the platform repo sees half the picture.
## Two modes
The control plane's `internal/crypto.Envelope` ships in two modes, picked at boot from env:
| Mode | Trigger | At-rest format | Recommended for |
|------|---------|----------------|-----------------|
| **KMS envelope** | `KMS_KEY_ARN` set | Per-blob KMS-wrapped DEK + AES-256-GCM ciphertext | Production, multi-tenant SaaS |
| **Static key** | Only `SECRETS_ENCRYPTION_KEY` set | AES-256-GCM with one process-wide key | Dev, self-hosted single-tenant |
`Envelope.Decrypt` is dual-mode — it can read either format on the way out, so a deployment can flip from static-key to KMS envelope without re-encrypting historical rows. Code: `molecule-controlplane/internal/crypto/kms.go`.
## KMS envelope flow
When `KMS_KEY_ARN` is configured, every secret write looks like:
1. CP calls `kms.GenerateDataKey(KeyId=KMS_KEY_ARN, KeySpec=AES_256)` → returns `{Plaintext, CiphertextBlob}`.
2. CP encrypts the secret with AES-256-GCM using `Plaintext` as the key.
3. CP discards `Plaintext` from memory; persists the blob:
```
[0x02 prefix][uint16 BE: encrypted_dek_len][encrypted_dek][nonce(12)][ct+tag]
```
The `0x02` byte distinguishes v2 (KMS-wrapped) blobs from legacy static-key blobs.
4. To read: CP calls `kms.Decrypt(CiphertextBlob)` → recovers the AES key → unwraps the GCM ciphertext.
KMS calls cost ~$0.03 per 10k requests. We do not cache DEKs — provisioning rate is orders of magnitude below steady-state reads, and not caching keeps key-rotation reasoning simple.
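A sketch of the write path and blob layout in TypeScript, for illustration only (the real implementation is Go: `molecule-controlplane/internal/crypto/kms.go`; the KMS `GenerateDataKey` call is stubbed with local random bytes so the sketch runs offline):

```ts
import { createCipheriv, randomBytes } from "node:crypto";

// Stand-in for kms.GenerateDataKey(KeyId, AES_256). A real caller gets
// back { Plaintext, CiphertextBlob }; the "wrapped" copy here is fake.
async function generateDataKey(): Promise<{ plaintext: Buffer; wrapped: Buffer }> {
  const plaintext = randomBytes(32);
  return { plaintext, wrapped: Buffer.concat([Buffer.from("stub:"), plaintext]) };
}

async function sealV2(secret: Buffer): Promise<Buffer> {
  const { plaintext: dek, wrapped } = await generateDataKey(); // step 1
  const nonce = randomBytes(12);
  const c = createCipheriv("aes-256-gcm", dek, nonce);         // step 2
  const ct = Buffer.concat([c.update(secret), c.final(), c.getAuthTag()]);
  dek.fill(0);                                // step 3: discard the plaintext DEK
  const header = Buffer.alloc(3);
  header.writeUInt8(0x02, 0);                 // v2 prefix byte
  header.writeUInt16BE(wrapped.length, 1);    // encrypted_dek_len, big-endian
  return Buffer.concat([header, wrapped, nonce, ct]);
}

// Decrypt branches on blob[0]: 0x02 means KMS-unwrap the DEK first;
// anything else falls through to the legacy static-key format. That is
// the dual-mode Envelope.Decrypt behavior described above.
```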
## What lives where
| Layer | Key custody | Plaintext key in memory? |
|-------|-------------|--------------------------|
| AWS KMS | KMS-resident, never leaves the HSM | No (hardware) |
| `molecule-controlplane` process | KMS client + IAM role | Briefly per-secret-op only |
| CP database (`database_url_encrypted`, tenant secrets) | KMS-wrapped blobs | Never |
| Per-tenant `workspace-server` env (`SECRETS_ENCRYPTION_KEY`) | Provisioned at tenant boot by CP | Yes, for the tenant's process lifetime |
| Tenant Postgres (`workspace_secrets.value`) | AES-256-GCM with the tenant's key | Never |
The "plaintext in tenant memory" row is the standard envelope-encryption trade-off: a DEK has to be unwrapped somewhere to be used. The blast radius of compromising one tenant's process is one tenant's secrets — not the whole fleet.
## Threat model
| Attacker capability | Can they read tenant secrets? |
|---------------------|-------------------------------|
| Reads CP database backup | No — KMS unwrap requires IAM-scoped `kms:Decrypt` |
| Steals `KMS_KEY_ARN` value | No — ARN alone does nothing without IAM access |
| Compromises CP IAM role | Yes — can `kms:Decrypt` any wrapped DEK |
| Reads tenant Postgres (one tenant) | No — `SECRETS_ENCRYPTION_KEY` lives only in the tenant's own EC2 process env, not in DB |
| Compromises one tenant's EC2 | Yes for that tenant's secrets, no for any other tenant |
| Compromises CP host | Game over (CP can provision arbitrary tenants) |
The two boundaries the design protects:
- **DB-only compromise (incl. backups)** → secrets remain encrypted; attacker needs separate access to either KMS (prod) or CP env (dev).
- **One-tenant compromise** → blast radius limited to that tenant; no cross-tenant key reuse.
## Rotation
- **Tenant key rotation** (per-tenant `SECRETS_ENCRYPTION_KEY`): re-encrypt the tenant's `workspace_secrets` rows under a new key, then swap the env var. Static-key mode requires this for all rotation; KMS mode only requires it on suspected key compromise.
- **KMS CMK rotation**: AWS KMS handles annual automatic rotation of the customer master key. Re-wrapping data keys is unnecessary because KMS transparently selects the correct key version on each `Decrypt` (prior versions are retained for decrypt-only).
## Audit / compliance posture
For SOC2 / ISO 27001 / customer security questionnaires:
- **Key custody**: AWS KMS (FIPS 140-2 Level 3 HSM-backed)
- **Key isolation**: per-tenant DEK; no shared keys across tenants
- **Access control**: IAM-scoped `kms:Decrypt`, audited via CloudTrail
- **At-rest encryption**: AES-256-GCM (NIST-approved, authenticated)
- **In-transit encryption**: TLS 1.2+ for KMS, CP-to-tenant, tenant-to-DB
- **Rotation**: AWS-managed CMK rotation annually; manual DEK rotation on incident
## Pointers
- KMS envelope code: [`molecule-controlplane/internal/crypto/kms.go`](https://github.com/Molecule-AI/molecule-controlplane/blob/main/internal/crypto/kms.go)
- Static-key fallback: [`molecule-controlplane/internal/crypto/aes.go`](https://github.com/Molecule-AI/molecule-controlplane/blob/main/internal/crypto/aes.go)
- Tenant secrets handler: [`workspace-server/internal/crypto/aes.go`](../../workspace-server/internal/crypto/aes.go)
- Tenant secrets schema: [database-schema.md](./database-schema.md#workspace_secrets)

View File

@ -56,7 +56,7 @@ Direct A2A calls between workspaces are unauthenticated in MVP. Access control i
## 11. Secrets in Postgres, Encrypted
Workspace secrets (API keys, credentials) are stored in Postgres with AES-256-GCM encryption at the application layer. The tenant's `SECRETS_ENCRYPTION_KEY` is provisioned at boot by the control plane, which holds the master key material in AWS KMS (envelope encryption, dual-mode with a static-key fallback for dev). Full custody chain in [secrets-key-custody.md](../architecture/secrets-key-custody.md). Secrets are never included in bundles, never logged, never exposed via API responses.
## 12. Last-Write-Wins for MVP

View File

@ -101,7 +101,9 @@ git push origin runtime-v0.1.6
The `publish-runtime` workflow takes over — checks out the tag, runs
`scripts/build_runtime_package.py --version 0.1.6`, builds wheel + sdist,
runs a smoke import to catch broken rewrites, and uploads to PyPI via
the PyPA Trusted Publisher action (OIDC). No static API token is stored
in this repo — PyPI verifies the workflow's OIDC claim against the
trusted-publisher config registered for `molecule-ai-workspace-runtime`.
For dev/test releases without tagging, dispatch the workflow manually
with an explicit version (e.g. `0.1.6.dev1` — PEP 440 dev/rc/post forms
@ -145,11 +147,18 @@ command. SaaS deployments typically wire step 5 into their normal deploy
pipeline (every release pulls fresh images on every host); local dev fires
it manually after a runtime release lands.
### Auth
PyPI publishing uses **Trusted Publisher (OIDC)** — no static token in the
monorepo. The trusted-publisher config on PyPI binds the
`molecule-ai-workspace-runtime` project to this repo's
`publish-runtime.yml` workflow + `pypi-publish` environment. Rotation is
moot: there is no shared secret to rotate.
### Required secrets
| Secret | Where | Why |
|---|---|---|
| `TEMPLATE_DISPATCH_TOKEN` | molecule-core repo | Fine-grained PAT with `actions:write` on the 8 template repos. Without it the `cascade` job warns and exits clean — PyPI still publishes; templates just don't auto-rebuild. |
### Step 5 specifics
@ -174,6 +183,20 @@ needs Docker socket access (the compose stack mounts
(`docker login ghcr.io` once per host). On a fresh host without GHCR auth,
the pull step warns per runtime and the response surfaces the failures.
**Fully hands-off (opt-in image auto-refresh):**
Set `IMAGE_AUTO_REFRESH=true` on the platform process. A watcher polls
GHCR every 5 minutes for digest changes on each `workspace-template-*:latest`
tag and invokes the same refresh logic the admin endpoint exposes —
no operator action required between "runtime PR merged" and
"containers running new code". Disabled by default because SaaS deploy
pipelines that already pull on every release would do redundant work.
Optional companion env (same as the admin endpoint):
- `GHCR_USER` + `GHCR_TOKEN` — required for private template images;
unused for the current public set, but harmless if set.
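A sketch of the watcher's core check, assuming the standard OCI distribution API (the shipped watcher is Go; the anonymous-token exchange ghcr.io requires even for public pulls is elided, and `refresh` stands in for the admin-endpoint logic):

```ts
// Track the last digest seen per image; HEAD the manifest and compare
// the Docker-Content-Digest header on every poll tick.
const lastDigest = new Map<string, string>();

async function manifestDigest(image: string, tag = "latest"): Promise<string | null> {
  const res = await fetch(`https://ghcr.io/v2/${image}/manifests/${tag}`, {
    method: "HEAD",
    headers: { Accept: "application/vnd.oci.image.index.v1+json" },
  });
  return res.ok ? res.headers.get("docker-content-digest") : null;
}

async function pollOnce(images: string[], refresh: (image: string) => Promise<void>) {
  for (const image of images) {
    const digest = await manifestDigest(image);
    if (!digest) continue;
    const seen = lastDigest.get(image);
    lastDigest.set(image, digest);
    if (seen && seen !== digest) await refresh(image); // first sighting only seeds
  }
}

// setInterval(() => void pollOnce(templates, refreshContainers), 5 * 60_000);
```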
## Local dev (build the package without publishing)
```bash
@ -188,13 +211,53 @@ correctness before pushing a `runtime-v*` tag.
## Writing a new adapter
Use the GitHub template repo
[`Molecule-AI/molecule-ai-workspace-template-starter`](https://github.com/Molecule-AI/molecule-ai-workspace-template-starter)
— it ships with the canonical Dockerfile + adapter.py skeleton + config.yaml
schema + the `repository_dispatch: [runtime-published]` cascade receiver
already wired up. No follow-up setup PR required.
```bash
# Replace <runtime> with your runtime slug (lowercase, hyphenated).
gh repo create Molecule-AI/molecule-ai-workspace-template-<runtime> \
--template Molecule-AI/molecule-ai-workspace-template-starter \
--public \
--description "Molecule AI workspace template: <runtime>"
git clone https://github.com/Molecule-AI/molecule-ai-workspace-template-<runtime>
cd molecule-ai-workspace-template-<runtime>
```
Then fill in the `TODO` markers in:
| File | What to fill in |
|---|---|
| `adapter.py` | Rename class to `<Runtime>Adapter`. Fill in `name()`, `display_name()`, `description()`, `get_config_schema()`. Implement `setup()` and `create_executor()`. |
| `requirements.txt` | Add your runtime's pip dependencies (e.g. `langgraph`, `crewai`, `claude-agent-sdk`). |
| `Dockerfile` | Add runtime-specific apt deps (most runtimes don't need any). Replace ENTRYPOINT only if you need custom boot logic. |
| `config.yaml` | Update top-level `name`/`runtime`/`description`. Add the models your runtime supports to `models[]`. |
| `system-prompt.md` | Default agent prompt. |
After `git push`:
1. The template's `publish-image.yml` builds + pushes
`ghcr.io/molecule-ai/workspace-template-<runtime>:latest` automatically.
2. The next `runtime-vX.Y.Z` tag on `molecule-core` cascades a
`repository_dispatch` event into your new template, rebuilding the image
against the latest runtime — no setup PR required.
3. Register the runtime name in the platform's `RuntimeImages` map (in
`workspace-server/internal/provisioner/provisioner.go`) so it's
selectable in the canvas.
## When the starter itself needs to evolve
If the canonical shape changes (e.g. `config.yaml` schema gets a new field,
the `BaseAdapter` interface adds a method, the reusable CI workflow
signature changes), update the
[starter](https://github.com/Molecule-AI/molecule-ai-workspace-template-starter)
**first**. Existing templates can either migrate at their own pace or be
touched in a coordinated cleanup PR. Either way, future templates pick up
the new shape from day one.
## Migration note

View File

@ -39,6 +39,7 @@
{"name": "free-beats-all", "repo": "Molecule-AI/molecule-ai-org-template-free-beats-all", "ref": "main"},
{"name": "medo-smoke", "repo": "Molecule-AI/molecule-ai-org-template-medo-smoke", "ref": "main"},
{"name": "molecule-worker-gemini", "repo": "Molecule-AI/molecule-ai-org-template-molecule-worker-gemini", "ref": "main"},
{"name": "reno-stars", "repo": "Molecule-AI/molecule-ai-org-template-reno-stars", "ref": "main"}
{"name": "reno-stars", "repo": "Molecule-AI/molecule-ai-org-template-reno-stars", "ref": "main"},
{"name": "ux-ab-lab", "repo": "Molecule-AI/molecule-ai-org-template-ux-ab-lab", "ref": "main"}
]
}

View File

@ -1,9 +1,25 @@
#!/bin/bash
# Full nuke + rebuild — one command to reset everything.
#
# What "everything" means:
# 1. The compose stack (containers + named volumes + network).
# 2. Dynamically-spawned ws-* workspace containers + their volumes.
# These are NOT in docker-compose.yml — the provisioner creates them
# at workspace-create time, so `compose down -v` leaves them behind.
# Without this step, a fresh DB plus old ws-* containers = ghost
# containers Canvas can't see, eating CPU + memory.
# 3. Repopulating the manifest-managed dirs (workspace-configs-templates/,
# org-templates/, plugins/). These are .gitignored — fresh checkouts
# and post-deletion runs leave them empty, which silently hides the
# entire template palette in Canvas. clone-manifest.sh is idempotent,
# so re-running with already-populated dirs is a fast no-op.
#
# Usage:
# bash scripts/nuke-and-rebuild.sh
set -euo pipefail
ROOT="$(cd "$(dirname "$0")/.." && pwd)"
echo "=== NUKE ==="
docker compose -f "$ROOT/docker-compose.yml" down -v 2>/dev/null || true
docker ps -a --format "{{.Names}}" | grep "^ws-" | xargs -r docker rm -f 2>/dev/null || true
@ -11,6 +27,23 @@ docker volume ls --format "{{.Name}}" | grep "^ws-" | xargs -r docker volume rm
docker network rm molecule-monorepo-net 2>/dev/null || true
echo " cleaned"
echo "=== POPULATE MANIFEST DIRS ==="
# Idempotent: clone-manifest.sh skips dirs that already have content, so a
# re-nuke after templates are populated is a fast no-op (a few stat calls).
# Skip with a clear warning if jq is missing — installing it is a one-time
# step documented in the README quickstart.
if command -v jq >/dev/null 2>&1; then
bash "$ROOT/scripts/clone-manifest.sh" \
"$ROOT/manifest.json" \
"$ROOT/workspace-configs-templates" \
"$ROOT/org-templates" \
"$ROOT/plugins" 2>&1 | tail -3
else
echo " WARNING: jq not installed — skipping template/plugin clone."
echo " Install (brew install jq) and rerun, or Canvas's template"
echo " palette will be empty and provisioning falls back to defaults."
fi
echo "=== REBUILD ==="
docker compose -f "$ROOT/docker-compose.yml" up -d --build
echo " platform + canvas up"

scripts/test-nuke-and-rebuild.sh Executable file
View File

@ -0,0 +1,152 @@
#!/usr/bin/env bash
# E2E test: scripts/nuke-and-rebuild.sh self-bootstraps a clean dev stack.
#
# What this asserts (and why each one matters):
# 1. After nuke+rebuild, workspace-configs-templates/ is populated.
# Regression target: someone deletes the manifest-clone step and
# Canvas silently shows zero templates.
# 2. After nuke+rebuild, no orphan ws-* containers survive on the
# Docker daemon. Regression target: someone removes the ws-*
# reaping lines from the script and old containers haunt every
# future stack with a wiped DB.
# 3. Platform serves /health 200. Regression target: env wiring drift
# or a Dockerfile change that breaks platform startup.
# 4. Platform exposes the templates it sees on disk. Regression target:
# bind-mount drift between docker-compose.yml and the platform
# config (CONFIGS_HOST_DIR / CONFIGS_DIR misalignment).
# 5. The image-auto-refresh watcher (PR #2114) starts. Regression
# target: someone defaults IMAGE_AUTO_REFRESH back to false in
# compose, breaking the runtime CD chain users now rely on.
#
# Usage:
# bash scripts/test-nuke-and-rebuild.sh
#
# Cost: ~3-6 min on a warm cache (plugin clones are the slow part on
# a cold cache, ~30-60s).
#
# Caveats:
# - Requires Docker daemon + jq + curl on PATH.
# - Spawns a fake `ws-deadbeeftest` container with a sleep-forever
# command so we have a known orphan to assert against. Cleanup
# runs in a trap.
# - Does NOT test the runtime CD propagation end-to-end (that's
# issue #2118). Scope here is the local nuke+rebuild loop only.
set -euo pipefail
ROOT="$(cd "$(dirname "$0")/.." && pwd)"
PLATFORM="${PLATFORM:-http://localhost:8080}"
PASS=0
FAIL=0
FAKE_WS="ws-deadbeeftest"
require() {
command -v "$1" >/dev/null 2>&1 || { echo "missing dependency: $1"; exit 2; }
}
require docker
require jq
require curl
cleanup() {
docker rm -f "$FAKE_WS" >/dev/null 2>&1 || true
}
trap cleanup EXIT
# Pre-flight: if another compose project already holds the ports we need,
# bail with a clear message rather than letting the rebuild step fail
# halfway through with a confusing "port already allocated" error. This
# happens routinely when a parallel monorepo checkout has its stack up.
PROJECT="$(basename "$ROOT")"
for port in 5432 6379 8080; do
HOLDER=$(docker ps --filter "publish=$port" --format '{{.Names}}' | head -1)
if [ -n "$HOLDER" ] && [[ "$HOLDER" != "${PROJECT}-"* ]]; then
echo "SKIP: port $port held by container '$HOLDER' from a different compose project."
echo " This test rebuilds the '$PROJECT' stack, which would conflict."
echo " Stop the other stack first (in its own checkout):"
echo " docker compose down -v"
exit 0
fi
done
check() {
local label="$1" cond="$2"
if eval "$cond"; then
echo "PASS: $label"
PASS=$((PASS + 1))
else
echo "FAIL: $label"
echo " cond: $cond"
FAIL=$((FAIL + 1))
fi
}
echo "=== Setup: plant a fake orphan ws-* container ==="
# alpine because it's already on most Docker hosts; sleep so Docker treats
# it as a long-running container worth listing in `docker ps`.
docker run -d --name "$FAKE_WS" --rm=false alpine sleep 3600 >/dev/null
docker ps --filter name="^${FAKE_WS}$" --format '{{.Names}}' | grep -q "^${FAKE_WS}$" || {
echo "FAIL: setup — fake orphan container did not start"
exit 2
}
echo " planted $FAKE_WS"
echo ""
echo "=== Setup: wipe the manifest-managed dirs to simulate a fresh checkout ==="
# Don't actually delete — rename to a sentinel, restore on exit. Avoids
# unrecoverable damage if the test crashes after the rename and operator
# Ctrl-Cs the trap.
for d in workspace-configs-templates org-templates plugins; do
if [ -d "$ROOT/$d" ]; then
mv "$ROOT/$d" "$ROOT/${d}.testbak"
fi
done
restore_dirs() {
for d in workspace-configs-templates org-templates plugins; do
if [ -d "$ROOT/${d}.testbak" ] && [ ! -d "$ROOT/$d" ]; then
mv "$ROOT/${d}.testbak" "$ROOT/$d"
fi
done
}
trap 'cleanup; restore_dirs' EXIT
echo ""
echo "=== Run nuke-and-rebuild.sh (this is what we're testing) ==="
bash "$ROOT/scripts/nuke-and-rebuild.sh" >/tmp/nuke.log 2>&1 || {
echo "FAIL: nuke-and-rebuild.sh exited non-zero. Tail of log:"
tail -30 /tmp/nuke.log
exit 2
}
echo " ran (full log: /tmp/nuke.log)"
echo ""
echo "=== Assertions ==="
check "templates dir populated (8 entries expected)" \
"[ \"\$(ls $ROOT/workspace-configs-templates 2>/dev/null | wc -l | tr -d ' ')\" -ge 8 ]"
check "fake orphan ws-* container reaped" \
"! docker ps -a --filter name=^${FAKE_WS}\$ --format '{{.Names}}' | grep -q ."
# Wait for platform health (compose startup + migrations can take a beat).
echo " waiting for platform /health..."
for _ in $(seq 1 30); do
if curl -sf "$PLATFORM/health" >/dev/null 2>&1; then break; fi
sleep 2
done
check "platform /health returns 200" \
"[ \"\$(curl -s -o /dev/null -w '%{http_code}' $PLATFORM/health)\" = '200' ]"
# Compare templates the platform sees vs. what's on disk. If the bind
# mount is broken, on-disk count won't match in-container count.
DISK_COUNT=$(find "$ROOT/workspace-configs-templates" -mindepth 1 -maxdepth 1 2>/dev/null | wc -l | tr -d ' ')
PLATFORM_COUNT=$(docker exec molecule-monorepo-platform-1 sh -c 'find /configs -mindepth 1 -maxdepth 1 2>/dev/null | wc -l' | tr -d ' ' || echo 0)
check "platform sees same template count as disk ($DISK_COUNT)" \
"[ \"$PLATFORM_COUNT\" = \"$DISK_COUNT\" ]"
# IMAGE_AUTO_REFRESH watcher should log its startup line (PR #2114).
check "image-auto-refresh watcher started" \
"docker logs molecule-monorepo-platform-1 2>&1 | grep -q 'image-auto-refresh: started'"
echo ""
echo "=== Result: $PASS passed, $FAIL failed ==="
[ $FAIL -eq 0 ]

View File

@ -0,0 +1,93 @@
#!/usr/bin/env bash
# E2E test: chat file attachment round-trip
#
# Proves the full drag-drop → agent-reads → agent-returns-file → download
# path against a live workspace. Runs against the local workspace-server
# on :8080 with a hermes workspace already online. The test is provider-
# agnostic as long as the agent has a valid API key — it only asserts
# that attachments surface on both ends, not a specific reply shape.
#
# Usage: WSID=<workspace-id> tests/e2e/test_chat_attachments_e2e.sh
# (pass WSID for an existing hermes workspace)
#
# Prereqs:
# - workspace-server on http://localhost:8080
# - the WSID workspace is online, runtime=hermes
# - a working provider key (MINIMAX_API_KEY / ANTHROPIC_API_KEY / etc.)
# - /workspace writable by the agent user (some templates ship it
# root-owned; chmod 777 for the E2E or use a writable template)
set -euo pipefail
WSID="${WSID:?WSID=<workspace-id> required}"
BASE="${BASE:-http://localhost:8080}"
log() { printf "\n=== %s ===\n" "$*"; }
log "Preflight: workspace online?"
STATUS=$(curl -s "$BASE/workspaces/$WSID" | python3 -c 'import json,sys;print(json.load(sys.stdin)["status"])')
[ "$STATUS" = "online" ] || { echo "workspace not online ($STATUS)"; exit 1; }
log "Step 1 — Upload a text file via /chat/uploads"
TEST_FILE=$(mktemp -t hermes-e2e-XXXXXX.txt)
echo "secret code: $(openssl rand -hex 4)-$(openssl rand -hex 4)" > "$TEST_FILE"
EXPECTED=$(awk '{print $NF}' "$TEST_FILE")
UPLOAD=$(curl -s -X POST "$BASE/workspaces/$WSID/chat/uploads" -F "files=@$TEST_FILE")
URI=$(echo "$UPLOAD" | python3 -c 'import json,sys;print(json.load(sys.stdin)["files"][0]["uri"])')
[ -n "$URI" ] || { echo "upload failed: $UPLOAD"; exit 1; }
echo "uploaded: $URI"
log "Step 2 — A2A message with file part; expect agent to quote the code"
# Build the JSON via a python helper so the URI value doesn't have to be
# shell-interpolated through a heredoc (the { } tokens in a JSON body
# collide with bash brace expansion if the quoting goes wrong).
PAYLOAD=$(URI="$URI" python3 -c '
import json, os
uri = os.environ["URI"]
print(json.dumps({
"jsonrpc":"2.0","id":"e2e-up","method":"message/send",
"params":{"message":{"role":"user","messageId":"e2e-up","kind":"message","parts":[
{"kind":"text","text":"Read the attached file and tell me the exact secret code."},
{"kind":"file","file":{"name":"test.txt","mimeType":"text/plain","uri":uri}},
]},"configuration":{"acceptedOutputModes":["text/plain"],"blocking":True}}}))
')
REPLY=$(curl -s -X POST "$BASE/workspaces/$WSID/a2a" \
-H 'Content-Type: application/json' \
--max-time 120 \
-d "$PAYLOAD")
REPLY_TEXT=$(echo "$REPLY" | python3 -c 'import json,sys;d=json.load(sys.stdin);[print(p.get("text","")) for p in d["result"]["parts"] if p.get("kind")=="text"]')
echo "agent reply: $REPLY_TEXT"
if echo "$REPLY_TEXT" | grep -qF "$EXPECTED"; then
echo "PASS: agent saw the attached file"
else
echo "FAIL: agent reply missing expected code '$EXPECTED'"
exit 1
fi
log "Step 3 — Seed a file inside /workspace and ask agent to reference it"
# Relies on /workspace being writable by the platform (we copy as root via
# docker exec, mimicking the path a real agent would use through its tools).
CONTAINER=$(docker ps --format '{{.Names}}' | grep -E "^ws-${WSID:0:12}" | head -1)
[ -n "$CONTAINER" ] || { echo "container not found"; exit 1; }
docker exec "$CONTAINER" sh -c 'echo "E2E report body $(date -u +%s)" > /workspace/e2e-report.txt'
REPLY=$(curl -s -X POST "$BASE/workspaces/$WSID/a2a" \
-H 'Content-Type: application/json' \
--max-time 120 \
-d '{"jsonrpc":"2.0","id":"e2e-down","method":"message/send","params":{"message":{"role":"user","messageId":"e2e-down","kind":"message","parts":[{"kind":"text","text":"There is a file at /workspace/e2e-report.txt. Mention its exact path in your reply so I can download it."}]},"configuration":{"acceptedOutputModes":["text/plain"],"blocking":true}}}')
FILE_URI=$(echo "$REPLY" | python3 -c 'import json,sys,re;d=json.load(sys.stdin);[print(p["file"]["uri"]) for p in d["result"]["parts"] if p.get("kind")=="file"]' | head -1)
[ -n "$FILE_URI" ] || { echo "FAIL: agent reply had no file part"; echo "$REPLY"; exit 1; }
echo "agent attached: $FILE_URI"
log "Step 4 — Download via /chat/download"
DL_PATH=${FILE_URI#workspace:}
BODY=$(curl -s "$BASE/workspaces/$WSID/chat/download?path=$DL_PATH")
echo "downloaded: $BODY"
if echo "$BODY" | grep -q "E2E report body"; then
echo "PASS: downloaded the agent-returned file"
else
echo "FAIL: download did not return expected body"
exit 1
fi
log "ALL E2E CHECKS PASSED"

View File

@ -0,0 +1,149 @@
#!/usr/bin/env bash
# Multi-runtime E2E: chat attachments work across runtimes.
#
# The platform-level attachment helpers live in
# molecule_runtime.executor_helpers. Every runtime's executor is
# expected to call them. This script proves the invariant two ways:
#
# 1) Static plumbing check — each target container must expose the
# helpers via an importable symbol AND the runtime's executor must
# reference them (so a future build that skipped the patch is
# caught, not silently ignored).
#
# 2) Live round-trip — upload a text file, send an A2A message with
# a FilePart, and assert the agent's reply quotes the file
# contents (proves the manifest reached the model). Skipped with
# a SKIP note (still exits 0) when the runtime lacks valid provider credentials,
# because a missing ANTHROPIC_API_KEY / CLAUDE_CODE_OAUTH_TOKEN
# is infra, not platform plumbing.
#
# Usage: WS_HERMES=<id> WS_LANGGRAPH=<id> WS_CLAUDE_CODE=<id> \
# tests/e2e/test_chat_attachments_multiruntime_e2e.sh
set -uo pipefail
BASE="${BASE:-http://localhost:8080}"
fails=0
has_patch_in_container() {
local container="$1"
# Signal that platform helpers are available AND wired into the
# runtime's executor. Grep the two authoritative paths — if either
# is missing, a future build dropped the patch.
docker exec "$container" python3 -c '
import sys
try:
from molecule_runtime.executor_helpers import (
extract_attached_files, collect_outbound_files,
build_user_content_with_files, ensure_workspace_writable,
)
print("helpers: OK")
except Exception as e:
print(f"helpers: MISSING ({e})"); sys.exit(1)
' 2>&1
}
has_executor_patched() {
# For hermes: /app/executor.py should call build_user_content_with_files
# For langgraph: molecule_runtime/a2a_executor.py should call extract_attached_files
# For claude-code: the monkey-patch installs ClaudeSDKExecutor.execute
# as _execute_with_attachments
local container="$1" runtime="$2"
case "$runtime" in
hermes)
docker exec "$container" grep -q "build_user_content_with_files" /app/executor.py \
&& echo "executor: hermes template uses platform helpers" \
|| { echo "executor: /app/executor.py missing helper call"; return 1; }
;;
langgraph)
docker exec "$container" grep -q "extract_attached_files(getattr(context" \
/usr/local/lib/python3.11/site-packages/molecule_runtime/a2a_executor.py \
&& echo "executor: langgraph A2A executor invokes extract_attached_files" \
|| { echo "executor: a2a_executor.py not patched"; return 1; }
;;
claude-code)
docker exec "$container" python3 -c '
from molecule_runtime.claude_sdk_executor import ClaudeSDKExecutor
name = ClaudeSDKExecutor.execute.__qualname__
assert name.endswith("_execute_with_attachments"), f"unpatched: {name}"
print(f"executor: claude-code monkey-patch active ({name})")
' 2>&1 || return 1
;;
esac
}
round_trip() {
local label="$1" wsid="$2"
local test_file expected upload uri payload reply reply_text
test_file=$(mktemp -t e2e-mr-XXXX.txt)
expected="secret-$(openssl rand -hex 6)"
echo "$expected" > "$test_file"
upload=$(curl -s -X POST "$BASE/workspaces/$wsid/chat/uploads" -F "files=@$test_file")
uri=$(echo "$upload" | python3 -c 'import json,sys;print(json.load(sys.stdin)["files"][0]["uri"])' 2>/dev/null)
[ -z "$uri" ] && { echo "FAIL $label: upload returned no URI: $upload"; rm -f "$test_file"; return 1; }
payload=$(URI="$uri" python3 -c '
import json, os
uri = os.environ["URI"]
print(json.dumps({
"jsonrpc":"2.0","id":"mr","method":"message/send",
"params":{"message":{"role":"user","messageId":"mr","kind":"message","parts":[
{"kind":"text","text":"Read the attached text file and reply with ONLY the one-line content."},
{"kind":"file","file":{"name":"probe.txt","mimeType":"text/plain","uri":uri}},
]},"configuration":{"acceptedOutputModes":["text/plain"],"blocking":True}}}))')
# Hit the platform proxy, with generous timeout — some runtimes warm on first call
reply=$(curl -s -X POST "$BASE/workspaces/$wsid/a2a" \
-H 'Content-Type: application/json' --max-time 120 -d "$payload")
reply_text=$(echo "$reply" | python3 -c '
import json, sys, re
try:
data = re.sub(r"[\x00-\x08\x0b-\x1f]", " ", sys.stdin.read())
d = json.loads(data)
parts = d.get("result",{}).get("parts",[])
print(" ".join(p.get("text","") for p in parts if p.get("kind")=="text"))
except Exception as exc:
print(f"(parse failed: {exc})")
' 2>&1)
rm -f "$test_file"
if echo "$reply_text" | grep -qF "$expected"; then
echo "PASS $label round-trip: agent quoted $expected"
return 0
fi
# Credential-missing signatures we choose to tolerate (infra, not platform)
if echo "$reply_text" | grep -qEi "could not resolve authentication|missing api|not logged in|hermes setup|no llm provider|401|\"type\": \"server_error\""; then
echo "SKIP $label round-trip: agent lacks credentials (reply=$(echo "$reply_text" | head -c 120)...)"
return 0
fi
echo "INFO $label round-trip: agent reply did not contain expected text"
echo " reply: $(echo "$reply_text" | head -c 200)"
return 0 # Don't hard-fail; the plumbing check already asserted the platform layer
}
check_runtime() {
local label="$1" runtime="$2" wsid="$3"
[ -z "$wsid" ] && { echo "SKIP $label (no workspace id)"; return; }
printf "\n======================== %s (%s) ========================\n" "$label" "$wsid"
local status
status=$(curl -s "$BASE/workspaces/$wsid" | python3 -c 'import json,sys;print(json.load(sys.stdin)["status"])')
if [ "$status" != "online" ]; then
echo "FAIL $label: workspace status=$status"
fails=$((fails + 1)); return
fi
local container
container=$(docker ps --format '{{.Names}}' | grep -E "^ws-${wsid:0:12}" | head -1)
[ -z "$container" ] && { echo "FAIL $label: container not found"; fails=$((fails + 1)); return; }
has_patch_in_container "$container" || { echo "FAIL $label: platform helpers missing"; fails=$((fails + 1)); return; }
has_executor_patched "$container" "$runtime" || { echo "FAIL $label: executor not patched"; fails=$((fails + 1)); return; }
round_trip "$label" "$wsid" || { fails=$((fails + 1)); return; }
}
check_runtime "hermes" "hermes" "${WS_HERMES:-}"
check_runtime "langgraph" "langgraph" "${WS_LANGGRAPH:-}"
check_runtime "claude-code" "claude-code" "${WS_CLAUDE_CODE:-}"
printf "\n=================================================\n"
if [ $fails -eq 0 ]; then echo "ALL RUNTIME E2E CHECKS PASSED"; exit 0; fi
echo "FAIL: $fails runtime check(s) failed"
exit 1

View File

@ -195,21 +195,35 @@ TENANT_TOKEN=$(echo "$TENANT_TOKEN_RESP" | python3 -c "import json,sys; print(js
ok "Tenant admin token retrieved (len=${#TENANT_TOKEN})"
# ─── 4. Wait for tenant TLS / DNS propagation ──────────────────────────
# 10 min — same envelope as canvas/e2e/staging-setup.ts TLS_TIMEOUT_MS.
# CF DNS propagation + tunnel hostname registration + ACME cert + edge
# cache routinely take 5-7 min under staging load; the original 3-min
# cap blocked multiple staging→main PRs across 2026-04-24+. Stays
# inside the parent provision envelope so a genuinely-stuck tenant
# Kept below the 20-min provision envelope so a genuinely-stuck tenant
# still fails loud at the earlier provision step rather than masquerading
# as a TLS issue.
# as a TLS issue. CF DNS propagation + tunnel hostname registration +
# ACME cert + edge cache run 5-7 min on a healthy day; +5 min headroom
# over the previous 10-min cap covers the slower path observed in #2090.
#
# On timeout, dump DNS + curl -v + headers so the next failure identifies
# the broken layer (DNS / TLS / HTTP). Authorization is redacted
# defensively in case a future caller adds an auth header to this probe.
log "4/11 Waiting for tenant TLS / DNS propagation..."
TLS_DEADLINE=$(( $(date +%s) + 600 ))
TLS_TIMEOUT_SEC=$((15 * 60))
TLS_DEADLINE=$(( $(date +%s) + TLS_TIMEOUT_SEC ))
TENANT_HOST="${TENANT_URL#http*://}"
TENANT_HOST="${TENANT_HOST%%/*}"
TENANT_HOST="${TENANT_HOST%%:*}"
while true; do
if curl -sSfk --max-time 5 "$TENANT_URL/health" >/dev/null 2>&1; then
break
fi
if [ "$(date +%s)" -gt "$TLS_DEADLINE" ]; then
fail "Tenant URL never responded 2xx on /health within 10 min"
log "── DIAGNOSTIC BURST (TLS-readiness timeout) ──"
log "DNS lookup ($TENANT_HOST):"
getent hosts "$TENANT_HOST" 2>&1 || log " (no DNS resolution)"
log "curl -v $TENANT_URL/health (last 40 lines):"
curl -kv --max-time 10 "$TENANT_URL/health" 2>&1 \
| sed -E 's/(Authorization|Cookie):.*/\1: [redacted]/i' \
| tail -n 40 | sed 's/^/ /' || true
log "── END DIAGNOSTIC ──"
fail "Tenant URL never responded 2xx on /health within ${TLS_TIMEOUT_SEC}s"
fi
sleep 5
done

View File

@ -0,0 +1,190 @@
package main
import (
"bufio"
"log"
"os"
"path/filepath"
"strings"
)
// loadDotEnvIfPresent walks upward from CWD looking for a .env file and
// merges its KEY=VALUE pairs into the process environment. Already-set
// vars (e.g. from `docker run -e`, CI exports, or ad-hoc `KEY=val
// ./binary`) win over file values so operators can override without
// editing the file.
//
// Why walk upward: the binary may be launched from the monorepo root,
// the workspace-server subdir, or anywhere else the operator finds
// convenient. Walking upward from CWD finds the canonical .env
// (gitignored, lives at the monorepo root) regardless of cwd, so a
// fresh `go build -o /tmp/molecule-server ./cmd/server && /tmp/molecule-server`
// from any subdir picks up the same MOLECULE_ENV / DATABASE_URL / etc.
// the operator already has — without sourcing or `set -a`.
//
// Why no godotenv dep: the format we use is simple — KEY=VALUE with
// optional `#` comments and no interpolation — so a tiny in-tree parser
// is auditable, has no supply-chain surface, and avoids drift across
// repos where some teams configure godotenv differently.
//
// Why it's safe in production: the Dockerfile does not COPY .env into
// the image and `.env` is gitignored, so production containers have no
// .env on disk to load. If an operator goes out of their way to put one
// there, the explicit-env-wins rule above means container env still
// dominates.
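//
// A minimal illustrative .env at the monorepo root (values are
// placeholders, not recommended config):
//
//	MOLECULE_ENV=dev
//	DATABASE_URL=postgres://molecule:molecule@localhost:5432/molecule
//	export IMAGE_AUTO_REFRESH=false   # export prefix + inline comment both parse
//
// Launching with DATABASE_URL already exported keeps the exported
// value; the file only backfills the unset keys.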
func loadDotEnvIfPresent() {
path, ok := findDotEnv()
if !ok {
return
}
f, err := os.Open(path)
if err != nil {
log.Printf(".env: open %s: %v (skipping)", path, err)
return
}
defer f.Close()
loaded := 0
skipped := 0
scanner := bufio.NewScanner(f)
for scanner.Scan() {
k, v, ok := parseDotEnvLine(scanner.Text())
if !ok {
continue
}
// Existing env wins. NOTE: an explicitly-set empty string
// (`KEY=` exported from a parent shell) counts as "set" — we
// keep the empty value rather than backfilling from the file.
// Matches Node's `process.env[k] !== undefined` check in the
// canvas's next.config.ts loader so both processes treat the
// same input identically. Operators who want the file value
// to win must `unset KEY` in the launching shell.
if _, exists := os.LookupEnv(k); exists {
skipped++
continue
}
if err := os.Setenv(k, v); err != nil {
log.Printf(".env: set %s: %v", k, err)
continue
}
loaded++
}
if err := scanner.Err(); err != nil {
log.Printf(".env: scan %s: %v", path, err)
}
log.Printf(".env: %s — loaded %d, %d already set in env", path, loaded, skipped)
}
// findDotEnv returns the path of the nearest .env file walking upward
// from CWD. Capped at 6 levels so a deeply-nested launch dir doesn't
// scan the entire filesystem.
//
// Sentinel gate: only accept a .env that sits next to `workspace-server/`
// (the monorepo marker). Without it, a developer running the binary from
// `~/Documents/other-project/` would walk up to `~/.env` and load
// arbitrary variables — a real foot-gun on shared dev machines and a
// possible information-leak vector on bare-metal deploys. Skipping the
// match falls through to "no .env found" which is identical to today's
// pre-fix behavior (the operator must export env explicitly).
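//
// Illustrative walk (paths hypothetical): launched from
// ~/src/molecule-core/platform/cmd/server, the probe visits
//
//	~/src/molecule-core/platform/cmd/server/.env   (absent)
//	~/src/molecule-core/platform/cmd/.env          (absent)
//	~/src/molecule-core/platform/.env              (absent)
//	~/src/molecule-core/.env                       accepted: workspace-server/go.mod sibling
//
// whereas ~/other-project/.env is skipped even when present.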
func findDotEnv() (string, bool) {
dir, err := os.Getwd()
if err != nil {
return "", false
}
for i := 0; i < 6; i++ {
p := filepath.Join(dir, ".env")
if st, err := os.Stat(p); err == nil && !st.IsDir() {
if isMonorepoRoot(dir) {
return p, true
}
// .env exists here but the directory isn't the monorepo
// root — keep walking. Loading it could clobber
// environment with values from an unrelated project.
}
parent := filepath.Dir(dir)
if parent == dir {
break
}
dir = parent
}
return "", false
}
// isMonorepoRoot returns true if `dir` looks like the molecule-core
// monorepo root — the directory that owns the .env we want to load.
// The marker is `workspace-server/go.mod`, which is the canonical
// in-tree go module and exists only in this monorepo. A simple
// `workspace-server/` directory check would false-positive on a fork
// that renamed the dir; the go.mod check is more precise.
func isMonorepoRoot(dir string) bool {
st, err := os.Stat(filepath.Join(dir, "workspace-server", "go.mod"))
return err == nil && !st.IsDir()
}
// parseDotEnvLine parses a single .env line. Returns (key, value, true)
// for KEY=VALUE pairs. Returns (_, _, false) for blanks, comments, and
// malformed lines. Handles:
// - leading `export ` prefix (so shell-friendly .env files written
// for `source .env` or direnv work without modification)
// - leading UTF-8 BOM on the first line (Windows editors)
// - inline `# comment` after a value when preceded by whitespace
// - surrounding `"` or `'` quotes on the value (stripped one matched
// pair); inside a quoted value, `#` is part of the value, not a
// comment marker
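//
// Examples (all pinned by TestParseDotEnvLine in the test file):
//
//	FOO=bar # deploy note        → ("FOO", "bar", true)
//	FOO="value # not a comment"  → ("FOO", "value # not a comment", true)
//	CONFIGS_DIR= # comment       → ("CONFIGS_DIR", "", true)
//	=BAR                         → rejected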
func parseDotEnvLine(line string) (string, string, bool) {
// Strip a UTF-8 BOM if present. bufio.Scanner doesn't filter it,
// so the very first line of a Windows-edited .env would otherwise
// produce a key like U+FEFF + "FOO" that os.Setenv silently accepts.
line = strings.TrimPrefix(line, "\ufeff")
line = strings.TrimSpace(line)
if line == "" || strings.HasPrefix(line, "#") {
return "", "", false
}
// Drop a leading `export ` so lines like `export FOO=bar` (the form
// direnv and many `.env` templates emit) don't end up as a junk key
// with an embedded space. The prefix match uses a literal space:
// `export\tFOO=bar` with a tab is intentionally not stripped, matching
// the TS mirror in canvas/next.config.ts. Shells emit `export ` with a
// space; tabs would only appear in hand-mangled files.
line = strings.TrimPrefix(line, "export ")
line = strings.TrimLeft(line, " \t") // re-trim in case extra whitespace follows `export`
eq := strings.IndexByte(line, '=')
if eq <= 0 {
return "", "", false
}
k := strings.TrimSpace(line[:eq])
v := line[eq+1:]
// Trim leading whitespace so a quoted value's opening quote is at
// v[0]. The comment-detection loop below then treats the position
// after the trim as "start of value" — `KEY= # comment` has its
// `#` at the new v[0] (preceded only by whitespace in the source)
// and is correctly classified as an empty value followed by a
// comment, not as a value of `# comment`.
v = strings.TrimLeft(v, " \t")
// Quoted value: strip one matched pair of surrounding quotes and
// take the contents verbatim (no inline-comment splitting). Must
// happen BEFORE comment detection so `KEY="value # not a comment"`
// keeps the `#` as part of the value.
if len(v) >= 2 && (v[0] == '"' || v[0] == '\'') {
quote := v[0]
if end := strings.IndexByte(v[1:], quote); end >= 0 {
return k, v[1 : 1+end], true
}
// Unterminated quote — fall through to bare-value handling
// (treats the opening quote as a literal char in the value).
}
// Bare value: strip inline comment. A `#` is a comment marker iff
// it's at the start of the (trimmed) value OR is preceded by
// whitespace. `KEY=token#fragment` keeps the `#` as part of the
// value because v[i-1] is alphanum.
for i := 0; i < len(v); i++ {
if v[i] != '#' {
continue
}
if i == 0 || v[i-1] == ' ' || v[i-1] == '\t' {
v = v[:i]
break
}
}
return k, strings.TrimSpace(v), true
}

View File

@ -0,0 +1,211 @@
package main
import (
"os"
"path/filepath"
"testing"
)
func TestParseDotEnvLine(t *testing.T) {
cases := []struct {
in string
k, v string
ok bool
comment string
}{
{in: "", ok: false, comment: "empty line"},
{in: " ", ok: false, comment: "whitespace-only"},
{in: "# top-level comment", ok: false, comment: "full-line comment"},
{in: " # indented comment", ok: false, comment: "indented full-line comment"},
{in: "FOO", ok: false, comment: "no equals"},
{in: "=BAR", ok: false, comment: "missing key"},
{in: "FOO=bar", k: "FOO", v: "bar", ok: true, comment: "plain"},
{in: " FOO=bar", k: "FOO", v: "bar", ok: true, comment: "leading whitespace"},
{in: "FOO=bar ", k: "FOO", v: "bar", ok: true, comment: "trailing whitespace stripped"},
{in: "FOO =bar", k: "FOO", v: "bar", ok: true, comment: "whitespace before equals"},
{in: "FOO=bar # comment", k: "FOO", v: "bar", ok: true, comment: "inline space-hash comment"},
{in: "FOO=bar\t# comment", k: "FOO", v: "bar", ok: true, comment: "inline tab-hash comment"},
{in: "FOO=bar # lots of spaces", k: "FOO", v: "bar", ok: true, comment: "multiple spaces before hash"},
{in: "FOO=bar#nocomment", k: "FOO", v: "bar#nocomment", ok: true, comment: "bare hash inside value preserved"},
{in: "URL=postgres://u:p@h:5432/db?sslmode=disable", k: "URL", v: "postgres://u:p@h:5432/db?sslmode=disable", ok: true, comment: "url with embedded equals"},
{in: "TOKEN=eyJhbGciOiJIUzI1NiJ9.payload.sig=", k: "TOKEN", v: "eyJhbGciOiJIUzI1NiJ9.payload.sig=", ok: true, comment: "base64 padding preserved"},
{in: "FOO=", k: "FOO", v: "", ok: true, comment: "empty value"},
{in: "ADMIN_TOKEN=", k: "ADMIN_TOKEN", v: "", ok: true, comment: "empty value (production gate sentinel)"},
// Regression: the repo's own .env contains lines like
// `CONFIGS_DIR= # Path to ...` where the value
// is empty + an inline comment. Pre-fix parser stripped leading
// whitespace BEFORE detecting the comment, leaving `#` at v[0]
// with nothing preceding it, so the inline-comment check missed
// it and the comment text was returned as the value. Server
// then tried to use the comment as a directory path and template
// loading silently failed (GET /templates returned []).
{in: "CONFIGS_DIR= # Path to /var/foo (auto-discovered if empty)", k: "CONFIGS_DIR", v: "", ok: true, comment: "empty value with leading whitespace + inline comment"},
{in: "FOO= # comment", k: "FOO", v: "", ok: true, comment: "spaces-only value with inline comment"},
{in: "FOO=\t# comment", k: "FOO", v: "", ok: true, comment: "tab-only value with inline comment"},
// `export` prefix: shell-friendly .env files (direnv, .envrc-style)
// — the prefix must be stripped, NOT folded into the key.
{in: "export FOO=bar", k: "FOO", v: "bar", ok: true, comment: "export prefix stripped"},
{in: " export FOO=bar", k: "FOO", v: "bar", ok: true, comment: "leading whitespace + export"},
{in: "export DATABASE_URL=postgres://u:p@h/db", k: "DATABASE_URL", v: "postgres://u:p@h/db", ok: true, comment: "export with URL value"},
// Quoted values: one matched pair of surrounding quotes is
// stripped; embedded `#` survives because it isn't an inline
// comment inside a quote.
{in: `FOO="hello world"`, k: "FOO", v: "hello world", ok: true, comment: "double-quoted value"},
{in: `FOO='hello world'`, k: "FOO", v: "hello world", ok: true, comment: "single-quoted value"},
{in: `FOO="value # not a comment"`, k: "FOO", v: "value # not a comment", ok: true, comment: "hash inside quotes is part of value"},
{in: `FOO= "padded"`, k: "FOO", v: "padded", ok: true, comment: "whitespace before opening quote"},
{in: `FOO="unterminated`, k: "FOO", v: `"unterminated`, ok: true, comment: "unterminated quote stays as bare value"},
// CRLF endings: bufio.Scanner strips \n; \r is left and stripped
// by the value-side TrimSpace. Locking this in so a future
// refactor doesn't accidentally feed \r into os.Setenv.
{in: "FOO=bar\r", k: "FOO", v: "bar", ok: true, comment: "CRLF trailing carriage return stripped"},
// UTF-8 BOM at file start: a Windows-edited .env begins with
// \xEF\xBB\xBF; without explicit stripping the first key would
// be "\ufeffFOO".
{in: "\ufeffFOO=bar", k: "FOO", v: "bar", ok: true, comment: "UTF-8 BOM stripped"},
}
for _, tc := range cases {
t.Run(tc.comment, func(t *testing.T) {
k, v, ok := parseDotEnvLine(tc.in)
if ok != tc.ok {
t.Fatalf("ok = %v, want %v (input=%q)", ok, tc.ok, tc.in)
}
if !tc.ok {
return
}
if k != tc.k || v != tc.v {
t.Fatalf("got (%q, %q), want (%q, %q)", k, v, tc.k, tc.v)
}
})
}
}
// makeFakeMonorepo creates a temp dir that satisfies isMonorepoRoot()
// (i.e., contains workspace-server/go.mod) plus a .env file with the
// given body. Returns the dir so the caller can chdir into it.
func makeFakeMonorepo(t *testing.T, envBody string) string {
t.Helper()
dir := t.TempDir()
wsDir := filepath.Join(dir, "workspace-server")
if err := os.MkdirAll(wsDir, 0o755); err != nil {
t.Fatal(err)
}
if err := os.WriteFile(filepath.Join(wsDir, "go.mod"), []byte("module fake\n"), 0o644); err != nil {
t.Fatal(err)
}
if err := os.WriteFile(filepath.Join(dir, ".env"), []byte(envBody), 0o644); err != nil {
t.Fatalf("write .env: %v", err)
}
return dir
}
func TestLoadDotEnvIfPresent_PreservesExisting(t *testing.T) {
dir := makeFakeMonorepo(t, "DOTENV_TEST_NEW=from_file\nDOTENV_TEST_EXISTING=from_file\n")
// Pre-set one of the keys — file value must NOT clobber it.
t.Setenv("DOTENV_TEST_EXISTING", "from_real_env")
// Ensure the other key starts unset.
os.Unsetenv("DOTENV_TEST_NEW")
t.Cleanup(func() { os.Unsetenv("DOTENV_TEST_NEW") })
// Run from the temp dir so findDotEnv picks our fixture.
prev, err := os.Getwd()
if err != nil {
t.Fatal(err)
}
if err := os.Chdir(dir); err != nil {
t.Fatal(err)
}
t.Cleanup(func() { _ = os.Chdir(prev) })
loadDotEnvIfPresent()
if got := os.Getenv("DOTENV_TEST_NEW"); got != "from_file" {
t.Errorf("DOTENV_TEST_NEW = %q, want %q", got, "from_file")
}
if got := os.Getenv("DOTENV_TEST_EXISTING"); got != "from_real_env" {
t.Errorf("existing env clobbered: got %q, want %q", got, "from_real_env")
}
}
func TestLoadDotEnvIfPresent_NoFile_NoOp(t *testing.T) {
dir := t.TempDir() // empty — no .env at this level
prev, err := os.Getwd()
if err != nil {
t.Fatal(err)
}
if err := os.Chdir(dir); err != nil {
t.Fatal(err)
}
t.Cleanup(func() { _ = os.Chdir(prev) })
// Should not panic, log loud errors, or set anything. Best-effort
// silent miss is the contract.
loadDotEnvIfPresent()
}
func TestFindDotEnv_WalksUpward(t *testing.T) {
root := makeFakeMonorepo(t, "X=1\n")
nested := filepath.Join(root, "a", "b", "c")
if err := os.MkdirAll(nested, 0o755); err != nil {
t.Fatal(err)
}
prev, err := os.Getwd()
if err != nil {
t.Fatal(err)
}
if err := os.Chdir(nested); err != nil {
t.Fatal(err)
}
t.Cleanup(func() { _ = os.Chdir(prev) })
got, ok := findDotEnv()
if !ok {
t.Fatal("expected to find .env walking upward")
}
want := filepath.Join(root, ".env")
// macOS resolves /var → /private/var on TempDir, so compare via
// EvalSymlinks for both sides to dodge that.
gotR, _ := filepath.EvalSymlinks(got)
wantR, _ := filepath.EvalSymlinks(want)
if gotR != wantR {
t.Errorf("findDotEnv() = %q, want %q", got, want)
}
}
func TestFindDotEnv_RejectsUnrelatedDotEnv(t *testing.T) {
// Simulates a developer running the binary from inside an
// unrelated project tree that happens to have its own .env (or
// from $HOME with a personal ~/.env). Without the monorepo
// sentinel, findDotEnv would happily load it and clobber env
// with arbitrary values — a real foot-gun this regression test
// guards against.
dir := t.TempDir()
if err := os.WriteFile(filepath.Join(dir, ".env"), []byte("LEAKY=value\n"), 0o644); err != nil {
t.Fatal(err)
}
prev, err := os.Getwd()
if err != nil {
t.Fatal(err)
}
if err := os.Chdir(dir); err != nil {
t.Fatal(err)
}
t.Cleanup(func() { _ = os.Chdir(prev) })
if got, ok := findDotEnv(); ok {
t.Errorf("findDotEnv() = %q, ok=true; want ok=false (no workspace-server sibling)", got)
}
}

View File

@ -8,6 +8,7 @@ import (
"os"
"os/signal"
"path/filepath"
"strings"
"syscall"
"time"
@ -16,6 +17,7 @@ import (
"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/events"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/handlers"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/imagewatch"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/provisioner"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/registry"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/router"
@ -33,6 +35,14 @@ import (
)
func main() {
// .env auto-load: in dev, the operator keeps MOLECULE_ENV /
// DATABASE_URL / etc. in the monorepo's .env file. Loading it here
// — before any code reads env — means a fresh `/tmp/molecule-server`
// run picks up dev config without `set -a && source .env`. No-op
// in production (Docker image doesn't ship a .env, and existing env
// always wins over file values, so container env stays dominant).
loadDotEnvIfPresent()
// CP self-refresh: pull any operator-rotated config (e.g. a new
// MOLECULE_CP_SHARED_SECRET) before any other code reads env.
// Best-effort — if the CP is unreachable we keep booting with the
@ -221,6 +231,18 @@ func main() {
})
}
// Orphan-container reconcile sweep — finds running containers
// whose workspace row is already status='removed' and stops
// them. Defence in depth on top of the inline cleanup in
// handlers/workspace_crud.go: any Docker hiccup that left a
// container alive after the user clicked delete heals on the
// next sweep instead of leaking forever.
if prov != nil {
go supervised.RunWithRecover(ctx, "orphan-sweeper", func(c context.Context) {
registry.StartOrphanSweeper(c, prov)
})
}
// Provision-timeout sweep — flips workspaces that have been stuck in
// status='provisioning' past the timeout window to 'failed' and emits
// WORKSPACE_PROVISION_TIMEOUT. Without this the UI banner is cosmetic
@ -245,6 +267,18 @@ func main() {
channelMgr := channels.NewManager(wh, broadcaster)
go supervised.RunWithRecover(ctx, "channel-manager", channelMgr.Start)
// Image auto-refresh — closes the runtime CD chain to "merge → containers
// running new code" with no human in between. Polls GHCR for digest
// changes on workspace-template-* :latest tags and invokes the same
// refresh logic /admin/workspace-images/refresh exposes. Opt-in:
// SaaS deploys whose pipeline already pulls every release should leave
// it off (would be redundant work). Self-hosters get true zero-touch.
if prov != nil && strings.EqualFold(os.Getenv("IMAGE_AUTO_REFRESH"), "true") {
svc := handlers.NewWorkspaceImageService(prov.DockerClient())
watcher := imagewatch.New(svc)
go supervised.RunWithRecover(ctx, "image-auto-refresh", watcher.Run)
}
// Wire channel manager into scheduler for auto-posting cron output to Slack
cronSched.SetChannels(channelMgr)

View File

@ -20,6 +20,7 @@ import (
"time"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/events"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/provisioner"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/registry"
"github.com/gin-gonic/gin"
@ -120,18 +121,26 @@ func isUpstreamBusyError(err error) bool {
if err == nil {
return false
}
// Typed sentinels propagate cleanly through *url.Error.Unwrap
// since Go 1.13, so errors.Is is the primary check for both
// DeadlineExceeded and Canceled. The substring fallbacks below
// stay only for shapes net/http does NOT type — bare "EOF" /
// "connection reset" can arrive as plain *net.OpError with no
// errors.Is hook to the stdlib sentinels.
if errors.Is(err, context.DeadlineExceeded) {
return true
}
// applyIdleTimeout uses context.WithCancel; surfaces here as
// Canceled, distinct from DeadlineExceeded but the same "upstream
// busy" class — caller produces a 503 + Retry-After.
if errors.Is(err, context.Canceled) {
return true
}
if errors.Is(err, io.EOF) || errors.Is(err, io.ErrUnexpectedEOF) {
return true
}
// url.Error wraps "read tcp … EOF" and "Post …: context deadline
// exceeded" strings from the stdlib HTTP client without typing the
// inner cause. Fall back to substring match for those.
msg := err.Error()
return strings.Contains(msg, "context deadline exceeded") ||
strings.Contains(msg, "EOF") ||
return strings.Contains(msg, "EOF") ||
strings.Contains(msg, "connection reset")
}
@ -286,7 +295,7 @@ func (h *WorkspaceHandler) proxyA2ARequest(ctx context.Context, workspaceID stri
body = normalizedBody
startTime := time.Now()
resp, cancelFwd, err := h.dispatchA2A(ctx, agentURL, body, callerID)
resp, cancelFwd, err := h.dispatchA2A(ctx, workspaceID, agentURL, body, callerID)
if cancelFwd != nil {
defer cancelFwd()
}
@ -478,11 +487,71 @@ func normalizeA2APayload(body []byte) ([]byte, string, *proxyA2AError) {
return marshaledBody, a2aMethod, nil
}
// idleTimeoutDuration is the per-dispatch silence window: if the
// platform's broadcaster emits no events for this workspace for the
// full duration, the dispatch ctx is cancelled. Resets on every
// broadcaster event for the workspace — including the WORKSPACE_HEARTBEAT
// fired by the registry's /heartbeat handler every 30s, so a runtime
// that's just thinking silently between tool calls keeps the connection
// alive without having to emit ACTIVITY_LOGGED noise.
//
// Pre-2026-04-26 this was 60s, picked when the platform only broadcast
// on TASK_UPDATED (which itself only fires when current_task CHANGES).
// A claude-code agent doing a long packaging step or a slow model thought
// kept the same current_task for >60s, fired no broadcast, got cancelled
// mid-flight. Bumped to 5min as a safety net AND the heartbeat handler
// now broadcasts unconditionally; either change alone closes the gap,
// and together they're defence in depth.
//
// Override via A2A_IDLE_TIMEOUT_SECONDS for ops who want to tune (e.g.
// shorter for canary/test runners that want fail-fast on wedge, longer
// for prod tenants running unusually slow plugins).
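//
// Examples (behaviour pinned by parseIdleTimeoutEnv below):
//
//	A2A_IDLE_TIMEOUT_SECONDS=90    → 90s idle window
//	A2A_IDLE_TIMEOUT_SECONDS=foo   → logged, 5m default
//	A2A_IDLE_TIMEOUT_SECONDS=-30   → logged, 5m default
//	(unset)                        → 5m default, no log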
var idleTimeoutDuration = parseIdleTimeoutEnv(os.Getenv("A2A_IDLE_TIMEOUT_SECONDS"))
// defaultIdleTimeoutDuration is what parseIdleTimeoutEnv returns when
// the env var is unset or invalid. Pulled out as a const so tests can
// reference it without re-deriving the value.
const defaultIdleTimeoutDuration = 5 * time.Minute
// parseIdleTimeoutEnv parses the A2A_IDLE_TIMEOUT_SECONDS value, falling
// back to defaultIdleTimeoutDuration on empty / non-numeric / non-positive
// input. Bad-input cases LOG so an operator who set the wrong value
// doesn't silently get the default and waste hours debugging "why is my
// override not working." Without the log line, A2A_IDLE_TIMEOUT_SECONDS=foo
// or =-30 produces identical observable behaviour to leaving it unset.
func parseIdleTimeoutEnv(v string) time.Duration {
if v == "" {
return defaultIdleTimeoutDuration
}
n, err := strconv.Atoi(v)
if err != nil {
log.Printf("A2A_IDLE_TIMEOUT_SECONDS=%q is not a valid integer; using default %s", v, defaultIdleTimeoutDuration)
return defaultIdleTimeoutDuration
}
if n <= 0 {
log.Printf("A2A_IDLE_TIMEOUT_SECONDS=%d must be > 0; using default %s", n, defaultIdleTimeoutDuration)
return defaultIdleTimeoutDuration
}
return time.Duration(n) * time.Second
}
// dispatchA2A POSTs `body` to `agentURL`. Uses WithoutCancel so delegation
// chains survive client disconnect (browser tab close). Default timeouts:
// canvas (callerID == "") = 5 min, agent-to-agent = 30 min. Callers can
// override via the X-Timeout header (applied to ctx upstream in ProxyA2A).
func (h *WorkspaceHandler) dispatchA2A(ctx context.Context, agentURL string, body []byte, callerID string) (*http.Response, context.CancelFunc, error) {
// chains survive client disconnect (browser tab close). Two layers of
// timeout per dispatch:
//
// - Idle timeout (always applied): cancels the dispatch when no
// broadcaster events for the workspace fire for
// idleTimeoutDuration. Any progress event resets the clock — so
// a long but actively-streaming reply runs forever, while a
// wedged runtime fails fast.
// - Absolute ceiling (agent-to-agent only): 30 min cap as a
// defence against runaway delegation loops. Canvas dispatches
// have no absolute ceiling — the user can wait as long as they
// want; the idle timer is the only hangup signal.
//
// Either layer is overridable by the X-Timeout header upstream in
// ProxyA2A; X-Timeout: 0 explicitly disables the absolute ceiling.
func (h *WorkspaceHandler) dispatchA2A(ctx context.Context, workspaceID, agentURL string, body []byte, callerID string) (*http.Response, context.CancelFunc, error) {
// #1483 SSRF defense-in-depth: the primary call path through
// proxyA2ARequest → resolveAgentURL already validates via isSafeURL
// (a2a_proxy.go:424), but adding the check here closes the gap for
@ -494,19 +563,41 @@ func (h *WorkspaceHandler) dispatchA2A(ctx context.Context, agentURL string, bod
return nil, nil, &proxyDispatchBuildError{err: err}
}
forwardCtx := context.WithoutCancel(ctx)
var cancel context.CancelFunc
var ceilingCancel context.CancelFunc
if _, hasDeadline := ctx.Deadline(); !hasDeadline {
if callerID == "" {
forwardCtx, cancel = context.WithTimeout(forwardCtx, 5*time.Minute)
} else {
forwardCtx, cancel = context.WithTimeout(forwardCtx, 30*time.Minute)
if callerID != "" {
forwardCtx, ceilingCancel = context.WithTimeout(forwardCtx, 30*time.Minute)
}
// callerID == "" (canvas): no absolute ceiling. The idle
// timeout below is the only deadline.
}
// Idle timeout — cancels the dispatch ctx after
// idleTimeoutDuration of broadcaster silence for this workspace.
// Always applied (canvas + agent-to-agent both benefit; the
// ceiling above is a separate runaway-loop cap that only fires
// for agent traffic). Combines with the ceiling cancel into a
// single returned cancel func that the caller defers.
// applyIdleTimeout needs SubscribeSSE, which only lives on the
// concrete *events.Broadcaster, not on the EventEmitter interface
// the handler now stores. Type-assert and fall through to a no-op
// idle timer if the broadcaster doesn't support subscriptions
// (e.g. the EventEmitter mock used by some tests). Production wires
// the concrete *Broadcaster, so the assertion always succeeds in
// real deploys.
var b *events.Broadcaster
if concrete, ok := h.broadcaster.(*events.Broadcaster); ok {
b = concrete
}
forwardCtx, idleCancel := applyIdleTimeout(forwardCtx, b, workspaceID, idleTimeoutDuration)
cancel := func() {
idleCancel()
if ceilingCancel != nil {
ceilingCancel()
}
}
req, err := http.NewRequestWithContext(forwardCtx, "POST", agentURL, bytes.NewReader(body))
if err != nil {
if cancel != nil {
cancel()
}
cancel()
// Wrap the construction failure so the caller can distinguish it
// from an upstream Do() error and produce the correct 500 response.
return nil, nil, &proxyDispatchBuildError{err: err}
@ -515,3 +606,52 @@ func (h *WorkspaceHandler) dispatchA2A(ctx context.Context, agentURL string, bod
resp, doErr := a2aClient.Do(req)
return resp, cancel, doErr
}
// applyIdleTimeout returns a child ctx that gets cancelled when no
// broadcaster events for `workspaceID` arrive for `idle` duration.
// Any incoming event resets the clock. The returned cancel func
// MUST be called to clean up the goroutine + subscription.
//
// nil broadcaster or non-positive idle returns the parent ctx
// unchanged (and a no-op cancel) so test paths that don't wire a
// broadcaster keep working.
func applyIdleTimeout(parent context.Context, b *events.Broadcaster, workspaceID string, idle time.Duration) (context.Context, context.CancelFunc) {
if b == nil || idle <= 0 || workspaceID == "" {
return parent, func() {}
}
ctx, cancel := context.WithCancel(parent)
sub, unsub := b.SubscribeSSE(workspaceID)
go func() {
defer unsub()
timer := time.NewTimer(idle)
defer timer.Stop()
for {
select {
case <-ctx.Done():
return
case _, ok := <-sub:
if !ok {
// Subscription channel closed — fall back to
// pure-timer mode. Don't cancel: another caller
// may have closed our sub but the request itself
// is still in flight. Let the timer or the
// caller's defer drive cleanup.
continue
}
// Stop+drain pattern so a fired-but-unread timer
// doesn't double-cancel after the Reset.
if !timer.Stop() {
select {
case <-timer.C:
default:
}
}
timer.Reset(idle)
case <-timer.C:
cancel()
return
}
}
}()
return ctx, cancel
}

View File

@ -5,6 +5,7 @@ import (
"context"
"database/sql"
"encoding/json"
"errors"
"fmt"
"io"
"net/http"
@ -600,9 +601,21 @@ func TestIsUpstreamBusyError(t *testing.T) {
}{
{"nil", nil, false},
{"context.DeadlineExceeded", context.DeadlineExceeded, true},
// applyIdleTimeout cancels its child ctx via context.WithCancel
// when the broadcaster silence window elapses — surfaces here
// as context.Canceled. Same "upstream busy" classification.
{"context.Canceled", context.Canceled, true},
{"wrapped context.Canceled", fmt.Errorf("dispatch wrapped: %w", context.Canceled), true},
{"io.EOF", io.EOF, true},
{"io.ErrUnexpectedEOF", io.ErrUnexpectedEOF, true},
{"wrapped context deadline string", fmt.Errorf(`Post "http://ws-foo:8000": context deadline exceeded`), true},
// Real net/http wraps context.DeadlineExceeded via *url.Error.Unwrap,
// so errors.Is(err, context.DeadlineExceeded) catches it. The
// pre-892de784 substring "context deadline exceeded" fallback
// also accepted a string-only error like
// `fmt.Errorf("Post: context deadline exceeded")`; that fallback
// was dropped because errors.Is handles the real shape and the
// substring was indistinguishable from a user-content match.
{"wrapped context deadline (errors.Is path)", fmt.Errorf("Post: %w", context.DeadlineExceeded), true},
{"wrapped EOF string", fmt.Errorf(`Post "http://ws-foo:8000": EOF`), true},
{"connection reset", fmt.Errorf("read tcp 127.0.0.1:8080->127.0.0.1:12345: connection reset by peer"), true},
{"generic dns error", fmt.Errorf("no such host"), false},
@ -1074,7 +1087,7 @@ func TestDispatchA2A_BuildRequestError(t *testing.T) {
handler := NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir())
// Malformed URL causes http.NewRequestWithContext to fail.
_, cancel, err := handler.dispatchA2A(context.Background(), "http://%%badhost", []byte("{}"), "")
_, cancel, err := handler.dispatchA2A(context.Background(), "ws-target", "http://%%badhost", []byte("{}"), "")
if cancel != nil {
cancel()
}
@ -1097,13 +1110,13 @@ func TestDispatchA2A_CanvasTimeout(t *testing.T) {
}))
defer srv.Close()
resp, cancel, err := handler.dispatchA2A(context.Background(), srv.URL, []byte(`{}`), "")
resp, cancel, err := handler.dispatchA2A(context.Background(), "ws-target", srv.URL, []byte(`{}`), "")
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
defer resp.Body.Close()
if cancel == nil {
t.Fatal("canvas caller (empty callerID) must set a timeout + return cancel")
t.Fatal("canvas caller must return a cancel func (idle-timeout cleanup)")
}
cancel() // restore
}
@ -1118,20 +1131,23 @@ func TestDispatchA2A_AgentTimeout(t *testing.T) {
}))
defer srv.Close()
resp, cancel, err := handler.dispatchA2A(context.Background(), srv.URL, []byte(`{}`), "ws-caller")
resp, cancel, err := handler.dispatchA2A(context.Background(), "ws-target", srv.URL, []byte(`{}`), "ws-caller")
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
defer resp.Body.Close()
if cancel == nil {
t.Fatal("agent-to-agent caller must set a timeout + return cancel")
t.Fatal("agent-to-agent caller must return a cancel func (idle + ceiling cleanup)")
}
cancel()
}
func TestDispatchA2A_ContextDeadline_NoCancelAdded(t *testing.T) {
// When ctx already has a deadline, dispatchA2A must NOT layer its own
// timeout (cancel should be nil).
func TestDispatchA2A_ContextDeadline_NoExtraCeiling(t *testing.T) {
// When ctx already has a deadline, dispatchA2A must not layer
// its own absolute ceiling on top — the caller's deadline wins.
// The idle-timer cleanup still produces a non-nil cancel func
// (introduced by the always-on idle timeout) but the cancel func
// is safe to call repeatedly and from a deferred path.
setupTestDB(t)
setupTestRedis(t)
handler := NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir())
@ -1144,17 +1160,95 @@ func TestDispatchA2A_ContextDeadline_NoCancelAdded(t *testing.T) {
ctx, ctxCancel := context.WithTimeout(context.Background(), 5*time.Second)
defer ctxCancel()
resp, cancel, err := handler.dispatchA2A(ctx, srv.URL, []byte(`{}`), "")
resp, cancel, err := handler.dispatchA2A(ctx, "ws-target", srv.URL, []byte(`{}`), "")
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
defer resp.Body.Close()
if cancel != nil {
t.Error("cancel should be nil when ctx already has a deadline")
cancel()
if cancel == nil {
t.Error("cancel must be non-nil (idle-timer cleanup)")
}
}
// --- applyIdleTimeout ---
// TestApplyIdleTimeout_FiresOnSilence verifies the helper cancels its
// child ctx when no broadcaster events arrive for `idle` duration.
// Uses a short idle window (60ms) so the test runs fast.
func TestApplyIdleTimeout_FiresOnSilence(t *testing.T) {
setupTestDB(t)
setupTestRedis(t)
b := newTestBroadcaster()
parent, parentCancel := context.WithTimeout(context.Background(), 5*time.Second)
defer parentCancel()
idleCtx, idleCancel := applyIdleTimeout(parent, b, "ws-silent", 60*time.Millisecond)
defer idleCancel()
select {
case <-idleCtx.Done():
// expected — no events ever arrived for ws-silent
case <-time.After(2 * time.Second):
t.Fatal("idleCtx never cancelled despite no events")
}
if !errors.Is(idleCtx.Err(), context.Canceled) {
t.Errorf("idleCtx err = %v, want context.Canceled", idleCtx.Err())
}
}
// TestApplyIdleTimeout_ResetsOnEvent verifies that a broadcaster event
// for the workspace resets the timer. Sends one event mid-window and
// confirms ctx is still alive after the original deadline would have
// fired, but cancelled after a second silence window elapses.
func TestApplyIdleTimeout_ResetsOnEvent(t *testing.T) {
setupTestDB(t)
setupTestRedis(t)
b := newTestBroadcaster()
parent, parentCancel := context.WithTimeout(context.Background(), 5*time.Second)
defer parentCancel()
idle := 80 * time.Millisecond
idleCtx, idleCancel := applyIdleTimeout(parent, b, "ws-active", idle)
defer idleCancel()
// Send a progress event halfway through the window — should
// extend the deadline by another `idle`.
time.Sleep(idle / 2)
b.BroadcastOnly("ws-active", "ACTIVITY_LOGGED", map[string]interface{}{"activity_type": "agent_log"})
// At t = idle (original deadline), ctx must still be alive
// because the event reset the clock.
select {
case <-idleCtx.Done():
t.Fatal("idleCtx cancelled despite mid-window event resetting the timer")
case <-time.After(idle - (idle / 2) + 10*time.Millisecond):
// ok — past the original deadline, still alive
}
// Now wait for the second silence window to actually fire.
select {
case <-idleCtx.Done():
// expected
case <-time.After(idle + 200*time.Millisecond):
t.Fatal("idleCtx never cancelled after the second silence window")
}
}
// TestApplyIdleTimeout_NilBroadcasterDegradesGracefully — nil
// broadcaster (some test paths) returns the parent ctx unchanged.
func TestApplyIdleTimeout_NilBroadcasterDegradesGracefully(t *testing.T) {
parent := context.Background()
idleCtx, cancel := applyIdleTimeout(parent, nil, "ws-x", 50*time.Millisecond)
defer cancel()
if idleCtx != parent {
t.Error("nil broadcaster must return the parent ctx unchanged")
}
// And calling cancel must be safe.
cancel()
}
// TestDispatchA2A_RejectsUnsafeURL is the #1483 defense-in-depth
// regression. setupTestDB disables SSRF for normal tests so existing
// dispatchA2A unit tests can hit httptest.NewServer (loopback) — we
@ -1162,6 +1256,10 @@ func TestDispatchA2A_ContextDeadline_NoCancelAdded(t *testing.T) {
// Production callers go through resolveAgentURL which already
// validates; this test pins that dispatchA2A is now safe even when
// called directly by a future caller that skips resolveAgentURL.
//
// Note: dispatchA2A's signature includes workspaceID (added by the
// idle-timeout work) so this test passes a stub value — the SSRF check
// fires before workspaceID is referenced.
func TestDispatchA2A_RejectsUnsafeURL(t *testing.T) {
setupTestDB(t)
setupTestRedis(t)
@ -1172,6 +1270,7 @@ func TestDispatchA2A_RejectsUnsafeURL(t *testing.T) {
// Cloud metadata IP — must be rejected before any HTTP call goes out.
_, cancel, err := handler.dispatchA2A(
context.Background(),
"ws-target",
"http://169.254.169.254/latest/meta-data/",
[]byte(`{}`),
"",
@ -1188,6 +1287,7 @@ func TestDispatchA2A_RejectsUnsafeURL(t *testing.T) {
}
}
// --- handleA2ADispatchError ---
func TestHandleA2ADispatchError_ContextDeadline(t *testing.T) {

View File

@ -21,54 +21,53 @@ import (
"github.com/Molecule-AI/molecule-monorepo/platform/internal/provisioner"
)
// AdminWorkspaceImagesHandler serves POST /admin/workspace-images/refresh — the
// production-side end of the runtime CD chain. Operators (or post-publish
// automation) hit this to (1) pull the latest workspace template images from
// GHCR via the Docker SDK and (2) recreate any running ws-* containers so
// they adopt the new image. Without this, a freshly-published runtime sits
// in the registry but containers keep running the old image until the next
// manual restart.
// WorkspaceImageService is the production-side end of the runtime CD chain.
// It (1) pulls workspace template images from GHCR via the Docker SDK and
// (2) recreates running ws-* containers so they adopt the new image.
//
// On a SaaS deployment the deploy pipeline already pulls on every release,
// so the pull step is a no-op there; the recreate step is still the way to
// make running workspaces adopt the new image without a full host restart.
//
// POST /admin/workspace-images/refresh
//
// ?runtime=claude-code (optional; default = all 8 templates)
// &recreate=true|false (default true; false = pull only)
//
// Returns JSON {pulled: [...], failed: [...], recreated: [...]}
type AdminWorkspaceImagesHandler struct {
// Two callers:
// - AdminWorkspaceImagesHandler — POST /admin/workspace-images/refresh, the
// manual end-of-chain trigger documented in
// docs/workspace-runtime-package.md.
// - imagewatch.Watcher — the auto-refresh goroutine that polls GHCR
// digests and invokes Refresh when an image changes upstream. This is
// what closes the chain to "merge → containers running new code" with
// no human in between.
type WorkspaceImageService struct {
docker *dockerclient.Client
}
func NewAdminWorkspaceImagesHandler(docker *dockerclient.Client) *AdminWorkspaceImagesHandler {
return &AdminWorkspaceImagesHandler{docker: docker}
func NewWorkspaceImageService(docker *dockerclient.Client) *WorkspaceImageService {
return &WorkspaceImageService{docker: docker}
}
// allRuntimes is the canonical list mirroring docs/workspace-runtime-package.md.
// AllRuntimes is the canonical list mirroring docs/workspace-runtime-package.md.
// Update both when a new template is added.
var allRuntimes = []string{
var AllRuntimes = []string{
"claude-code", "langgraph", "crewai", "autogen",
"deepagents", "hermes", "gemini-cli", "openclaw",
}
type refreshResult struct {
// RefreshResult is the per-call outcome surfaced to HTTP callers AND logged
// by the auto-refresh watcher.
type RefreshResult struct {
Pulled []string `json:"pulled"`
Failed []string `json:"failed"`
Recreated []string `json:"recreated"`
}
// TemplateImageRef returns the canonical GHCR ref for a runtime's template
// image. Single source of truth shared with imagewatch.
func TemplateImageRef(runtime string) string {
return fmt.Sprintf("ghcr.io/molecule-ai/workspace-template-%s:latest", runtime)
}
// ghcrAuthHeader returns the base64-encoded JSON auth payload Docker's
// ImagePull expects in PullOptions.RegistryAuth, or empty string when no
// GHCR_USER/GHCR_TOKEN env is set (lets public images pull through).
//
// The Docker SDK doesn't read ~/.docker/config.json — every authenticated
// pull needs an explicit RegistryAuth string. Format per the Docker
// engine API: {"username":"…","password":"…","serveraddress":"ghcr.io"}
// → base64-encoded JSON with no trailing padding stripped (engine handles
// either form).
// pull needs an explicit RegistryAuth string.
func ghcrAuthHeader() string {
user := strings.TrimSpace(os.Getenv("GHCR_USER"))
token := strings.TrimSpace(os.Getenv("GHCR_TOKEN"))
@ -82,63 +81,40 @@ func ghcrAuthHeader() string {
}
js, err := json.Marshal(payload)
if err != nil {
// Should be unreachable for a static map[string]string. Log so a
// future contributor adding a non-marshallable field notices.
log.Printf("workspace-images: failed to marshal GHCR auth: %v", err)
return ""
}
return base64.URLEncoding.EncodeToString(js)
}
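// For reference, the payload ghcrAuthHeader encodes looks like this
// (credentials illustrative):
//
//	{"username":"octocat","password":"ghp_...","serveraddress":"ghcr.io"}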
func (h *AdminWorkspaceImagesHandler) Refresh(c *gin.Context) {
runtimes := allRuntimes
if r := c.Query("runtime"); r != "" {
// Accept a single runtime; reject anything not in the canonical list
// so a typo doesn't silently no-op.
found := false
for _, known := range allRuntimes {
if known == r {
found = true
break
}
}
if !found {
c.JSON(http.StatusBadRequest, gin.H{
"error": fmt.Sprintf("unknown runtime: %s", r),
"known_runtimes": allRuntimes,
})
return
}
runtimes = []string{r}
}
recreate := c.DefaultQuery("recreate", "true") == "true"
res := refreshResult{Pulled: []string{}, Failed: []string{}, Recreated: []string{}}
// Refresh pulls the requested runtimes' template images from GHCR and (if
// recreate) force-removes any matching ws-* containers so the platform
// re-provisions them on next interaction.
//
// Soft-fails per runtime: one missing image (e.g. unpublished template)
// doesn't abort the others. Per-runtime failures are in RefreshResult.Failed.
// Returns a non-nil error only when the recreate phase couldn't enumerate
// containers at all (caller should surface that as 500).
func (s *WorkspaceImageService) Refresh(ctx context.Context, runtimes []string, recreate bool) (RefreshResult, error) {
res := RefreshResult{Pulled: []string{}, Failed: []string{}, Recreated: []string{}}
auth := ghcrAuthHeader()
// 1. Pull each template image via the Docker SDK. Soft-fail per-runtime
// so one missing image (e.g. unpublished template) doesn't abort
// the others. Each pull's progress stream is drained to completion
// — the engine treats early-close as "abandon", leaving partial
// layers around with no reference.
pullCtx, cancel := context.WithTimeout(c.Request.Context(), 5*time.Minute)
pullCtx, cancel := context.WithTimeout(ctx, 5*time.Minute)
defer cancel()
for _, rt := range runtimes {
image := fmt.Sprintf("ghcr.io/molecule-ai/workspace-template-%s:latest", rt)
image := TemplateImageRef(rt)
opts := dockerimage.PullOptions{Platform: provisioner.DefaultImagePlatform()}
if auth != "" {
opts.RegistryAuth = auth
}
rc, err := h.docker.ImagePull(pullCtx, image, opts)
rc, err := s.docker.ImagePull(pullCtx, image, opts)
if err != nil {
log.Printf("workspace-images/refresh: pull %s failed: %v", rt, err)
res.Failed = append(res.Failed, rt)
continue
}
// Drain to completion. We discard progress payload because no
// caller renders it; the platform log already records pulled/failed
// per runtime. If a future caller wants live progress, decode the
// JSON-line stream into events here.
// Drain to completion. The engine treats early-close as "abandon",
// leaving partial layers around with no reference.
if _, err := io.Copy(io.Discard, rc); err != nil {
rc.Close()
log.Printf("workspace-images/refresh: drain %s failed: %v", rt, err)
@ -150,23 +126,18 @@ func (h *AdminWorkspaceImagesHandler) Refresh(c *gin.Context) {
}
if !recreate {
c.JSON(http.StatusOK, res)
return
return res, nil
}
// 2. Find ws-* containers running an image we just pulled. Recreate
// them — kill+remove and let the platform's normal provisioning
// flow re-create on next canvas interaction.
listCtx, listCancel := context.WithTimeout(c.Request.Context(), 30*time.Second)
listCtx, listCancel := context.WithTimeout(ctx, 30*time.Second)
defer listCancel()
containers, err := h.docker.ContainerList(listCtx, container.ListOptions{
containers, err := s.docker.ContainerList(listCtx, container.ListOptions{
All: true,
Filters: filters.NewArgs(filters.Arg("name", "ws-")),
})
if err != nil {
log.Printf("workspace-images/refresh: container list failed: %v", err)
c.JSON(http.StatusInternalServerError, gin.H{"error": "container list failed", "partial_result": res})
return
return res, fmt.Errorf("container list: %w", err)
}
pulledSet := map[string]struct{}{}
@ -175,14 +146,10 @@ func (h *AdminWorkspaceImagesHandler) Refresh(c *gin.Context) {
}
for _, ctr := range containers {
// ContainerList's ctr.Image is the *resolved digest* (sha256:…),
// not the human-readable tag. Use ContainerInspect to get the
// original Config.Image (e.g. "ghcr.io/molecule-ai/workspace-
// template-claude-code:latest") so we can match against the
// pulled-runtime set. The cost is one extra round-trip per
// ws-* container — there are at most 8 typically, so this is
// well below any UX threshold.
inspectCtx, inspectCancel := context.WithTimeout(c.Request.Context(), 10*time.Second)
full, err := h.docker.ContainerInspect(inspectCtx, ctr.ID)
// not the human-readable tag. Inspect to get Config.Image so we
// can match against the pulled-runtime set.
inspectCtx, inspectCancel := context.WithTimeout(ctx, 10*time.Second)
full, err := s.docker.ContainerInspect(inspectCtx, ctr.ID)
inspectCancel()
if err != nil {
log.Printf("workspace-images/refresh: inspect %s failed: %v", ctr.ID[:12], err)
@ -203,12 +170,8 @@ func (h *AdminWorkspaceImagesHandler) Refresh(c *gin.Context) {
continue
}
name := strings.TrimPrefix(ctr.Names[0], "/")
rmCtx, rmCancel := context.WithTimeout(ctx, 30*time.Second)
err = s.docker.ContainerRemove(rmCtx, ctr.ID, container.RemoveOptions{Force: true})
rmCancel()
if err != nil {
log.Printf("workspace-images/refresh: remove %s failed: %v", name, err)
@ -216,12 +179,60 @@ func (h *AdminWorkspaceImagesHandler) Refresh(c *gin.Context) {
}
res.Recreated = append(res.Recreated, name)
}
return res, nil
}
// AdminWorkspaceImagesHandler serves POST /admin/workspace-images/refresh.
//
// ?runtime=claude-code (optional; default = all 8 templates)
// &recreate=true|false (default true; false = pull only)
//
// Returns JSON {pulled: [...], failed: [...], recreated: [...]}
type AdminWorkspaceImagesHandler struct {
svc *WorkspaceImageService
}
func NewAdminWorkspaceImagesHandler(docker *dockerclient.Client) *AdminWorkspaceImagesHandler {
return &AdminWorkspaceImagesHandler{svc: NewWorkspaceImageService(docker)}
}
// Service exposes the underlying refresh logic so the auto-refresh watcher
// in cmd/server can share the exact code path the HTTP handler uses.
func (h *AdminWorkspaceImagesHandler) Service() *WorkspaceImageService {
return h.svc
}
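// An illustrative wiring sketch for that watcher — the ticker cadence,
// variable names, and logging below are assumptions for illustration,
// not the actual cmd/server code:
//
//	svc := NewAdminWorkspaceImagesHandler(docker).Service()
//	go func() {
//		for range time.Tick(5 * time.Minute) {
//			if res, err := svc.Refresh(context.Background(), AllRuntimes, true); err != nil {
//				log.Printf("image auto-refresh: %v (partial result: %+v)", err, res)
//			}
//		}
//	}()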
func (h *AdminWorkspaceImagesHandler) Refresh(c *gin.Context) {
runtimes := AllRuntimes
if r := c.Query("runtime"); r != "" {
found := false
for _, known := range AllRuntimes {
if known == r {
found = true
break
}
}
if !found {
c.JSON(http.StatusBadRequest, gin.H{
"error": fmt.Sprintf("unknown runtime: %s", r),
"known_runtimes": AllRuntimes,
})
return
}
runtimes = []string{r}
}
recreate := c.DefaultQuery("recreate", "true") == "true"
res, err := h.svc.Refresh(c.Request.Context(), runtimes, recreate)
authStatus := "no GHCR auth (public images only)"
if auth != "" {
if ghcrAuthHeader() != "" {
authStatus = "GHCR_USER/GHCR_TOKEN auth"
}
log.Printf("workspace-images/refresh: pulled=%d failed=%d recreated=%d (%s)",
len(res.Pulled), len(res.Failed), len(res.Recreated), authStatus)
if err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error(), "partial_result": res})
return
}
c.JSON(http.StatusOK, res)
}
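// Example invocation (illustrative — host, port, and any auth header
// depend on the deployment; the route, params, and response shape are
// the contract documented above):
//
//	curl -X POST "http://localhost:8080/admin/workspace-images/refresh?runtime=claude-code&recreate=false"
//	=> {"pulled":["claude-code"],"failed":[],"recreated":[]}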

View File

@ -0,0 +1,415 @@
package handlers
// chat_files.go — file upload/download for workspace chat.
//
// Split from templates.go because these endpoints have a different
// security model (no /configs write, no template fallback) and a
// different wire format (multipart in, binary-stream out). Template
// files are agent workspace configuration; chat files are user-agent
// conversation payloads.
import (
"archive/tar"
"bytes"
"context"
"crypto/rand"
"encoding/hex"
"fmt"
"io"
"log"
"mime"
"mime/multipart"
"net/http"
"path/filepath"
"regexp"
"strings"
"github.com/docker/docker/api/types/container"
"github.com/gin-gonic/gin"
)
// ChatFilesHandler serves file upload + download for chat. It
// composes the existing TemplatesHandler's Docker plumbing
// (findContainer, execInContainer, copyFilesToContainer) rather than
// duplicating them, so a bug fix in the Docker layer propagates to
// both endpoints.
type ChatFilesHandler struct {
templates *TemplatesHandler
}
func NewChatFilesHandler(t *TemplatesHandler) *ChatFilesHandler {
return &ChatFilesHandler{templates: t}
}
// chatUploadMaxBytes caps the full multipart request body so a
// malicious / runaway client can't OOM the server. 50 MB covers most
// documents + a handful of images per message; larger artefacts
// should go through git/S3 rather than chat.
const chatUploadMaxBytes = 50 * 1024 * 1024
// chatUploadMaxFileBytes caps individual files in a multi-file upload.
// Keeping the per-file cap below the total lets a user send, say, a
// 5 MB PDF + 10 screenshots without tripping the batch limit on any
// single attachment.
const chatUploadMaxFileBytes = 25 * 1024 * 1024
// chatUploadDir is the in-container path where user-uploaded chat
// attachments land. Under /workspace so the file persists with the
// workspace volume and is readable by the agent without any extra
// plumbing — the agent just reads from the URI path we return.
const chatUploadDir = "/workspace/.molecule/chat-uploads"
// unsafeFilenameChars matches anything outside the conservative
// {alnum, dot, underscore, dash} set. Filenames get rewritten
// character-class at a time, so embedded paths, control chars,
// newlines, quotes, and shell metachars never reach the filesystem.
var unsafeFilenameChars = regexp.MustCompile(`[^a-zA-Z0-9._\-]`)
// contentDispositionAttachment produces a safe `attachment; filename=...`
// header. Quotes, CR, and LF in the filename are escaped per RFC 6266 /
// RFC 5987: control chars dropped, backslash and double-quote
// backslash-escaped inside the quoted-string. Also emits the
// percent-encoded filename* parameter so non-ASCII names survive.
// This matters because agents can write arbitrary filenames into
// /workspace, and anything they produce reaches this header via
// `filepath.Base(path)` — not all agents sanitize on their side.
func contentDispositionAttachment(name string) string {
safeQ := make([]rune, 0, len(name))
for _, r := range name {
switch {
case r == '\r' || r == '\n':
// Drop — any CR/LF would terminate the header early.
continue
case r == '"' || r == '\\':
// Escape per RFC 6266 §4.1 quoted-string.
safeQ = append(safeQ, '\\', r)
case r < 0x20 || r == 0x7f:
// Drop other control chars.
continue
default:
safeQ = append(safeQ, r)
}
}
asciiSafe := string(safeQ)
// filename= — double-quoted, escaped. Gives legacy clients a value.
// filename*= — RFC 5987 percent-encoded UTF-8, preferred when present.
return fmt.Sprintf(`attachment; filename="%s"; filename*=UTF-8''%s`,
asciiSafe, urlPathEscape(name))
}
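// For example, applying the rules above (quotes escaped, CR/LF dropped,
// filename* percent-encoded):
//
//	contentDispositionAttachment("résumé.pdf")
//	// => attachment; filename="résumé.pdf"; filename*=UTF-8''r%C3%A9sum%C3%A9.pdf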
// urlPathEscape percent-encodes every byte outside the RFC 3986
// unreserved set — stricter than net/url.PathEscape (which leaves
// "/" unescaped because it's legal in URL paths). Filenames must
// never contain "/" anyway, so escaping it is defence-in-depth
// against an agent that writes a path-like name.
func urlPathEscape(s string) string {
const unreserved = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-._~"
var b strings.Builder
for _, c := range []byte(s) {
if strings.IndexByte(unreserved, c) >= 0 {
b.WriteByte(c)
} else {
fmt.Fprintf(&b, "%%%02X", c)
}
}
return b.String()
}
func sanitizeFilename(in string) string {
base := filepath.Base(in)
base = strings.ReplaceAll(base, " ", "_")
base = unsafeFilenameChars.ReplaceAllString(base, "_")
if len(base) > 100 {
ext := filepath.Ext(base)
if len(ext) > 16 {
ext = ""
}
base = base[:100-len(ext)] + ext
}
if base == "" || base == "." || base == ".." {
return "file"
}
return base
}
// ChatUploadedFile is the per-file response returned from POST
// /workspaces/:id/chat/uploads. Clients include this payload (or a
// trimmed subset) in their outgoing A2A `message/send` parts.
type ChatUploadedFile struct {
// URI uses a custom "workspace:" scheme so clients can resolve it
// against the streaming Download endpoint regardless of where the
// canvas itself is hosted. The path component is always absolute
// within the workspace container.
URI string `json:"uri"`
Name string `json:"name"`
MimeType string `json:"mimeType,omitempty"`
Size int64 `json:"size"`
}
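// A response entry therefore looks like (illustrative values; the hex
// prefix is the random 16-byte upload ID described in Upload):
//
//	{
//	  "uri": "workspace:/workspace/.molecule/chat-uploads/<32-hex-chars>-report.pdf",
//	  "name": "report.pdf",
//	  "mimeType": "application/pdf",
//	  "size": 48213
//	}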
// Upload handles POST /workspaces/:id/chat/uploads.
// Accepts multipart/form-data with one or more `files` fields, stages
// each under /workspace/.molecule/chat-uploads with a UUID prefix,
// and returns the list of URIs for the caller to attach to an A2A
// message.
func (h *ChatFilesHandler) Upload(c *gin.Context) {
workspaceID := c.Param("id")
if err := validateWorkspaceID(workspaceID); err != nil {
c.JSON(http.StatusBadRequest, gin.H{"error": "invalid workspace ID"})
return
}
// Hard cap the request body BEFORE ParseMultipartForm — otherwise
// a client could chunk-upload past the cap before Go notices.
c.Request.Body = http.MaxBytesReader(c.Writer, c.Request.Body, chatUploadMaxBytes)
if err := c.Request.ParseMultipartForm(chatUploadMaxBytes); err != nil {
c.JSON(http.StatusBadRequest, gin.H{"error": "failed to parse multipart form"})
return
}
form := c.Request.MultipartForm
var headers []*multipart.FileHeader
if form != nil && form.File != nil {
headers = form.File["files"]
}
if len(headers) == 0 {
c.JSON(http.StatusBadRequest, gin.H{"error": "expected at least one 'files' field"})
return
}
ctx := c.Request.Context()
containerName := h.templates.findContainer(ctx, workspaceID)
if containerName == "" {
c.JSON(http.StatusServiceUnavailable, gin.H{"error": "workspace container not running"})
return
}
// Build the archive in memory. File bytes survive Go's
// string<->[]byte round-trip intact (the tar helper takes
// map[string]string, but the conversion is a byte-exact copy, not
// a UTF-8 reinterpretation).
archive := map[string]string{}
uploaded := make([]ChatUploadedFile, 0, len(headers))
for _, fh := range headers {
if fh.Size > chatUploadMaxFileBytes {
c.JSON(http.StatusRequestEntityTooLarge, gin.H{
"error": fmt.Sprintf("%s exceeds per-file limit (%d MB)", fh.Filename, chatUploadMaxFileBytes/(1024*1024)),
})
return
}
f, err := fh.Open()
if err != nil {
c.JSON(http.StatusBadRequest, gin.H{"error": "failed to read upload"})
return
}
// LimitReader guards against a Size header that understates the payload:
// if the multipart stream carries more bytes than declared, we
// stop at the cap instead of growing the buffer.
data, err := io.ReadAll(io.LimitReader(f, chatUploadMaxFileBytes+1))
f.Close()
if err != nil {
c.JSON(http.StatusBadRequest, gin.H{"error": "failed to read upload"})
return
}
if int64(len(data)) > chatUploadMaxFileBytes {
c.JSON(http.StatusRequestEntityTooLarge, gin.H{
"error": fmt.Sprintf("%s exceeds per-file limit (%d MB)", fh.Filename, chatUploadMaxFileBytes/(1024*1024)),
})
return
}
name := sanitizeFilename(fh.Filename)
// 16-byte (UUID-equivalent) random prefix. Within a single
// batch we also check for collisions — the birthday-collision
// odds on 128 bits are astronomically small, but a bad PRNG or
// a re-used draw would silently overwrite a sibling upload with
// its own content and return two URIs pointing at one file.
var stored string
for attempt := 0; attempt < 4; attempt++ {
idBytes := make([]byte, 16)
if _, err := rand.Read(idBytes); err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to allocate upload ID"})
return
}
candidate := hex.EncodeToString(idBytes) + "-" + name
if _, taken := archive[candidate]; !taken {
stored = candidate
break
}
}
if stored == "" {
c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to allocate unique upload ID"})
return
}
archive[stored] = string(data)
mt := fh.Header.Get("Content-Type")
if mt == "" {
mt = mime.TypeByExtension(filepath.Ext(name))
}
uploaded = append(uploaded, ChatUploadedFile{
URI: "workspace:" + chatUploadDir + "/" + stored,
Name: name,
MimeType: mt,
Size: int64(len(data)),
})
}
// mkdir -p is idempotent; we fire it every upload instead of
// caching state here so container restarts don't surprise us.
_, _ = h.templates.execInContainer(ctx, containerName, []string{"mkdir", "-p", chatUploadDir})
// Defence in depth: pre-remove each target path before extracting
// the tar. An agent with write access to /workspace could in
// theory race-create a symlink at <chatUploadDir>/<stored-name>
// pointing at a sensitive in-container path (its own /etc/*,
// mounted secrets). Docker's tar extraction on some drivers
// follows pre-existing symlinks at the destination. `rm -f` the
// exact stored-name closes that window — the UUID prefix on the
// name makes a successful race effectively impossible, but this
// guard costs nothing and documents the intent.
rmArgs := []string{"rm", "-f", "--"}
for stored := range archive {
rmArgs = append(rmArgs, chatUploadDir+"/"+stored)
}
_, _ = h.templates.execInContainer(ctx, containerName, rmArgs)
if err := h.copyFlatToContainer(ctx, containerName, chatUploadDir, archive); err != nil {
log.Printf("Chat upload copy failed for %s: %v", workspaceID, err)
c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to stage files in workspace"})
return
}
c.JSON(http.StatusOK, gin.H{"files": uploaded})
}
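// Example upload (illustrative host/port; the repeated `files` field
// and the route are the real contract):
//
//	curl -F "files=@report.pdf" -F "files=@screenshot.png" \
//	  "http://localhost:8080/workspaces/<workspace-id>/chat/uploads"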
// copyFlatToContainer extracts one tar of flat files into destPath
// inside the container. Unlike the shared copyFilesToContainer helper
// (which prepends destPath into tar entry names — correct for its
// callers, whose files live at relative paths inside a nested tree), this
// helper writes tar entries with ONLY the flat filename so Docker's
// extraction at destPath lands them directly in destPath, not at
// destPath/destPath/... as the shared helper would.
// Filenames are validated to contain no path separator so nothing
// can escape destPath via an embedded "../" or a leading "/".
func (h *ChatFilesHandler) copyFlatToContainer(ctx context.Context, containerName, destPath string, files map[string]string) error {
if h.templates.docker == nil {
return fmt.Errorf("docker not available")
}
var buf bytes.Buffer
tw := tar.NewWriter(&buf)
for name, content := range files {
if strings.ContainsAny(name, "/\\") || name == ".." || name == "." || name == "" {
return fmt.Errorf("unsafe flat filename: %q", name)
}
data := []byte(content)
if err := tw.WriteHeader(&tar.Header{
Name: name, // relative — Docker resolves against destPath
Mode: 0644,
Size: int64(len(data)),
Typeflag: tar.TypeReg,
}); err != nil {
return fmt.Errorf("tar header %q: %w", name, err)
}
if _, err := tw.Write(data); err != nil {
return fmt.Errorf("tar write %q: %w", name, err)
}
}
if err := tw.Close(); err != nil {
return fmt.Errorf("tar close: %w", err)
}
return h.templates.docker.CopyToContainer(ctx, containerName, destPath, &buf, container.CopyToContainerOptions{})
}
// Download handles GET /workspaces/:id/chat/download?path=<abs path>.
// Streams the file bytes from the container with a correct
// Content-Type and attachment Content-Disposition. Binary-safe —
// unlike the existing JSON ReadFile endpoint which carries content
// as a string (lossy for non-UTF-8 bytes).
func (h *ChatFilesHandler) Download(c *gin.Context) {
workspaceID := c.Param("id")
if err := validateWorkspaceID(workspaceID); err != nil {
c.JSON(http.StatusBadRequest, gin.H{"error": "invalid workspace ID"})
return
}
path := c.Query("path")
if path == "" {
c.JSON(http.StatusBadRequest, gin.H{"error": "path query required"})
return
}
if !filepath.IsAbs(path) {
c.JSON(http.StatusBadRequest, gin.H{"error": "path must be absolute"})
return
}
// Path must land under one of the allowed roots — mirrors the
// ReadFile security model and prevents arbitrary reads of /etc
// or other system paths via this endpoint.
rooted := false
for root := range allowedRoots {
if path == root || strings.HasPrefix(path, root+"/") {
rooted = true
break
}
}
if !rooted {
c.JSON(http.StatusBadRequest, gin.H{"error": "path must be under /configs, /workspace, /home, or /plugins"})
return
}
// Reject anything that canonicalises differently or contains a
// traversal segment. Defence-in-depth on top of the prefix check.
if filepath.Clean(path) != path || strings.Contains(path, "..") {
c.JSON(http.StatusBadRequest, gin.H{"error": "invalid path"})
return
}
ctx := c.Request.Context()
if h.templates.docker == nil {
c.JSON(http.StatusServiceUnavailable, gin.H{"error": "docker unavailable"})
return
}
containerName := h.templates.findContainer(ctx, workspaceID)
if containerName == "" {
c.JSON(http.StatusServiceUnavailable, gin.H{"error": "workspace container not running"})
return
}
// docker cp returns a tar stream containing the requested path.
// For a regular file that's a single tar entry; we extract and
// stream the body through.
reader, _, err := h.templates.docker.CopyFromContainer(ctx, containerName, path)
if err != nil {
c.JSON(http.StatusNotFound, gin.H{"error": "file not found"})
return
}
defer reader.Close()
tr := tar.NewReader(reader)
hdr, err := tr.Next()
if err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to read archive"})
return
}
if hdr.Typeflag != tar.TypeReg {
c.JSON(http.StatusBadRequest, gin.H{"error": "path is not a regular file"})
return
}
name := filepath.Base(path)
mt := mime.TypeByExtension(filepath.Ext(name))
if mt == "" {
mt = "application/octet-stream"
}
c.Header("Content-Type", mt)
c.Header("Content-Length", fmt.Sprintf("%d", hdr.Size))
c.Header("Content-Disposition", contentDispositionAttachment(name))
c.Status(http.StatusOK)
// Stream exactly hdr.Size bytes. CopyN was chosen over LimitReader
// because it returns an error when the source is short — that
// surfaces a bug in the tar extraction path immediately instead
// of silently truncating. Agents can legitimately produce files
// larger than the 50 MB upload cap (that's a per-request inbound
// cap, not a per-artifact one), so we cannot clamp here.
if _, err := io.CopyN(c.Writer, tr, hdr.Size); err != nil {
log.Printf("Chat download stream error for %s (%s): %v", workspaceID, path, err)
}
}
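// Example download (illustrative host/port; -OJ makes curl honour the
// Content-Disposition filename set above):
//
//	curl -OJ "http://localhost:8080/workspaces/<workspace-id>/chat/download?path=/workspace/report.pdf"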

View File

@ -0,0 +1,194 @@
package handlers
// Unit tests for chat_files.go. The Docker-touching paths (Upload
// actually copying into a container, Download actually streaming tar)
// are exercised via integration tests — docker-in-docker is out of
// scope for the unit suite. These tests cover the validation + error
// surfaces that a caller can reach without a running container.
import (
"bytes"
"mime/multipart"
"net/http"
"net/http/httptest"
"strings"
"testing"
"github.com/gin-gonic/gin"
)
func TestSanitizeFilename(t *testing.T) {
cases := []struct {
in, want string
}{
{"report.pdf", "report.pdf"},
{"my file.pdf", "my_file.pdf"},
{"../../etc/passwd", "passwd"},
{"weird;$name`.txt", "weird__name_.txt"},
{"", "file"},
{".", "file"},
{"..", "file"},
}
for _, tc := range cases {
got := sanitizeFilename(tc.in)
if got != tc.want {
t.Errorf("sanitizeFilename(%q) = %q, want %q", tc.in, got, tc.want)
}
}
}
func TestSanitizeFilename_LongNamePreservesExtension(t *testing.T) {
// 120-char base + .pdf — the helper should truncate the base but
// keep the extension intact so content-type inference still works.
longBase := strings.Repeat("a", 120)
got := sanitizeFilename(longBase + ".pdf")
if len(got) > 100 {
t.Errorf("filename not truncated: len=%d", len(got))
}
if !strings.HasSuffix(got, ".pdf") {
t.Errorf("extension stripped: %q", got)
}
}
func TestChatUpload_InvalidWorkspaceID(t *testing.T) {
setupTestDB(t)
setupTestRedis(t)
tmplh := NewTemplatesHandler(t.TempDir(), nil)
h := NewChatFilesHandler(tmplh)
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
c.Params = gin.Params{{Key: "id", Value: "not-a-uuid"}}
c.Request = httptest.NewRequest("POST", "/workspaces/not-a-uuid/chat/uploads", nil)
h.Upload(c)
if w.Code != http.StatusBadRequest {
t.Errorf("expected 400 on invalid workspace id, got %d: %s", w.Code, w.Body.String())
}
}
func TestChatUpload_MissingFiles(t *testing.T) {
setupTestDB(t)
setupTestRedis(t)
tmplh := NewTemplatesHandler(t.TempDir(), nil)
h := NewChatFilesHandler(tmplh)
// Multipart body with no `files` field — only a text field.
var buf bytes.Buffer
mw := multipart.NewWriter(&buf)
_ = mw.WriteField("other", "value")
mw.Close()
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
c.Params = gin.Params{{Key: "id", Value: "00000000-0000-0000-0000-000000000001"}}
req := httptest.NewRequest("POST", "/workspaces/00000000-0000-0000-0000-000000000001/chat/uploads", &buf)
req.Header.Set("Content-Type", mw.FormDataContentType())
c.Request = req
h.Upload(c)
if w.Code != http.StatusBadRequest {
t.Errorf("expected 400 when files field missing, got %d: %s", w.Code, w.Body.String())
}
if !strings.Contains(w.Body.String(), "files") {
t.Errorf("expected error to mention files field: %s", w.Body.String())
}
}
func TestChatDownload_InvalidPath(t *testing.T) {
setupTestDB(t)
setupTestRedis(t)
tmplh := NewTemplatesHandler(t.TempDir(), nil)
h := NewChatFilesHandler(tmplh)
cases := []struct {
name, path, wantSubstr string
}{
{"empty", "", "path query required"},
{"relative", "workspace/foo.txt", "must be absolute"},
{"wrong root", "/etc/passwd", "must be under"},
{"traversal", "/workspace/../etc/passwd", "invalid path"},
}
for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
c.Params = gin.Params{{Key: "id", Value: "00000000-0000-0000-0000-000000000001"}}
req := httptest.NewRequest("GET", "/workspaces/xxx/chat/download?path="+tc.path, nil)
c.Request = req
h.Download(c)
if w.Code != http.StatusBadRequest {
t.Errorf("expected 400 for %s, got %d: %s", tc.name, w.Code, w.Body.String())
}
if !strings.Contains(w.Body.String(), tc.wantSubstr) {
t.Errorf("expected error to contain %q, got: %s", tc.wantSubstr, w.Body.String())
}
})
}
}
func TestContentDispositionAttachment_Escapes(t *testing.T) {
cases := []struct {
name, input, wantSubstr string
}{
{
name: "plain ASCII passes through",
input: "report.pdf",
wantSubstr: `filename="report.pdf"`,
},
{
name: "double-quote is backslash-escaped",
input: `weird".pdf`,
wantSubstr: `filename="weird\".pdf"`,
},
{
name: "CR and LF dropped to prevent header injection",
input: "bad\r\nX-Leak: 1\r\n.txt",
wantSubstr: `filename="badX-Leak: 1.txt"`,
},
{
name: "non-ASCII emits filename* percent-encoded",
input: "résumé.pdf",
wantSubstr: "filename*=UTF-8''r%C3%A9sum%C3%A9.pdf",
},
}
for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
got := contentDispositionAttachment(tc.input)
if !strings.Contains(got, tc.wantSubstr) {
t.Errorf("contentDispositionAttachment(%q) = %q, missing substring %q", tc.input, got, tc.wantSubstr)
}
// Must never contain a bare CR or LF — either would end the header.
if strings.ContainsAny(got, "\r\n") {
t.Errorf("header contains CR/LF: %q", got)
}
})
}
}
func TestChatDownload_DockerUnavailable(t *testing.T) {
setupTestDB(t)
setupTestRedis(t)
tmplh := NewTemplatesHandler(t.TempDir(), nil) // docker=nil
h := NewChatFilesHandler(tmplh)
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
c.Params = gin.Params{{Key: "id", Value: "00000000-0000-0000-0000-000000000001"}}
req := httptest.NewRequest("GET", "/workspaces/xxx/chat/download?path=/workspace/report.pdf", nil)
c.Request = req
h.Download(c)
if w.Code != http.StatusServiceUnavailable {
t.Errorf("expected 503 when docker is nil, got %d: %s", w.Code, w.Body.String())
}
}

View File

@ -11,6 +11,7 @@ import (
"time"
"github.com/DATA-DOG/go-sqlmock"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/models"
"github.com/gin-gonic/gin"
)
@ -31,7 +32,7 @@ func TestWorkspaceCreate_WithParentID(t *testing.T) {
mock.ExpectBegin()
// Default tier is 3 (Privileged) — see workspace.go create-handler comment.
mock.ExpectExec("INSERT INTO workspaces").
WithArgs(sqlmock.AnyArg(), "Child Agent", nil, 3, "langgraph", sqlmock.AnyArg(), &parentID, nil, "none", (*int64)(nil)).
WithArgs(sqlmock.AnyArg(), "Child Agent", nil, 3, "langgraph", sqlmock.AnyArg(), &parentID, nil, "none", (*int64)(nil), models.DefaultMaxConcurrentTasks).
WillReturnResult(sqlmock.NewResult(0, 1))
mock.ExpectCommit()
mock.ExpectExec("INSERT INTO canvas_layouts").
@ -66,7 +67,7 @@ func TestWorkspaceCreate_ExplicitClaudeCodeRuntime(t *testing.T) {
mock.ExpectBegin()
mock.ExpectExec("INSERT INTO workspaces").
WithArgs(sqlmock.AnyArg(), "CC Agent", nil, 2, "claude-code", sqlmock.AnyArg(), (*string)(nil), nil, "none", (*int64)(nil)).
WithArgs(sqlmock.AnyArg(), "CC Agent", nil, 2, "claude-code", sqlmock.AnyArg(), (*string)(nil), nil, "none", (*int64)(nil), models.DefaultMaxConcurrentTasks).
WillReturnResult(sqlmock.NewResult(0, 1))
mock.ExpectCommit()
mock.ExpectExec("INSERT INTO canvas_layouts").
@ -277,6 +278,40 @@ func TestWorkspaceList_WithData(t *testing.T) {
}
}
// ---------- workspace.go: Create with explicit max_concurrent_tasks ----------
func TestWorkspaceCreate_MaxConcurrentTasksOverride(t *testing.T) {
mock := setupTestDB(t)
setupTestRedis(t)
broadcaster := newTestBroadcaster()
handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir())
mock.ExpectBegin()
mock.ExpectExec("INSERT INTO workspaces").
WithArgs(sqlmock.AnyArg(), "Leader Agent", nil, 3, "claude-code", sqlmock.AnyArg(), (*string)(nil), nil, "none", (*int64)(nil), 3).
WillReturnResult(sqlmock.NewResult(0, 1))
mock.ExpectCommit()
mock.ExpectExec("INSERT INTO canvas_layouts").
WillReturnResult(sqlmock.NewResult(0, 1))
mock.ExpectExec("INSERT INTO structure_events").
WillReturnResult(sqlmock.NewResult(0, 1))
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
body := `{"name":"Leader Agent","runtime":"claude-code","max_concurrent_tasks":3}`
c.Request = httptest.NewRequest("POST", "/workspaces", bytes.NewBufferString(body))
c.Request.Header.Set("Content-Type", "application/json")
handler.Create(c)
if w.Code != http.StatusCreated {
t.Errorf("expected 201, got %d: %s", w.Code, w.Body.String())
}
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("unmet expectations: %v", err)
}
}
// ---------- registry.go: Register with provisioner URL preserved ----------
func TestRegister_ProvisionerURLPreserved(t *testing.T) {

View File

@ -291,7 +291,7 @@ func TestWorkspaceCreate(t *testing.T) {
// Expect workspace INSERT (uuid is dynamic, use AnyArg for id, runtime, awareness_namespace).
// Default tier is 3 (Privileged) — see workspace.go create-handler comment.
mock.ExpectExec("INSERT INTO workspaces").
WithArgs(sqlmock.AnyArg(), "Test Agent", nil, 3, "langgraph", sqlmock.AnyArg(), (*string)(nil), nil, "none", (*int64)(nil)).
WithArgs(sqlmock.AnyArg(), "Test Agent", nil, 3, "langgraph", sqlmock.AnyArg(), (*string)(nil), nil, "none", (*int64)(nil), models.DefaultMaxConcurrentTasks).
WillReturnResult(sqlmock.NewResult(0, 1))
// Expect transaction commit (no secrets in this payload)
@ -884,6 +884,116 @@ func TestHeartbeatHandler_TaskCleared(t *testing.T) {
}
}
// ---------- TestHeartbeatHandler_AlwaysBroadcastsHeartbeat ----------
//
// Regression for the "context canceled" wave on 2026-04-26 (15+ failures
// in 1hr across 6 workspaces). The a2a-proxy idle timer subscribes to
// the broadcaster's SSE channel for the workspace and resets on every
// event. Pre-fix the only broadcast paths from heartbeat were
// TASK_UPDATED (only on current_task change) and the
// WORKSPACE_ONLINE/DEGRADED transitions inside evaluateStatus (only on
// status change). A long-running agent on the same task with stable
// status fired NO broadcasts → idle timer fired → user message
// got cancelled mid-flight.
//
// The fix emits an unconditional WORKSPACE_HEARTBEAT on every successful
// heartbeat. This test pins the property: regardless of whether
// current_task changed, the SSE subscriber observes a broadcast.
func TestHeartbeatHandler_AlwaysBroadcastsHeartbeat(t *testing.T) {
mock := setupTestDB(t)
setupTestRedis(t)
broadcaster := newTestBroadcaster()
handler := NewRegistryHandler(broadcaster)
// Subscribe BEFORE the heartbeat so we don't miss the broadcast.
sub, unsub := broadcaster.SubscribeSSE("ws-123")
defer unsub()
// Same-task scenario: task value unchanged across the heartbeat.
// Pre-fix this path emitted ZERO broadcasts.
mock.ExpectQuery("SELECT COALESCE\\(current_task").
WithArgs("ws-123").
WillReturnRows(sqlmock.NewRows([]string{"current_task"}).AddRow("doing work"))
mock.ExpectExec("UPDATE workspaces SET").
WithArgs("ws-123", 0.0, "", 1, 500, "doing work").
WillReturnResult(sqlmock.NewResult(0, 1))
mock.ExpectQuery("SELECT status FROM workspaces WHERE id =").
WithArgs("ws-123").
WillReturnRows(sqlmock.NewRows([]string{"status"}).AddRow("online"))
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
body := `{"workspace_id":"ws-123","error_rate":0.0,"sample_error":"","active_tasks":1,"uptime_seconds":500,"current_task":"doing work"}`
c.Request = httptest.NewRequest("POST", "/registry/heartbeat", bytes.NewBufferString(body))
c.Request.Header.Set("Content-Type", "application/json")
handler.Heartbeat(c)
if w.Code != http.StatusOK {
t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String())
}
// Drain whatever the handler broadcast (with a tight timeout — the
// channel is in-process so the event should already be queued by
// the time Heartbeat returns).
gotHeartbeat := false
for i := 0; i < 5; i++ {
select {
case msg, ok := <-sub:
if !ok {
t.Fatal("broadcaster channel closed unexpectedly")
}
if msg.Event == "WORKSPACE_HEARTBEAT" {
gotHeartbeat = true
goto done
}
case <-time.After(200 * time.Millisecond):
goto done
}
}
done:
if !gotHeartbeat {
t.Error("expected WORKSPACE_HEARTBEAT broadcast on every heartbeat (regression: pre-fix, same-task heartbeats fired no broadcast and the a2a-proxy idle timer trip-cancelled in-flight requests)")
}
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("unmet sqlmock expectations: %v", err)
}
}
// ---------- TestParseIdleTimeoutEnv ----------
//
// Pins the env-override path including the bad-input fallback paths
// that the package-init `var idleTimeoutDuration = parseIdleTimeoutEnv(...)`
// relies on. Without this test, an operator who sets
// A2A_IDLE_TIMEOUT_SECONDS=foo would get the default with no log signal
// (pre-fix behaviour) and the regression would slip in unnoticed.
func TestParseIdleTimeoutEnv(t *testing.T) {
cases := []struct {
name string
in string
want time.Duration
}{
{"empty falls back to default", "", defaultIdleTimeoutDuration},
{"valid positive integer parses to seconds", "120", 120 * time.Second},
{"valid integer at minimum (1) is accepted", "1", 1 * time.Second},
{"non-numeric falls back to default", "foo", defaultIdleTimeoutDuration},
{"negative falls back to default", "-30", defaultIdleTimeoutDuration},
{"zero falls back to default", "0", defaultIdleTimeoutDuration},
{"float falls back to default (Atoi rejects)", "1.5", defaultIdleTimeoutDuration},
{"trailing units rejected (we accept seconds only)", "60s", defaultIdleTimeoutDuration},
}
for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
got := parseIdleTimeoutEnv(tc.in)
if got != tc.want {
t.Errorf("parseIdleTimeoutEnv(%q) = %v, want %v", tc.in, got, tc.want)
}
})
}
}
// ---------- TestActivityHandler_ListEmpty ----------
func TestActivityHandler_ListEmpty(t *testing.T) {

View File

@ -5,6 +5,7 @@ package handlers
import (
"context"
"encoding/json"
"fmt"
"log"
"net/http"
@ -180,6 +181,108 @@ func NewOrgHandler(wh *WorkspaceHandler, b *events.Broadcaster, p *provisioner.P
}
}
// EnvRequirement is either a single env var name (strict: that exact
// var must be configured) or an any-of group (any one of the listed
// names satisfies the requirement).
//
// YAML shapes accepted:
//
// required_env:
// - GITHUB_TOKEN # single
// - any_of: [ANTHROPIC_API_KEY, CLAUDE_CODE_OAUTH_TOKEN] # OR group
//
// The any-of form exists because some runtimes accept either of two
// credential shapes — Claude Code takes ANTHROPIC_API_KEY or an OAuth
// token interchangeably, and forcing an org template to pick one
// would falsely block the other. For JSON (GET /org/templates),
// the same shapes round-trip: strings stay strings, groups stay
// {any_of: [...]}.
type EnvRequirement struct {
// Name is non-empty for a single required env var.
Name string
// AnyOf is non-empty for an OR group; any one member satisfies.
AnyOf []string
}
// Members returns every env name this requirement considers —
// [Name] for single, AnyOf for groups. Used by preflight, collect,
// and the name-validation regex gate.
func (e EnvRequirement) Members() []string {
if e.Name != "" {
return []string{e.Name}
}
return e.AnyOf
}
// IsSatisfied reports whether any member of the requirement is
// present in `configured`. Single: exact-match. AnyOf: at least
// one hit.
func (e EnvRequirement) IsSatisfied(configured map[string]struct{}) bool {
for _, m := range e.Members() {
if _, ok := configured[m]; ok {
return true
}
}
return false
}
// UnmarshalYAML accepts either a scalar (string → single) or a map
// with an `any_of` list (→ group).
func (e *EnvRequirement) UnmarshalYAML(value *yaml.Node) error {
if value.Kind == yaml.ScalarNode {
var s string
if err := value.Decode(&s); err != nil {
return err
}
e.Name = s
return nil
}
var alt struct {
AnyOf []string `yaml:"any_of"`
}
if err := value.Decode(&alt); err != nil {
return fmt.Errorf("env requirement must be a string or {any_of: [...]}: %w", err)
}
if len(alt.AnyOf) == 0 {
return fmt.Errorf("env requirement any_of must contain at least one env var")
}
e.AnyOf = alt.AnyOf
return nil
}
// MarshalJSON emits the dual shape so GET /org/templates callers get
// {"required_env": ["GITHUB_TOKEN", {"any_of": [...]}]}, matching
// the YAML syntax.
func (e EnvRequirement) MarshalJSON() ([]byte, error) {
if e.Name != "" {
return json.Marshal(e.Name)
}
return json.Marshal(struct {
AnyOf []string `json:"any_of"`
}{AnyOf: e.AnyOf})
}
// UnmarshalJSON is the inverse — accepts the same dual shape so
// POST /org/import with an inline `template` body works too.
func (e *EnvRequirement) UnmarshalJSON(data []byte) error {
var s string
if err := json.Unmarshal(data, &s); err == nil {
e.Name = s
return nil
}
var alt struct {
AnyOf []string `json:"any_of"`
}
if err := json.Unmarshal(data, &alt); err != nil {
return fmt.Errorf("env requirement must be a string or {any_of: [...]}: %w", err)
}
if len(alt.AnyOf) == 0 {
return fmt.Errorf("env requirement any_of must contain at least one env var")
}
e.AnyOf = alt.AnyOf
return nil
}
// OrgTemplate is the YAML structure for an org hierarchy.
type OrgTemplate struct {
Name string `yaml:"name" json:"name"`
@ -189,6 +292,18 @@ type OrgTemplate struct {
// GlobalMemories is a list of org-wide memories seeded as GLOBAL scope
// on the first root workspace (PM) during org import. Issue #1050.
GlobalMemories []models.MemorySeed `yaml:"global_memories" json:"global_memories"`
// RequiredEnv lists env vars that MUST be configured globally (or
// on every workspace in the subtree that needs them) before import
// succeeds. Each entry is either a plain string (strict) or an
// {any_of: [...]} group (at least one member must be set). Declared
// at the org level for shared creds; also extensible per-workspace
// via OrgWorkspace.RequiredEnv for team-scoped credentials.
RequiredEnv []EnvRequirement `yaml:"required_env" json:"required_env"`
// RecommendedEnv is the "nice-to-have" tier — import still succeeds
// without them, but features degrade. Same single|any_of shape as
// RequiredEnv so a recommended OR group reads "set any one of these
// to unlock the feature; all missing = warning".
RecommendedEnv []EnvRequirement `yaml:"recommended_env" json:"recommended_env"`
}
type OrgDefaults struct {
@ -287,15 +402,27 @@ type OrgWorkspace struct {
// InitialMemories are memories seeded into this workspace at creation
// time. If empty, defaults.initial_memories are used. Issue #1050.
InitialMemories []models.MemorySeed `yaml:"initial_memories" json:"initial_memories"`
// MaxConcurrentTasks: see models.CreateWorkspacePayload.
MaxConcurrentTasks int `yaml:"max_concurrent_tasks" json:"max_concurrent_tasks"`
Schedules []OrgSchedule `yaml:"schedules" json:"schedules"`
Channels []OrgChannel `yaml:"channels" json:"channels"`
External bool `yaml:"external" json:"external"`
URL string `yaml:"url" json:"url"`
Canvas struct {
X float64 `yaml:"x" json:"x"`
Y float64 `yaml:"y" json:"y"`
} `yaml:"canvas" json:"canvas"`
// RequiredEnv / RecommendedEnv declared at the workspace level
// add what a specific team needs on top of the org-wide union.
// When GET /org/templates walks the tree, these flow up into
// OrgTemplate.RequiredEnv / RecommendedEnv. A workspace's subtree
// inherits: a parent declaring ANTHROPIC_API_KEY as required
// means every descendant considers it required too (no override
// needed at each leaf). Same single|any_of shape as the org-level
// lists.
RequiredEnv []EnvRequirement `yaml:"required_env" json:"required_env"`
RecommendedEnv []EnvRequirement `yaml:"recommended_env" json:"recommended_env"`
Children []OrgWorkspace `yaml:"children" json:"children"`
}
// ListTemplates handles GET /org/templates — lists available org templates.
@ -354,11 +481,18 @@ func (h *OrgHandler) ListTemplates(c *gin.Context) {
continue
}
count := countWorkspaces(tmpl.Workspaces)
// Walk the tree to collect required + recommended env union.
// Canvas uses these to render a preflight modal BEFORE firing
// the import — saves the user from a 15-workspace import that
// dies one container at a time on missing creds.
required, recommended := collectOrgEnv(&tmpl)
templates = append(templates, map[string]interface{}{
"dir": e.Name(),
"name": tmpl.Name,
"description": tmpl.Description,
"workspaces": count,
"dir": e.Name(),
"name": tmpl.Name,
"description": tmpl.Description,
"workspaces": count,
"required_env": required,
"recommended_env": recommended,
})
}
@ -370,6 +504,13 @@ func (h *OrgHandler) Import(c *gin.Context) {
var body struct {
Dir string `json:"dir"` // org template directory name
Template OrgTemplate `json:"template"` // or inline template
// Force skips the required-env preflight. Used by tooling
// that already computed the preflight client-side and wants
// to proceed despite missing creds (usually because the
// user explicitly acknowledged the tradeoff). Default behavior
// refuses the import with a 412 and the missing-key list so
// the canvas can surface them in its preflight modal.
Force bool `json:"force"`
}
if err := c.ShouldBindJSON(&body); err != nil {
c.JSON(http.StatusBadRequest, gin.H{"error": "invalid request body"})
@ -415,6 +556,59 @@ func (h *OrgHandler) Import(c *gin.Context) {
return
}
// Required-env preflight — refuses import when any required_env is
// missing from global_secrets (unless `force: true` overrides). The
// canvas runs the same check client-side against GET /org/templates
// output and shows a modal so users set keys before clicking Import;
// this server-side check is the authoritative guard in case a caller
// bypasses the UI (CLI, API clients, etc.). 412 Precondition Failed
// carries the missing-key list so tooling can render the same
// add-key flow.
required, _ := collectOrgEnv(&tmpl)
if body.Force {
// Log the bypass so a post-incident search can find who
// imported an org with missing creds. The common audit flow
// treats log.Printf at INFO as the low-cost trail for
// explicit-override actions — keeps force as a supported
// knob but makes it investigable.
log.Printf("Org import: force=true bypass — template=%q, required_env=%v", tmpl.Name, required)
} else if len(required) > 0 {
ctx := c.Request.Context()
configured, err := loadConfiguredGlobalSecretKeys(ctx)
if err != nil {
// Fail closed. Previously this fell through and imported
// anyway, defeating the preflight for exactly the case
// it's meant to cover. A DB hiccup should look like a
// retryable 500, not a silent green light for an import
// that will fail at container-start time on every node.
log.Printf("Org import preflight: global secrets lookup failed: %v", err)
c.JSON(http.StatusInternalServerError, gin.H{
"error": "could not verify required environment variables; try again or pass force=true to override",
})
return
}
var missing []EnvRequirement
for _, req := range required {
// For a single requirement this is exact-match; for an
// any-of group, any one member satisfies. Groups whose
// alternative is already configured drop out here — the
// user doesn't need to re-configure them.
if !req.IsSatisfied(configured) {
missing = append(missing, req)
}
}
if len(missing) > 0 {
c.JSON(http.StatusPreconditionFailed, gin.H{
"error": "missing required environment variables",
"missing_env": missing,
"required_env": required,
"template": tmpl.Name,
"suggestion": "set these as global secrets (POST /settings/secrets) or pass force=true to override",
})
return
}
}
results := []map[string]interface{}{}
var createErr error
@ -426,7 +620,8 @@ func (h *OrgHandler) Import(c *gin.Context) {
// using subtree-aware grid slots (children that are themselves
// parents get a bigger slot so they don't overflow into siblings).
for _, ws := range tmpl.Workspaces {
// Root: relX/relY == absX/absY (no parent to be relative to).
if err := h.createWorkspaceTree(ws, nil, ws.Canvas.X, ws.Canvas.Y, ws.Canvas.X, ws.Canvas.Y, tmpl.Defaults, orgBaseDir, &results, provisionSem); err != nil {
createErr = err
break
}

View File

@ -10,6 +10,8 @@ import (
"log"
"os"
"path/filepath"
"regexp"
"sort"
"strings"
"time"
@ -28,7 +30,13 @@ import (
// parent.abs + childSlotInGrid(index, siblingSizes) computed by the
// caller. Storing already-absolute coords means a child that is itself
// a parent can simply compound the grid without any per-call math.
// relX / relY are THIS workspace's position RELATIVE to its parent's
// absolute origin (i.e. childSlotInGrid output for children; 0,0 for
// roots since a root's absolute IS its relative). The broadcast
// payload ships relative coords so the canvas can drop the node
// straight into the parent's child-coordinate space without doing a
// canvas-wide absolute-position walk.
func (h *OrgHandler) createWorkspaceTree(ws OrgWorkspace, parentID *string, absX, absY, relX, relY float64, defaults OrgDefaults, orgBaseDir string, results *[]map[string]interface{}, provisionSem chan struct{}) error {
// Apply defaults
runtime := ws.Runtime
if runtime == "" {
@ -103,10 +111,14 @@ func (h *OrgHandler) createWorkspaceTree(ws OrgWorkspace, parentID *string, absX
// (see canvas-topology.ts), so imports don't spray the viewport.
initialCollapsed := false
maxConcurrent := ws.MaxConcurrentTasks
if maxConcurrent <= 0 {
maxConcurrent = models.DefaultMaxConcurrentTasks
}
_, err := db.DB.ExecContext(ctx, `
INSERT INTO workspaces (id, name, role, tier, runtime, awareness_namespace, status, parent_id, workspace_dir, workspace_access, max_concurrent_tasks)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11)
`, id, ws.Name, role, tier, runtime, awarenessNS, "provisioning", parentID, workspaceDir, workspaceAccess, maxConcurrent)
if err != nil {
log.Printf("Org import: failed to create %s: %v", ws.Name, err)
return fmt.Errorf("failed to create %s: %w", ws.Name, err)
@ -128,10 +140,23 @@ func (h *OrgHandler) createWorkspaceTree(ws OrgWorkspace, parentID *string, absX
}
// Broadcast — include runtime so the canvas pill renders the right
// badge immediately instead of "unknown". parent_id + x/y let the
// canvas's org-deploy animation spawn the child from the parent's
// current coords and tween into its reserved slot, instead of
// landing in a default grid position first and snapping on the
// next hydrate.
payload := map[string]interface{}{
"name": ws.Name, "tier": tier, "runtime": runtime,
// Parent-relative coords — the canvas's React Flow node uses
// these as the node's position when parent_id is set (React
// Flow treats node.position as parent-relative when the node
// has a parentId). For roots, relX/relY == absX/absY.
"x": relX, "y": relY,
}
if parentID != nil {
payload["parent_id"] = *parentID
}
h.broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_PROVISIONING", id, payload)
// Seed initial memories from workspace config or defaults (issue #1050).
// Per-workspace initial_memories override defaults; if workspace has none,
@ -509,7 +534,9 @@ func (h *OrgHandler) createWorkspaceTree(ws OrgWorkspace, parentID *string, absX
slotX, slotY := childSlotInGrid(i, siblingSizes)
childAbsX := absX + slotX
childAbsY := absY + slotY
if err := h.createWorkspaceTree(child, &id, childAbsX, childAbsY, defaults, orgBaseDir, results, provisionSem); err != nil {
// slotX/slotY are already parent-relative — that's
// exactly what childSlotInGrid returns.
if err := h.createWorkspaceTree(child, &id, childAbsX, childAbsY, slotX, slotY, defaults, orgBaseDir, results, provisionSem); err != nil {
return err
}
time.Sleep(workspaceCreatePacingMs * time.Millisecond)
@ -519,6 +546,213 @@ func (h *OrgHandler) createWorkspaceTree(ws OrgWorkspace, parentID *string, absX
return nil
}
// envVarNamePattern guards template-supplied env var names against
// pathological inputs. A malicious template could ship
// required_env: ["'; DROP …"] or whitespace-only entries that would
// flow through collectOrgEnv → into the 412 response body and,
// worse, into the modal's PUT /settings/secrets input. Schema
// already has `key TEXT NOT NULL UNIQUE` and our queries are
// parameterised so SQL injection isn't the threat — the real risks
// are UI rendering weirdness (newlines, NUL bytes, zero-width chars)
// and downstream env-var semantics (POSIX requires uppercase +
// underscore + digit). A strict regex filters both classes of
// problem at a single choke point.
var envVarNamePattern = regexp.MustCompile(`^[A-Z][A-Z0-9_]{0,127}$`)
// sanitizeEnvMembers filters a requirement's member list through the
// name-validation regex, logging rejections. Returns the filtered
// list and a boolean indicating whether any valid members remain.
// Used so a group containing one valid + one bogus name is kept
// (valid member carries the group) rather than silently dropped.
func sanitizeEnvMembers(members []string, where string) ([]string, bool) {
out := make([]string, 0, len(members))
for _, k := range members {
if !envVarNamePattern.MatchString(k) {
if k != "" {
log.Printf("collectOrgEnv: rejecting invalid env var name %q from %s (must match %s)", k, where, envVarNamePattern)
}
continue
}
out = append(out, k)
}
return out, len(out) > 0
}
// envRequirementKey canonicalises a requirement for dedup — sorted
// member list joined with NUL so `any_of: [A, B]` and `any_of: [B, A]`
// collapse to the same key. Single requirements are length-1 groups.
func envRequirementKey(members []string) string {
cp := append([]string(nil), members...)
sort.Strings(cp)
return strings.Join(cp, "\x00")
}
// collectOrgEnv walks the whole template tree and returns the union of
// required_env and recommended_env declared anywhere — at the org
// level, on root workspaces, or on any nested child. Deduplicates by
// group membership (same set of members = same requirement) and
// sorts deterministically so the canvas sees a stable order.
//
// "Required wins" rules:
//
// - A requirement that appears in BOTH required and recommended
// (same members) surfaces only as required.
// - A single-name requirement (e.g. "API_KEY") and a group that
// contains that same name (e.g. {any_of: [API_KEY, OTHER]}) are
// NOT deduplicated — they're semantically different (strict vs
// satisfiable-by-alternative) and the stricter "single" one wins,
// so the any-of group is dropped when its members overlap with a
// strict requirement declared elsewhere.
//
// Invalid names fail envVarNamePattern; the filter is applied per
// group so a group with one bogus entry keeps the rest. A group
// whose ALL members are invalid is dropped entirely with a log.
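// Worked example (illustrative): given
//
//	required_env:    [API_KEY, {any_of: [API_KEY, FALLBACK_KEY]}]
//	recommended_env: [API_KEY, OTHER_KEY]
//
// the result is required = [API_KEY] and recommended = [OTHER_KEY]:
// the any-of group and the recommended duplicate are both dominated
// by the strict required API_KEY.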
func collectOrgEnv(tmpl *OrgTemplate) (required, recommended []EnvRequirement) {
reqByKey := map[string]EnvRequirement{}
recByKey := map[string]EnvRequirement{}
// Names covered by strict (single) required entries. A group in
// EITHER tier whose any-of contains ONE of these names is
// dominated by the strict requirement and gets dropped on the
// second pass.
strictRequiredNames := map[string]struct{}{}
accept := func(into map[string]EnvRequirement, src []EnvRequirement, where string, markStrict bool) {
for _, req := range src {
members, ok := sanitizeEnvMembers(req.Members(), where)
if !ok {
continue
}
key := envRequirementKey(members)
if _, exists := into[key]; exists {
continue
}
if req.Name != "" && len(members) == 1 {
into[key] = EnvRequirement{Name: members[0]}
if markStrict {
strictRequiredNames[members[0]] = struct{}{}
}
} else {
into[key] = EnvRequirement{AnyOf: members}
}
}
}
accept(reqByKey, tmpl.RequiredEnv, "template root", true)
accept(recByKey, tmpl.RecommendedEnv, "template root", false)
var walk func([]OrgWorkspace)
walk = func(ws []OrgWorkspace) {
for _, w := range ws {
accept(reqByKey, w.RequiredEnv, "workspace "+w.Name, true)
accept(recByKey, w.RecommendedEnv, "workspace "+w.Name, false)
walk(w.Children)
}
}
walk(tmpl.Workspaces)
// Required wins across tiers: any requirement whose members
// overlap with a strict required name gets dropped from
// recommended. Keeps the canvas modal from showing the same
// key in both sections.
prune := func(from map[string]EnvRequirement) {
for k, r := range from {
for _, m := range r.Members() {
if _, strict := strictRequiredNames[m]; strict {
delete(from, k)
break
}
}
}
}
prune(recByKey)
// Same-tier: a strict required X dominates any-of groups in
// required that CONTAIN X (a group saying "any of X, Y" is
// automatically satisfied when X is required anyway, so it's
// redundant). Same logic applied to recommended.
pruneSameTier := func(tier map[string]EnvRequirement) {
strictInTier := map[string]struct{}{}
for _, r := range tier {
if r.Name != "" {
strictInTier[r.Name] = struct{}{}
}
}
for k, r := range tier {
if len(r.AnyOf) == 0 {
continue
}
for _, m := range r.AnyOf {
if _, strict := strictInTier[m]; strict {
delete(tier, k)
break
}
}
}
}
pruneSameTier(reqByKey)
pruneSameTier(recByKey)
required = flattenAndSortRequirements(reqByKey)
recommended = flattenAndSortRequirements(recByKey)
return required, recommended
}
func flattenAndSortRequirements(by map[string]EnvRequirement) []EnvRequirement {
out := make([]EnvRequirement, 0, len(by))
for _, r := range by {
out = append(out, r)
}
sort.Slice(out, func(i, j int) bool {
// Sort singles first by name; groups after, ordered by
// joined-member string. Gives the canvas a deterministic
// render order so the same template always produces the
// same modal layout.
iSingle := out[i].Name != ""
jSingle := out[j].Name != ""
if iSingle != jSingle {
return iSingle
}
if iSingle {
return out[i].Name < out[j].Name
}
return envRequirementKey(out[i].AnyOf) < envRequirementKey(out[j].AnyOf)
})
return out
}
// loadConfiguredGlobalSecretKeys returns the set of key names present
// in global_secrets WHERE the encrypted_value is non-empty. Filtering
// on the payload size catches the failure mode where a row was
// upserted with an empty value (historical rows predating the
// binding:"required" guard on SetGlobal, or a future direct SQL
// path that skips it) — the preflight would otherwise report the
// key as "configured" and the per-container preflight would still
// fail at start time, defeating the whole feature.
// The LIMIT is a sanity cap: at realistic tenant sizes (< 1k
// secrets) it's a no-op; at pathological sizes it stops one slow
// query from wedging org imports. A hit gets logged so operators
// can investigate.
const globalSecretsPreflightLimit = 10000
func loadConfiguredGlobalSecretKeys(ctx context.Context) (map[string]struct{}, error) {
rows, err := db.DB.QueryContext(ctx,
`SELECT key FROM global_secrets WHERE octet_length(encrypted_value) > 0 LIMIT $1`,
globalSecretsPreflightLimit)
if err != nil {
return nil, err
}
defer rows.Close()
out := map[string]struct{}{}
for rows.Next() {
var k string
if scanErr := rows.Scan(&k); scanErr == nil && k != "" {
out[k] = struct{}{}
}
}
if len(out) == globalSecretsPreflightLimit {
log.Printf("loadConfiguredGlobalSecretKeys: hit LIMIT %d — org-import preflight may be incomplete", globalSecretsPreflightLimit)
}
return out, rows.Err()
}
func countWorkspaces(workspaces []OrgWorkspace) int {
count := len(workspaces)
for _, ws := range workspaces {

View File

@ -1,6 +1,7 @@
package handlers
import (
"sort"
"strings"
"testing"
"time"
@ -650,3 +651,428 @@ func TestOrgImport_ScheduleComputeError(t *testing.T) {
})
}
}
// ============================================================================
// Org env-preflight aggregation (collectOrgEnv)
// ============================================================================
// strictReq builds a slice of single-name EnvRequirements for test
// fixtures. Equivalent to the old []string literal but wrapped in
// the new union shape.
func strictReq(names ...string) []EnvRequirement {
out := make([]EnvRequirement, 0, len(names))
for _, n := range names {
out = append(out, EnvRequirement{Name: n})
}
return out
}
// anyOfReq builds a single any-of EnvRequirement for test fixtures.
func anyOfReq(names ...string) EnvRequirement {
return EnvRequirement{AnyOf: append([]string(nil), names...)}
}
// reqNames flattens a slice of EnvRequirements into a single comparable
// slice: single-name reqs contribute their Name, any-of reqs contribute
// "anyOf(A|B|C)" with members sorted for deterministic output. Lets
// tests assert against a string form regardless of which kind each
// entry takes.
func reqNames(reqs []EnvRequirement) []string {
out := make([]string, 0, len(reqs))
for _, r := range reqs {
if r.Name != "" {
out = append(out, r.Name)
continue
}
members := append([]string(nil), r.AnyOf...)
sort.Strings(members)
out = append(out, "anyOf("+strings.Join(members, "|")+")")
}
return out
}
func TestCollectOrgEnv_UnionAcrossLevels(t *testing.T) {
tmpl := &OrgTemplate{
RequiredEnv: strictReq("ANTHROPIC_API_KEY"),
RecommendedEnv: strictReq("SLACK_WEBHOOK_URL"),
Workspaces: []OrgWorkspace{
{
Name: "Root",
RequiredEnv: strictReq("GITHUB_TOKEN"),
Children: []OrgWorkspace{
{
Name: "Leaf",
RequiredEnv: strictReq("OPENROUTER_API_KEY"),
RecommendedEnv: strictReq("DISCORD_WEBHOOK_URL"),
},
},
},
},
}
req, rec := collectOrgEnv(tmpl)
// Required is the union of top-level + root + leaf.
wantReq := []string{"ANTHROPIC_API_KEY", "GITHUB_TOKEN", "OPENROUTER_API_KEY"}
if !stringSlicesEqual(reqNames(req), wantReq) {
t.Errorf("required mismatch: got %v, want %v", reqNames(req), wantReq)
}
wantRec := []string{"DISCORD_WEBHOOK_URL", "SLACK_WEBHOOK_URL"}
if !stringSlicesEqual(reqNames(rec), wantRec) {
t.Errorf("recommended mismatch: got %v, want %v", reqNames(rec), wantRec)
}
}
func TestCollectOrgEnv_RequiredWinsOverRecommended(t *testing.T) {
// Same key declared at one layer as recommended and another as
// required MUST surface only on the required side — a required
// declaration is strictly stricter than a recommended one, and
// listing it in both tiers would confuse the preflight modal.
tmpl := &OrgTemplate{
RecommendedEnv: strictReq("API_KEY"),
Workspaces: []OrgWorkspace{
{Name: "X", RequiredEnv: strictReq("API_KEY")},
},
}
req, rec := collectOrgEnv(tmpl)
if len(req) != 1 || req[0].Name != "API_KEY" {
t.Errorf("required should contain API_KEY, got %v", reqNames(req))
}
for _, r := range rec {
if r.Name == "API_KEY" {
t.Errorf("API_KEY must not appear in recommended once required elsewhere")
}
}
}
func TestCollectOrgEnv_Dedup(t *testing.T) {
// Same key declared twice at different levels should appear once.
tmpl := &OrgTemplate{
RequiredEnv: strictReq("K", "K"),
Workspaces: []OrgWorkspace{
{Name: "A", RequiredEnv: strictReq("K")},
{Name: "B", RequiredEnv: strictReq("K"), Children: []OrgWorkspace{
{Name: "C", RequiredEnv: strictReq("K")},
}},
},
}
req, _ := collectOrgEnv(tmpl)
if len(req) != 1 || req[0].Name != "K" {
t.Errorf("dedup failed: got %v, want [K]", reqNames(req))
}
}
func TestCollectOrgEnv_Empty(t *testing.T) {
tmpl := &OrgTemplate{}
req, rec := collectOrgEnv(tmpl)
if len(req) != 0 || len(rec) != 0 {
t.Errorf("empty template should produce empty slices, got req=%v rec=%v", reqNames(req), reqNames(rec))
}
}
// stringSlicesEqual checks ordered equality — collectOrgEnv sorts its
// output so callers can do deterministic comparisons.
func stringSlicesEqual(a, b []string) bool {
if len(a) != len(b) {
return false
}
for i := range a {
if a[i] != b[i] {
return false
}
}
return true
}
func TestCollectOrgEnv_RequiredWinsOnSameStruct(t *testing.T) {
// The same key declared required AND recommended on the SAME
// workspace node (rare but legal to parse) must still dedup
// correctly and end up required-only.
tmpl := &OrgTemplate{
Workspaces: []OrgWorkspace{
{
Name: "X",
RequiredEnv: strictReq("API_KEY"),
RecommendedEnv: strictReq("API_KEY"),
},
},
}
req, rec := collectOrgEnv(tmpl)
if len(req) != 1 || req[0].Name != "API_KEY" {
t.Errorf("required should contain API_KEY once, got %v", reqNames(req))
}
for _, r := range rec {
if r.Name == "API_KEY" {
t.Errorf("API_KEY must not appear in recommended when also required on same struct")
}
}
}
func TestCollectOrgEnv_RejectsInvalidNames(t *testing.T) {
// Names failing envVarNamePattern (lowercase, traversal, whitespace,
// shell metachars) must be dropped silently — the log line is not
// asserted here; the output slice assertion is enough to prove the
// filter fires.
tmpl := &OrgTemplate{
RequiredEnv: strictReq(
"VALID_ONE",
"lowercase_bad",
"../../etc/passwd",
"name with spaces",
"WITH-DASH",
"'; DROP TABLE users;--",
"",
"A", // single char — still valid per regex
),
}
req, _ := collectOrgEnv(tmpl)
if !stringSlicesEqual(reqNames(req), []string{"A", "VALID_ONE"}) {
t.Errorf("expected only valid names, got %v", reqNames(req))
}
}
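// The pattern the test above filters against is not part of this
// excerpt; a shape consistent with every case listed (an assumption,
// not the verbatim production regex — would need "regexp" imported):
//
//	var envVarNamePattern = regexp.MustCompile(`^[A-Z][A-Z0-9_]*$`)
//
// Uppercase start, then uppercase/digits/underscore — so "A" and
// "VALID_ONE" pass while lowercase, dashes, spaces, shell metachars,
// and the empty string all fail.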
// TestOrgTemplate_ClaudeAnyOfAuthPreflight exercises the shape the
// ux-ab-lab template ships with: a single any-of group at the org
// level covering ANTHROPIC_API_KEY vs. CLAUDE_CODE_OAUTH_TOKEN, plus
// two strict recommended entries (SERPER_API_KEY, VERCEL_TOKEN).
// Proves the end-to-end YAML → OrgTemplate → collectOrgEnv → IsSatisfied
// pipeline works for the canonical "Claude sub OR API key" pattern
// without depending on the on-disk template file (org-templates/ is
// populated by the clone-manifest, not tracked in this monorepo).
func TestOrgTemplate_ClaudeAnyOfAuthPreflight(t *testing.T) {
src := `
name: UX A/B Lab
required_env:
- any_of:
- ANTHROPIC_API_KEY
- CLAUDE_CODE_OAUTH_TOKEN
recommended_env:
- SERPER_API_KEY
- VERCEL_TOKEN
workspaces:
- name: Design Director
children:
- name: UX Researcher
- name: Visual Designer
- name: React Engineer
- name: Deploy Engineer
- name: A11y + SEO Auditor
- name: Perf Auditor
`
var tmpl OrgTemplate
if err := yaml.Unmarshal([]byte(src), &tmpl); err != nil {
t.Fatalf("unmarshal: %v", err)
}
if len(tmpl.Workspaces) != 1 || len(tmpl.Workspaces[0].Children) != 6 {
t.Fatalf("expected 1 root with 6 children, got shape %+v", tmpl.Workspaces)
}
required, recommended := collectOrgEnv(&tmpl)
if len(required) != 1 {
t.Fatalf("expected 1 required requirement (the any-of group), got %d: %v", len(required), reqNames(required))
}
if required[0].Name != "" {
t.Errorf("expected any-of group, got strict name %q", required[0].Name)
}
wantMembers := []string{"ANTHROPIC_API_KEY", "CLAUDE_CODE_OAUTH_TOKEN"}
got := append([]string(nil), required[0].AnyOf...)
sort.Strings(got)
if !stringSlicesEqual(got, wantMembers) {
t.Errorf("any-of members mismatch: got %v, want %v", got, wantMembers)
}
// Either member should independently satisfy the group.
if !required[0].IsSatisfied(map[string]struct{}{"ANTHROPIC_API_KEY": {}}) {
t.Errorf("ANTHROPIC_API_KEY alone should satisfy the group")
}
if !required[0].IsSatisfied(map[string]struct{}{"CLAUDE_CODE_OAUTH_TOKEN": {}}) {
t.Errorf("CLAUDE_CODE_OAUTH_TOKEN alone should satisfy the group")
}
if required[0].IsSatisfied(map[string]struct{}{"OPENAI_API_KEY": {}}) {
t.Errorf("unrelated key should NOT satisfy the group")
}
wantRec := []string{"SERPER_API_KEY", "VERCEL_TOKEN"}
if !stringSlicesEqual(reqNames(recommended), wantRec) {
t.Errorf("recommended mismatch: got %v, want %v", reqNames(recommended), wantRec)
}
}
// TestEnvRequirement_UnmarshalYAML proves the on-disk YAML shape
// (scalar OR `{any_of: [...]}` block) round-trips into EnvRequirement
// correctly. The preflight pipeline reads user-authored org.yaml
// files; a regression here would silently drop requirements.
func TestEnvRequirement_UnmarshalYAML(t *testing.T) {
src := `
required_env:
- GITHUB_TOKEN
- any_of:
- ANTHROPIC_API_KEY
- CLAUDE_CODE_OAUTH_TOKEN
`
var parsed struct {
RequiredEnv []EnvRequirement `yaml:"required_env"`
}
if err := yaml.Unmarshal([]byte(src), &parsed); err != nil {
t.Fatalf("unmarshal failed: %v", err)
}
if len(parsed.RequiredEnv) != 2 {
t.Fatalf("want 2 requirements, got %d", len(parsed.RequiredEnv))
}
if parsed.RequiredEnv[0].Name != "GITHUB_TOKEN" {
t.Errorf("first should be strict GITHUB_TOKEN, got %+v", parsed.RequiredEnv[0])
}
if parsed.RequiredEnv[1].Name != "" || len(parsed.RequiredEnv[1].AnyOf) != 2 {
t.Errorf("second should be any-of group, got %+v", parsed.RequiredEnv[1])
}
}
// TestEnvRequirement_UnmarshalYAML_RejectsEmptyAnyOf guards against a
// template that ships `any_of: []` — ambiguous semantics (impossible
// to satisfy), so the parser must fail loudly rather than silently
// pass a never-satisfiable requirement through the preflight.
func TestEnvRequirement_UnmarshalYAML_RejectsEmptyAnyOf(t *testing.T) {
src := `
required_env:
- any_of: []
`
var parsed struct {
RequiredEnv []EnvRequirement `yaml:"required_env"`
}
err := yaml.Unmarshal([]byte(src), &parsed)
if err == nil {
t.Errorf("expected error for empty any_of, got nil: %+v", parsed)
}
}
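// A sketch of the scalar-or-mapping decode the two tests above
// exercise, written as a free function so it does not restate the real
// method — assumes gopkg.in/yaml.v3 (node-based decoding) and an
// "errors" import; the production UnmarshalYAML may differ in detail:
func unmarshalEnvReqSketch(node *yaml.Node) (EnvRequirement, error) {
	if node.Kind == yaml.ScalarNode {
		var r EnvRequirement
		return r, node.Decode(&r.Name) // bare string → strict requirement
	}
	var wrapper struct {
		AnyOf []string `yaml:"any_of"`
	}
	if err := node.Decode(&wrapper); err != nil {
		return EnvRequirement{}, err
	}
	if len(wrapper.AnyOf) == 0 {
		return EnvRequirement{}, errors.New("env requirement: any_of must list at least one name")
	}
	return EnvRequirement{AnyOf: wrapper.AnyOf}, nil
}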
// ---------------------------------------------------------------------
// any_of group tests — the new EnvRequirement union shape allows a
// single requirement to be satisfied by any of a list of members (e.g.
// ANTHROPIC_API_KEY OR CLAUDE_CODE_OAUTH_TOKEN). collectOrgEnv +
// IsSatisfied together must handle this correctly.
// ---------------------------------------------------------------------
func TestEnvRequirement_IsSatisfied(t *testing.T) {
configured := map[string]struct{}{
"ANTHROPIC_API_KEY": {},
"GITHUB_TOKEN": {},
}
tests := []struct {
name string
req EnvRequirement
want bool
}{
{"strict present", EnvRequirement{Name: "ANTHROPIC_API_KEY"}, true},
{"strict absent", EnvRequirement{Name: "MISSING_KEY"}, false},
{"any-of first member present", anyOfReq("ANTHROPIC_API_KEY", "CLAUDE_CODE_OAUTH_TOKEN"), true},
{"any-of second member present", anyOfReq("CLAUDE_CODE_OAUTH_TOKEN", "ANTHROPIC_API_KEY"), true},
{"any-of none present", anyOfReq("OPENAI_API_KEY", "CLAUDE_CODE_OAUTH_TOKEN"), false},
{"any-of single member present", anyOfReq("GITHUB_TOKEN"), true},
}
for _, tt := range tests {
if got := tt.req.IsSatisfied(configured); got != tt.want {
t.Errorf("%s: got %v, want %v", tt.name, got, tt.want)
}
}
}
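// The semantics the table above pins down, as a free-function sketch
// (the real method hangs off EnvRequirement; this restatement is
// illustrative only):
func isSatisfiedSketch(r EnvRequirement, configured map[string]struct{}) bool {
	if r.Name != "" { // strict: the one name must be present
		_, ok := configured[r.Name]
		return ok
	}
	for _, member := range r.AnyOf { // group: any member suffices
		if _, ok := configured[member]; ok {
			return true
		}
	}
	return false
}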
func TestCollectOrgEnv_AnyOfGroupPreserved(t *testing.T) {
// A group with two alternatives should come through as a single
// EnvRequirement carrying both members.
tmpl := &OrgTemplate{
RequiredEnv: []EnvRequirement{
anyOfReq("ANTHROPIC_API_KEY", "CLAUDE_CODE_OAUTH_TOKEN"),
},
}
req, _ := collectOrgEnv(tmpl)
if len(req) != 1 {
t.Fatalf("expected 1 requirement, got %d: %v", len(req), reqNames(req))
}
if req[0].Name != "" {
t.Errorf("expected any-of group, got strict name %q", req[0].Name)
}
wantMembers := []string{"ANTHROPIC_API_KEY", "CLAUDE_CODE_OAUTH_TOKEN"}
got := append([]string(nil), req[0].AnyOf...)
sort.Strings(got)
if !stringSlicesEqual(got, wantMembers) {
t.Errorf("any-of members mismatch: got %v, want %v", got, wantMembers)
}
}
func TestCollectOrgEnv_AnyOfGroupDedup(t *testing.T) {
// Two identical groups (members in different order) declared at
// different levels must collapse to one.
tmpl := &OrgTemplate{
RequiredEnv: []EnvRequirement{
anyOfReq("ANTHROPIC_API_KEY", "CLAUDE_CODE_OAUTH_TOKEN"),
},
Workspaces: []OrgWorkspace{
{
Name: "Root",
RequiredEnv: []EnvRequirement{
anyOfReq("CLAUDE_CODE_OAUTH_TOKEN", "ANTHROPIC_API_KEY"),
},
},
},
}
req, _ := collectOrgEnv(tmpl)
if len(req) != 1 {
t.Errorf("expected 1 requirement after dedup, got %d: %v", len(req), reqNames(req))
}
}
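// Order-insensitive dedup, as the test above requires, reduces to a
// canonical key per requirement — hypothetical helper name, mirroring
// the reqNames formatting at the top of the file:
func groupKeySketch(r EnvRequirement) string {
	if r.Name != "" {
		return r.Name
	}
	members := append([]string(nil), r.AnyOf...)
	sort.Strings(members) // member order must not matter
	return "anyOf(" + strings.Join(members, "|") + ")"
}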
func TestCollectOrgEnv_StrictDominatesGroup(t *testing.T) {
// If a strict requirement X is declared anywhere, any-of groups
// that CONTAIN X are redundant — the strict requirement will force
// X to be configured, which satisfies any group mentioning it too.
// Same-tier pruning drops the group.
tmpl := &OrgTemplate{
RequiredEnv: []EnvRequirement{
{Name: "ANTHROPIC_API_KEY"},
anyOfReq("ANTHROPIC_API_KEY", "CLAUDE_CODE_OAUTH_TOKEN"),
},
}
req, _ := collectOrgEnv(tmpl)
if len(req) != 1 || req[0].Name != "ANTHROPIC_API_KEY" {
t.Errorf("strict should dominate group, got %v", reqNames(req))
}
}
func TestCollectOrgEnv_StrictRequiredDominatesRecommendedGroup(t *testing.T) {
// Cross-tier: a strict required X drops any-of groups in the
// recommended tier that mention X.
tmpl := &OrgTemplate{
RequiredEnv: strictReq("ANTHROPIC_API_KEY"),
RecommendedEnv: []EnvRequirement{
anyOfReq("ANTHROPIC_API_KEY", "CLAUDE_CODE_OAUTH_TOKEN"),
{Name: "SLACK_WEBHOOK_URL"},
},
}
req, rec := collectOrgEnv(tmpl)
if len(req) != 1 || req[0].Name != "ANTHROPIC_API_KEY" {
t.Errorf("required mismatch: got %v", reqNames(req))
}
// The any-of group should have been pruned; only SLACK remains.
if len(rec) != 1 || rec[0].Name != "SLACK_WEBHOOK_URL" {
t.Errorf("recommended mismatch: got %v, want [SLACK_WEBHOOK_URL]", reqNames(rec))
}
}
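// The pruning rule both dominance tests encode, sketched in isolation
// (hypothetical helper; the production code interleaves this with
// dedup and validation): once the strict names from the required tier
// are known, drop any group — in either tier — that mentions one.
func pruneDominatedSketch(reqs []EnvRequirement, strict map[string]struct{}) []EnvRequirement {
	kept := make([]EnvRequirement, 0, len(reqs))
	for _, r := range reqs {
		dominated := false
		for _, m := range r.AnyOf { // strict entries have empty AnyOf and are never dropped here
			if _, ok := strict[m]; ok {
				dominated = true
				break
			}
		}
		if !dominated {
			kept = append(kept, r)
		}
	}
	return kept
}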
func TestCollectOrgEnv_AnyOfWithInvalidMemberKeepsValidOnes(t *testing.T) {
// A group with one valid + one invalid member should keep the
// valid one (the group survives on whatever legitimate names
// remain). A group where ALL members are invalid is dropped entirely.
tmpl := &OrgTemplate{
RequiredEnv: []EnvRequirement{
anyOfReq("VALID_ONE", "lowercase_bad"),
anyOfReq("'; DROP TABLE;--", ""),
},
}
req, _ := collectOrgEnv(tmpl)
if len(req) != 1 {
t.Fatalf("expected 1 requirement, got %d: %v", len(req), reqNames(req))
}
// The remaining group has only one valid member, so it gets
// promoted to a single-name requirement (len(members)==1 path).
if req[0].Name != "VALID_ONE" && !stringSlicesEqual(req[0].AnyOf, []string{"VALID_ONE"}) {
t.Errorf("expected VALID_ONE to survive, got %v", reqNames(req))
}
}

View File

@@ -441,6 +441,26 @@ func (h *RegistryHandler) Heartbeat(c *gin.Context) {
})
}
// Always emit a lightweight heartbeat broadcast — load-bearing for
// the a2a-proxy's per-dispatch idle timeout (a2a_proxy.go:applyIdleTimeout).
// Before this, the proxy's idle timer reset on TASK_UPDATED but
// TASK_UPDATED only fires when current_task CHANGES. A long-running
// agent that keeps the same task value for >idleTimeoutDuration
// (claude-code packaging a ZIP, slow tool call, model thinking time)
// hit no broadcast → idle timer fired → user's message got cancelled
// mid-flight with "context canceled". Symptom users hit on the
// 2026-04-26 director-bypass investigation: 15+ failures in 1hr
// across 6 workspaces, all silent during the gap.
//
// Cost: BroadcastOnly skips the DB write (no activity_logs row),
// so per-heartbeat cost is one in-memory channel send per active
// SSE subscriber and one WS hub fan-out. At 30s heartbeat cadence
// this is far below any noise floor on either path.
h.broadcaster.BroadcastOnly(payload.WorkspaceID, "WORKSPACE_HEARTBEAT", map[string]interface{}{
"active_tasks": payload.ActiveTasks,
"uptime_seconds": payload.UptimeSeconds,
})
c.JSON(http.StatusOK, gin.H{"status": "ok"})
}
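// Illustrative contrast only — sketchBroadcaster and its fields are
// hypothetical (the real Broadcaster type is outside this diff). The
// one behavior taken from the comment above: BroadcastOnly is
// RecordAndBroadcast minus the durable event row; both share the same
// in-memory fan-out. Argument orders mirror the call sites above.
type sketchBroadcaster struct {
	fanout func(wsID, event string, data map[string]interface{})                            // in-memory SSE/WS push
	record func(ctx context.Context, wsID, event string, data map[string]interface{}) error // durable event INSERT
}

func (b *sketchBroadcaster) BroadcastOnly(wsID, event string, data map[string]interface{}) {
	b.fanout(wsID, event, data) // no DB write — cheap enough for 30s heartbeat cadence
}

func (b *sketchBroadcaster) RecordAndBroadcast(ctx context.Context, event, wsID string, data map[string]interface{}) {
	if err := b.record(ctx, wsID, event, data); err != nil {
		log.Printf("record %s for %s: %v", event, wsID, err)
	}
	b.fanout(wsID, event, data)
}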
@@ -454,6 +474,29 @@ func (h *RegistryHandler) evaluateStatus(c *gin.Context, payload models.Heartbea
return
}
// Self-reported runtime wedge: takes precedence over the error_rate
// path. The heartbeat loop runs in its own asyncio task and keeps
// POSTing successfully (200s) even after claude_agent_sdk locks up on
// `Control request timeout: initialize` — so error_rate stays at 0
// (no calls have been recorded as errors yet) while every actual
// /a2a POST hangs. The workspace tells us about that case via
// runtime_state="wedged"; we honor it directly. Sample_error from
// the heartbeat carries the human-readable reason ("SDK init
// timeout — restart workspace"), which the canvas surfaces in the
// degraded card without the operator scraping container logs.
if payload.RuntimeState == "wedged" && currentStatus == "online" {
_, err := db.DB.ExecContext(ctx,
`UPDATE workspaces SET status = 'degraded', updated_at = now() WHERE id = $1 AND status = 'online'`,
payload.WorkspaceID)
if err != nil {
log.Printf("Heartbeat: failed to mark %s degraded (wedged): %v", payload.WorkspaceID, err)
}
h.broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_DEGRADED", payload.WorkspaceID, map[string]interface{}{
"runtime_state": "wedged",
"sample_error": payload.SampleError,
})
}
if currentStatus == "online" && payload.ErrorRate >= 0.5 {
if _, err := db.DB.ExecContext(ctx, `UPDATE workspaces SET status = 'degraded', updated_at = now() WHERE id = $1`, payload.WorkspaceID); err != nil {
log.Printf("Heartbeat: failed to mark %s degraded: %v", payload.WorkspaceID, err)
@@ -464,7 +507,13 @@ func (h *RegistryHandler) evaluateStatus(c *gin.Context, payload models.Heartbea
})
}
if currentStatus == "degraded" && payload.ErrorRate < 0.1 {
// Recovery from degraded → online when BOTH the error rate has
// fallen back AND the workspace is no longer reporting a wedge.
// The wedge condition is sticky for the process lifetime
// (claude_sdk_executor only clears it on restart), so when the
// container restarts and starts heartbeating fresh — RuntimeState
// is empty, error_rate is 0 — this branch flips us back to online.
if currentStatus == "degraded" && payload.ErrorRate < 0.1 && payload.RuntimeState == "" {
if _, err := db.DB.ExecContext(ctx, `UPDATE workspaces SET status = 'online', updated_at = now() WHERE id = $1`, payload.WorkspaceID); err != nil {
log.Printf("Heartbeat: failed to recover %s to online: %v", payload.WorkspaceID, err)
}

View File

@@ -298,6 +298,163 @@ func TestHeartbeatHandler_OnlineStaysOnline(t *testing.T) {
}
}
// ==================== Heartbeat — runtime wedge (claude_agent_sdk init timeout) ====================
// TestHeartbeatHandler_RuntimeWedged_FlipsOnlineToDegraded verifies the
// runtime_state="wedged" path. The workspace's heartbeat loop runs in
// its own asyncio task and keeps reporting online while the Claude SDK
// is wedged on a Control request timeout; the workspace tells us about
// the wedge via this field, and we honor it by flipping status →
// degraded with the wedge reason in last_sample_error.
func TestHeartbeatHandler_RuntimeWedged_FlipsOnlineToDegraded(t *testing.T) {
mock := setupTestDB(t)
setupTestRedis(t)
broadcaster := newTestBroadcaster()
handler := NewRegistryHandler(broadcaster)
wedgeMsg := "claude_agent_sdk wedge: Control request timeout: initialize — restart workspace to recover"
mock.ExpectQuery("SELECT COALESCE\\(current_task").
WithArgs("ws-wedged").
WillReturnRows(sqlmock.NewRows([]string{"current_task"}).AddRow(""))
// Heartbeat UPDATE — sample_error carries the wedge reason from the
// workspace's _runtime_state_payload() helper.
mock.ExpectExec("UPDATE workspaces SET").
WithArgs("ws-wedged", 0.0, wedgeMsg, 0, 600, "").
WillReturnResult(sqlmock.NewResult(0, 1))
// evaluateStatus: currentStatus = online
mock.ExpectQuery("SELECT status FROM workspaces WHERE id =").
WithArgs("ws-wedged").
WillReturnRows(sqlmock.NewRows([]string{"status"}).AddRow("online"))
// The wedge-handling branch fires the degraded UPDATE with the
// `AND status = 'online'` guard (race-safe against concurrent
// removal). Match the SQL with the guard included.
mock.ExpectExec("UPDATE workspaces SET status = 'degraded'.*status = 'online'").
WithArgs("ws-wedged").
WillReturnResult(sqlmock.NewResult(0, 1))
// RecordAndBroadcast for WORKSPACE_DEGRADED
mock.ExpectExec("INSERT INTO structure_events").
WillReturnResult(sqlmock.NewResult(0, 1))
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
body := `{"workspace_id":"ws-wedged","error_rate":0.0,"sample_error":"` + wedgeMsg + `","active_tasks":0,"uptime_seconds":600,"runtime_state":"wedged"}`
c.Request = httptest.NewRequest("POST", "/registry/heartbeat", bytes.NewBufferString(body))
c.Request.Header.Set("Content-Type", "application/json")
handler.Heartbeat(c)
if w.Code != http.StatusOK {
t.Errorf("expected status 200, got %d: %s", w.Code, w.Body.String())
}
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("unmet sqlmock expectations: %v", err)
}
}
// TestHeartbeatHandler_DegradedRecoversOnlyAfterWedgeClears verifies that
// the degraded → online recovery path requires BOTH error_rate < 0.1
// AND runtime_state cleared. A workspace still reporting wedged stays
// degraded even when error_rate happens to be 0 (no calls have been
// recorded as errors yet — the wedge is captured as a runtime state,
// not an error count).
func TestHeartbeatHandler_DegradedRecoversOnlyAfterWedgeClears(t *testing.T) {
mock := setupTestDB(t)
setupTestRedis(t)
broadcaster := newTestBroadcaster()
handler := NewRegistryHandler(broadcaster)
mock.ExpectQuery("SELECT COALESCE\\(current_task").
WithArgs("ws-still-wedged").
WillReturnRows(sqlmock.NewRows([]string{"current_task"}).AddRow(""))
mock.ExpectExec("UPDATE workspaces SET").
WithArgs("ws-still-wedged", 0.0, "still broken", 0, 800, "").
WillReturnResult(sqlmock.NewResult(0, 1))
// currentStatus = degraded
mock.ExpectQuery("SELECT status FROM workspaces WHERE id =").
WithArgs("ws-still-wedged").
WillReturnRows(sqlmock.NewRows([]string{"status"}).AddRow("degraded"))
// No additional UPDATE expected — the recovery branch's
// `runtime_state == ""` guard blocks the flip back to online.
// (sqlmock fails the test if any unmocked Exec runs.)
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
body := `{"workspace_id":"ws-still-wedged","error_rate":0.0,"sample_error":"still broken","active_tasks":0,"uptime_seconds":800,"runtime_state":"wedged"}`
c.Request = httptest.NewRequest("POST", "/registry/heartbeat", bytes.NewBufferString(body))
c.Request.Header.Set("Content-Type", "application/json")
handler.Heartbeat(c)
if w.Code != http.StatusOK {
t.Errorf("expected status 200, got %d: %s", w.Code, w.Body.String())
}
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("unmet sqlmock expectations: %v", err)
}
}
// TestHeartbeatHandler_DegradedToOnline_AfterWedgeClears verifies the
// happy-path recovery: a workspace previously marked degraded has
// restarted, error_rate is back to 0, and runtime_state is empty
// (the new process re-imported claude_sdk_executor with the flag
// fresh). Status flips back to online and a WORKSPACE_ONLINE event
// fires.
func TestHeartbeatHandler_DegradedToOnline_AfterWedgeClears(t *testing.T) {
mock := setupTestDB(t)
setupTestRedis(t)
broadcaster := newTestBroadcaster()
handler := NewRegistryHandler(broadcaster)
mock.ExpectQuery("SELECT COALESCE\\(current_task").
WithArgs("ws-recovered").
WillReturnRows(sqlmock.NewRows([]string{"current_task"}).AddRow(""))
mock.ExpectExec("UPDATE workspaces SET").
WithArgs("ws-recovered", 0.0, "", 0, 30, "").
WillReturnResult(sqlmock.NewResult(0, 1))
mock.ExpectQuery("SELECT status FROM workspaces WHERE id =").
WithArgs("ws-recovered").
WillReturnRows(sqlmock.NewRows([]string{"status"}).AddRow("degraded"))
// Recovery UPDATE fires (degraded → online).
mock.ExpectExec("UPDATE workspaces SET status = 'online'").
WithArgs("ws-recovered").
WillReturnResult(sqlmock.NewResult(0, 1))
mock.ExpectExec("INSERT INTO structure_events").
WillReturnResult(sqlmock.NewResult(0, 1))
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
// runtime_state intentionally absent (== ""); error_rate = 0; this
// is exactly what a freshly-restarted workspace's first heartbeat
// looks like.
body := `{"workspace_id":"ws-recovered","error_rate":0.0,"sample_error":"","active_tasks":0,"uptime_seconds":30}`
c.Request = httptest.NewRequest("POST", "/registry/heartbeat", bytes.NewBufferString(body))
c.Request.Header.Set("Content-Type", "application/json")
handler.Heartbeat(c)
if w.Code != http.StatusOK {
t.Errorf("expected status 200, got %d: %s", w.Code, w.Body.String())
}
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("unmet sqlmock expectations: %v", err)
}
}
// ==================== UpdateCard ====================
func TestUpdateCard_Success(t *testing.T) {

View File

@@ -466,3 +466,70 @@ func (h *SecretsHandler) GetModel(c *gin.Context) {
c.JSON(http.StatusOK, gin.H{"model": string(decrypted), "source": "workspace_secrets"})
}
// SetModel handles PUT /workspaces/:id/model — writes the model slug
// into workspace_secrets as MODEL_PROVIDER (the key GetModel reads).
// For hermes, the value is a hermes-native slug like "minimax/MiniMax-M2.7";
// for langgraph it's the legacy "provider:model" form. Either way it's just
// an opaque string the runtime interprets on its next start.
//
// Empty string clears the override. Triggers auto-restart so the new
// env (HERMES_DEFAULT_MODEL etc.) takes effect immediately — without
// this the user clicks Save+Restart, the canvas PUT lands, but the
// already-restarting container misses the window and boots with the
// old value.
func (h *SecretsHandler) SetModel(c *gin.Context) {
workspaceID := c.Param("id")
if !uuidRegex.MatchString(workspaceID) {
c.JSON(http.StatusBadRequest, gin.H{"error": "invalid workspace ID"})
return
}
ctx := c.Request.Context()
var body struct {
Model string `json:"model"`
}
if err := c.ShouldBindJSON(&body); err != nil {
c.JSON(http.StatusBadRequest, gin.H{"error": "invalid request body"})
return
}
if body.Model == "" {
if _, err := db.DB.ExecContext(ctx,
`DELETE FROM workspace_secrets WHERE workspace_id = $1 AND key = 'MODEL_PROVIDER'`,
workspaceID); err != nil {
log.Printf("SetModel delete error: %v", err)
c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to clear model"})
return
}
if h.restartFunc != nil {
go h.restartFunc(workspaceID)
}
c.JSON(http.StatusOK, gin.H{"status": "cleared"})
return
}
encrypted, err := crypto.Encrypt([]byte(body.Model))
if err != nil {
log.Printf("SetModel encrypt error: %v", err)
c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to encrypt model"})
return
}
version := crypto.CurrentEncryptionVersion()
_, err = db.DB.ExecContext(ctx, `
INSERT INTO workspace_secrets (workspace_id, key, encrypted_value, encryption_version)
VALUES ($1, 'MODEL_PROVIDER', $2, $3)
ON CONFLICT (workspace_id, key) DO UPDATE
SET encrypted_value = $2, encryption_version = $3, updated_at = now()
`, workspaceID, encrypted, version)
if err != nil {
log.Printf("SetModel upsert error: %v", err)
c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to save model"})
return
}
if h.restartFunc != nil {
go h.restartFunc(workspaceID)
}
c.JSON(http.StatusOK, gin.H{"status": "saved", "model": body.Model})
}
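// Example call shape against the route this handler serves — host and
// token are placeholders, not values from this diff:
//
//	curl -X PUT "$MOLECULE_CP_URL/workspaces/<uuid>/model" \
//	  -H "Authorization: Bearer $TOKEN" \
//	  -H "Content-Type: application/json" \
//	  -d '{"model":"minimax/MiniMax-M2.7"}'
//	→ {"status":"saved","model":"minimax/MiniMax-M2.7"} and a restart is queued.
//
// Sending {"model":""} instead deletes the MODEL_PROVIDER row and
// returns {"status":"cleared"}.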

View File

@@ -6,6 +6,7 @@ import (
"encoding/json"
"net/http"
"net/http/httptest"
"strings"
"testing"
"time"
@@ -535,6 +536,88 @@ func TestSecretsGetModel_DBError(t *testing.T) {
}
}
// ==================== SetModel ====================
func TestSecretsSetModel_Upsert(t *testing.T) {
mock := setupTestDB(t)
setupTestRedis(t)
restartCalled := make(chan string, 1)
handler := NewSecretsHandler(func(id string) { restartCalled <- id })
mock.ExpectExec(`INSERT INTO workspace_secrets`).
WithArgs("00000000-0000-0000-0000-000000000001", sqlmock.AnyArg(), sqlmock.AnyArg()).
WillReturnResult(sqlmock.NewResult(1, 1))
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
c.Params = gin.Params{{Key: "id", Value: "00000000-0000-0000-0000-000000000001"}}
c.Request = httptest.NewRequest("PUT", "/workspaces/00000000-0000-0000-0000-000000000001/model",
strings.NewReader(`{"model":"minimax/MiniMax-M2.7"}`))
c.Request.Header.Set("Content-Type", "application/json")
handler.SetModel(c)
if w.Code != http.StatusOK {
t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String())
}
select {
case id := <-restartCalled:
if id != "00000000-0000-0000-0000-000000000001" {
t.Errorf("restart called with wrong id: %s", id)
}
case <-time.After(500 * time.Millisecond):
t.Error("restart was not triggered")
}
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("unmet sqlmock expectations: %v", err)
}
}
func TestSecretsSetModel_EmptyClears(t *testing.T) {
mock := setupTestDB(t)
setupTestRedis(t)
handler := NewSecretsHandler(func(string) {})
mock.ExpectExec(`DELETE FROM workspace_secrets`).
WithArgs("00000000-0000-0000-0000-000000000002").
WillReturnResult(sqlmock.NewResult(0, 1))
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
c.Params = gin.Params{{Key: "id", Value: "00000000-0000-0000-0000-000000000002"}}
c.Request = httptest.NewRequest("PUT", "/workspaces/00000000-0000-0000-0000-000000000002/model",
strings.NewReader(`{"model":""}`))
c.Request.Header.Set("Content-Type", "application/json")
handler.SetModel(c)
if w.Code != http.StatusOK {
t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String())
}
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("unmet sqlmock expectations: %v", err)
}
}
func TestSecretsSetModel_InvalidID(t *testing.T) {
setupTestDB(t)
setupTestRedis(t)
handler := NewSecretsHandler(nil)
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
c.Params = gin.Params{{Key: "id", Value: "not-a-uuid"}}
c.Request = httptest.NewRequest("PUT", "/workspaces/not-a-uuid/model",
strings.NewReader(`{"model":"x"}`))
c.Request.Header.Set("Content-Type", "application/json")
handler.SetModel(c)
if w.Code != http.StatusBadRequest {
t.Errorf("expected 400 for bad UUID, got %d", w.Code)
}
}
// ==================== Values — Phase 30.2 decrypted pull ====================
// These tests target the secrets.Values handler (GET /workspaces/:id/secrets/values)

Some files were not shown because too many files have changed in this diff