diff --git a/.github/workflows/auto-tag-runtime.yml b/.github/workflows/auto-tag-runtime.yml
new file mode 100644
index 00000000..2b9070bc
--- /dev/null
+++ b/.github/workflows/auto-tag-runtime.yml
@@ -0,0 +1,113 @@
+name: auto-tag-runtime
+
+# Auto-tag runtime releases on every merge to main that touches workspace/.
+# This is the entry point of the runtime CD chain:
+#
+#   merge PR → auto-tag-runtime (this) → publish-runtime → cascade → template
+#   image rebuilds → repull on hosts.
+#
+# Default bump is patch. Override via PR label `release:minor` or
+# `release:major` BEFORE merging — the label is read off the merged PR
+# associated with the push commit.
+#
+# Skips when:
+#   - The push isn't to main (other branches don't auto-release).
+#   - The merge commit message contains `[skip-release]` (escape hatch
+#     for cleanup PRs that touch workspace/ but shouldn't ship).
+
+on:
+  push:
+    branches: [main]
+    paths:
+      - "workspace/**"
+      - "scripts/build_runtime_package.py"
+      - ".github/workflows/auto-tag-runtime.yml"
+      - ".github/workflows/publish-runtime.yml"
+
+permissions:
+  contents: write    # to push the new tag
+  pull-requests: read # to read labels off the merged PR
+
+concurrency:
+  # Serialize tag bumps so two near-simultaneous merges can't both think
+  # they're 0.1.6 and race to push the same tag.
+  group: auto-tag-runtime
+  cancel-in-progress: false
+
+jobs:
+  tag:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0    # need full tag history for `git describe` / sort
+
+      - name: Skip when commit asks
+        id: skip
+        run: |
+          MSG=$(git log -1 --format=%B "${{ github.sha }}")
+          if echo "$MSG" | grep -qiE '\[skip-release\]|\[no-release\]'; then
+            echo "skip=true" >> "$GITHUB_OUTPUT"
+            echo "Commit message contains [skip-release] — no tag will be created."
+          else
+            echo "skip=false" >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: Determine bump kind from PR label
+        id: bump
+        if: steps.skip.outputs.skip != 'true'
+        env:
+          GH_TOKEN: ${{ github.token }}
+        run: |
+          # The merged PR for this push commit. `gh pr list --search` finds
+          # closed PRs whose merge commit matches; we take the first.
+          PR=$(gh pr list --state merged --search "${{ github.sha }}" --json number,labels --jq '.[0]' 2>/dev/null || echo "")
+          if [ -z "$PR" ] || [ "$PR" = "null" ]; then
+            echo "No merged PR found for ${{ github.sha }} — defaulting to patch bump."
+            echo "kind=patch" >> "$GITHUB_OUTPUT"
+            exit 0
+          fi
+          LABELS=$(echo "$PR" | jq -r '.labels[].name')
+          if echo "$LABELS" | grep -qx 'release:major'; then
+            echo "kind=major" >> "$GITHUB_OUTPUT"
+          elif echo "$LABELS" | grep -qx 'release:minor'; then
+            echo "kind=minor" >> "$GITHUB_OUTPUT"
+          else
+            echo "kind=patch" >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: Compute next version from latest runtime-v* tag
+        id: version
+        if: steps.skip.outputs.skip != 'true'
+        run: |
+          # Find the highest runtime-vX.Y.Z tag. `sort -V` handles semver
+          # ordering; `grep` filters to the right tag prefix.
+          LATEST=$(git tag --list 'runtime-v*' | sort -V | tail -1)
+          if [ -z "$LATEST" ]; then
+            # No prior tag — start the runtime line at 0.1.0.
+            CURRENT="0.0.0"
+          else
+            CURRENT="${LATEST#runtime-v}"
+          fi
+          MAJOR=$(echo "$CURRENT" | cut -d. -f1)
+          MINOR=$(echo "$CURRENT" | cut -d. -f2)
+          PATCH=$(echo "$CURRENT" | cut -d. -f3)
+          case "${{ steps.bump.outputs.kind }}" in
+            major) MAJOR=$((MAJOR+1)); MINOR=0; PATCH=0;;
+            minor) MINOR=$((MINOR+1)); PATCH=0;;
+            patch) PATCH=$((PATCH+1));;
+          esac
+          NEW="$MAJOR.$MINOR.$PATCH"
+          echo "current=$CURRENT" >> "$GITHUB_OUTPUT"
+          echo "new=$NEW" >> "$GITHUB_OUTPUT"
+          echo "Bumping runtime $CURRENT → $NEW (${{ steps.bump.outputs.kind }})"
+
+      - name: Push new tag
+        if: steps.skip.outputs.skip != 'true'
+        run: |
+          NEW_TAG="runtime-v${{ steps.version.outputs.new }}"
+          git config user.name "github-actions[bot]"
+          git config user.email "41898282+github-actions[bot]@users.noreply.github.com"
+          git tag -a "$NEW_TAG" -m "runtime $NEW_TAG (auto-bump from ${{ steps.bump.outputs.kind }})"
+          git push origin "$NEW_TAG"
+          echo "Pushed $NEW_TAG — publish-runtime workflow will fire on the tag."
diff --git a/.github/workflows/publish-runtime.yml b/.github/workflows/publish-runtime.yml
new file mode 100644
index 00000000..61054f8a
--- /dev/null
+++ b/.github/workflows/publish-runtime.yml
@@ -0,0 +1,161 @@
+name: publish-runtime
+
+# Publishes molecule-ai-workspace-runtime to PyPI from monorepo workspace/.
+# Monorepo workspace/ is the only source-of-truth for runtime code; this
+# workflow is the bridge from monorepo edits to the PyPI artifact that
+# the 8 workspace-template-* repos depend on.
+#
+# Triggered by:
+#   - Pushing a tag matching `runtime-vX.Y.Z` (the version is derived from
+#     the tag — `runtime-v0.1.6` publishes `0.1.6`).
+#   - Manual workflow_dispatch with an explicit `version` input (useful for
+#     dev/test releases without tagging the repo).
+#
+# The workflow:
+#   1. Runs scripts/build_runtime_package.py to copy workspace/ →
+#      build/molecule_runtime/ with imports rewritten (`a2a_client` →
+#      `molecule_runtime.a2a_client`).
+#   2. Builds wheel + sdist with `python -m build`.
+#   3. Publishes to PyPI via twine + repo secret PYPI_TOKEN.
+#
+# After publish: the 8 template repos pick up the new version on their
+# next image rebuild (their requirements.txt pin
+# `molecule-ai-workspace-runtime>=0.1.0`, so any new release is eligible).
+# To force-pull immediately, bump the pin in each template repo's
+# requirements.txt and merge — that triggers their own publish-image.yml.
+
+on:
+  push:
+    tags:
+      - "runtime-v*"
+  workflow_dispatch:
+    inputs:
+      version:
+        description: "Version to publish (e.g. 0.1.6). Required for manual dispatch."
+        required: true
+        type: string
+
+permissions:
+  contents: read
+
+jobs:
+  publish:
+    runs-on: ubuntu-latest
+    environment: pypi-publish
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+          cache: pip
+
+      - name: Derive version from tag or input
+        id: version
+        run: |
+          if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
+            VERSION="${{ inputs.version }}"
+          else
+            # Tag is `runtime-vX.Y.Z` — strip the prefix.
+            VERSION="${GITHUB_REF_NAME#runtime-v}"
+          fi
+          if ! echo "$VERSION" | grep -qE '^[0-9]+\.[0-9]+\.[0-9]+(\.dev[0-9]+|rc[0-9]+|a[0-9]+|b[0-9]+|\.post[0-9]+)?$'; then
+            echo "::error::version $VERSION does not match PEP 440"
+            exit 1
+          fi
+          echo "version=$VERSION" >> "$GITHUB_OUTPUT"
+          echo "Publishing molecule-ai-workspace-runtime $VERSION"
+
+      - name: Install build tooling
+        run: pip install build twine
+
+      - name: Build package from workspace/
+        run: |
+          python scripts/build_runtime_package.py \
+            --version "${{ steps.version.outputs.version }}" \
+            --out "${{ runner.temp }}/runtime-build"
+
+      - name: Build wheel + sdist
+        working-directory: ${{ runner.temp }}/runtime-build
+        run: python -m build
+
+      - name: Verify package contents (sanity)
+        working-directory: ${{ runner.temp }}/runtime-build
+        run: |
+          python -m twine check dist/*
+          # Smoke-import the built wheel to catch import-rewrite mistakes
+          # before they hit PyPI. The package depends on a2a-sdk + httpx
+          # via pyproject; install those so the smoke import resolves.
+          python -m venv /tmp/smoke
+          /tmp/smoke/bin/pip install --quiet dist/*.whl
+          WORKSPACE_ID=00000000-0000-0000-0000-000000000000 \
+          PLATFORM_URL=http://localhost:8080 \
+            /tmp/smoke/bin/python -c "
+          from molecule_runtime import a2a_client, a2a_tools
+          from molecule_runtime.builtin_tools import memory
+          from molecule_runtime.adapters import get_adapter, BaseAdapter, AdapterConfig
+          assert a2a_client._A2A_QUEUED_PREFIX, 'queued prefix missing — chat-leak fix not in build'
+          print('✓ smoke import passed')
+          "
+
+      - name: Publish to PyPI
+        working-directory: ${{ runner.temp }}/runtime-build
+        env:
+          TWINE_USERNAME: __token__
+          TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
+        run: python -m twine upload dist/*
+
+  cascade:
+    # After PyPI accepts the upload, fan out a repository_dispatch to each
+    # template repo so they rebuild their image against the new runtime.
+    # Each template's `runtime-published.yml` receiver picks up the event,
+    # pulls the new PyPI version (their requirements.txt pin is `>=`), and
+    # republishes ghcr.io/molecule-ai/workspace-template-<runtime>:latest.
+    #
+    # Soft-fail per repo: if one template's dispatch fails (perms missing,
+    # repo archived, etc.) we still try the others and surface the failures
+    # in the workflow summary instead of aborting the whole cascade.
+    needs: publish
+    runs-on: ubuntu-latest
+    steps:
+      - name: Fan out repository_dispatch
+        env:
+          # Fine-grained PAT with `actions:write` on the 8 template repos.
+          # GITHUB_TOKEN can't fire dispatches across repos — needs an explicit
+          # token. Stored as a repo secret; rotate per the standard schedule.
+          DISPATCH_TOKEN: ${{ secrets.TEMPLATE_DISPATCH_TOKEN }}
+          RUNTIME_VERSION: ${{ needs.publish.outputs.version || steps.version.outputs.version }}
+        run: |
+          set +e   # don't abort on a single repo failure — collect them all
+          if [ -z "$DISPATCH_TOKEN" ]; then
+            echo "::warning::TEMPLATE_DISPATCH_TOKEN secret not set — skipping cascade. PyPI was published; templates will pick up the new version on their own next rebuild."
+            exit 0
+          fi
+          # Re-derive version from the tag here too (in case publish job
+          # didn't expose an output the previous step's reference reads).
+          VERSION="${GITHUB_REF_NAME#runtime-v}"
+          if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
+            VERSION="${{ inputs.version }}"
+          fi
+          TEMPLATES="claude-code langgraph crewai autogen deepagents hermes gemini-cli openclaw"
+          FAILED=""
+          for tpl in $TEMPLATES; do
+            REPO="Molecule-AI/molecule-ai-workspace-template-$tpl"
+            STATUS=$(curl -sS -o /tmp/dispatch.out -w "%{http_code}" \
+              -X POST "https://api.github.com/repos/$REPO/dispatches" \
+              -H "Authorization: Bearer $DISPATCH_TOKEN" \
+              -H "Accept: application/vnd.github+json" \
+              -H "X-GitHub-Api-Version: 2022-11-28" \
+              -d "{\"event_type\":\"runtime-published\",\"client_payload\":{\"runtime_version\":\"$VERSION\"}}")
+            if [ "$STATUS" = "204" ]; then
+              echo "✓ dispatched $tpl ($VERSION)"
+            else
+              echo "::warning::✗ failed to dispatch $tpl: HTTP $STATUS — $(cat /tmp/dispatch.out)"
+              FAILED="$FAILED $tpl"
+            fi
+          done
+          if [ -n "$FAILED" ]; then
+            echo "::warning::Cascade incomplete. Failed templates:$FAILED"
+            # Don't fail the whole job — PyPI publish already succeeded;
+            # operators can retry the failed templates manually.
+          fi
diff --git a/.github/workflows/sweep-cf-orphans.yml b/.github/workflows/sweep-cf-orphans.yml
index 0e825256..7fb35328 100644
--- a/.github/workflows/sweep-cf-orphans.yml
+++ b/.github/workflows/sweep-cf-orphans.yml
@@ -40,10 +40,14 @@ on:
         description: "Override safety gate (default 50, set higher only for major cleanup)"
         required: false
         default: "50"
-  # Required-check support: scheduled-only today, but include merge_group
-  # so a future branch-protection wire-in doesn't need a workflow edit.
-  merge_group:
-    types: [checks_requested]
+  # No `merge_group:` trigger on purpose. This is a janitor — it doesn't
+  # need to gate merges, and including it as written before #2088 fired
+  # the full sweep job (or its secret-check) on every PR going through
+  # the merge queue, generating one red CI run per merge-queue eval. If
+  # this workflow is ever wired up as a required check, re-add
+  #   merge_group: { types: [checks_requested] }
+  # AND gate the sweep step with `if: github.event_name != 'merge_group'`
+  # so merge-queue evals report success without actually running.
 
 # Don't let two sweeps race the same zone. workflow_dispatch during a
 # scheduled run would otherwise issue duplicate DELETE calls.
@@ -77,9 +81,12 @@ jobs:
       - uses: actions/checkout@v4
 
       - name: Verify required secrets present
-        # Fail fast and loud if a secret is unset — sweep-cf-orphans.sh
-        # also checks via `need`, but we want a single distinct error
-        # in the workflow log instead of script-level multi-line noise.
+        id: verify
+        # Soft skip when secrets aren't configured. The 6 secrets have
+        # to be set on the repo manually before this workflow can do
+        # real work; until they are, the schedule is a no-op rather
+        # than a recurring red CI run. workflow_dispatch surfaces a
+        # warning so an operator running it ad-hoc sees the gap.
         run: |
           missing=()
           for var in CF_API_TOKEN CF_ZONE_ID CP_PROD_ADMIN_TOKEN CP_STAGING_ADMIN_TOKEN AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY; do
@@ -88,12 +95,15 @@ jobs:
             fi
           done
           if [ ${#missing[@]} -gt 0 ]; then
-            echo "::error::missing required secret(s): ${missing[*]}"
-            exit 2
+            echo "::warning::skipping sweep — secrets not yet configured: ${missing[*]}"
+            echo "skip=true" >> "$GITHUB_OUTPUT"
+            exit 0
           fi
           echo "All required secrets present ✓"
+          echo "skip=false" >> "$GITHUB_OUTPUT"
 
       - name: Run sweep
+        if: steps.verify.outputs.skip != 'true'
         # Schedule-vs-dispatch dry-run asymmetry (intentional):
         #   - Scheduled runs: github.event.inputs.dry_run is empty →
         #     defaults to "false" below → script runs with --execute
diff --git a/canvas/e2e/staging-setup.ts b/canvas/e2e/staging-setup.ts
index 963f9ccb..5fc39225 100644
--- a/canvas/e2e/staging-setup.ts
+++ b/canvas/e2e/staging-setup.ts
@@ -46,7 +46,17 @@ const TENANT_DOMAIN = process.env.STAGING_TENANT_DOMAIN || "staging.moleculesai.
 // were blocking staging→main syncs on 2026-04-24.
 const PROVISION_TIMEOUT_MS = 20 * 60 * 1000;
 const WORKSPACE_ONLINE_TIMEOUT_MS = 20 * 60 * 1000;
-const TLS_TIMEOUT_MS = 3 * 60 * 1000;
+
+// TLS readiness depends on (1) Cloudflare DNS propagation through the
+// edge, (2) the tenant's CF Tunnel registering the new hostname, (3)
+// CF's edge ACME cert provisioning + cache. Each of these layers can
+// add 1-3 min on its own under heavy staging load. Bumped 10→15 min
+// after a burst of canary failures correlated with CP changes (#2090).
+// Stays below the 20-min PROVISION_TIMEOUT envelope so a genuinely-
+// stuck tenant fails-loud at the provision step rather than
+// masquerading as a TLS issue. Kept aligned with
+// tests/e2e/test_staging_full_saas.sh.
+const TLS_TIMEOUT_MS = 15 * 60 * 1000;
 
 async function jsonFetch(
   url: string,
diff --git a/canvas/next.config.ts b/canvas/next.config.ts
index 68a6c64d..079e21c2 100644
--- a/canvas/next.config.ts
+++ b/canvas/next.config.ts
@@ -1,7 +1,100 @@
 import type { NextConfig } from "next";
+import { existsSync, readFileSync } from "node:fs";
+import { dirname, join } from "node:path";
+
+// Load NEXT_PUBLIC_* vars from the monorepo root .env so a fresh
+// `pnpm dev` works without a per-developer canvas/.env.local. Next.js
+// only auto-loads .env from the project root by default — but our
+// canonical config (NEXT_PUBLIC_PLATFORM_URL, NEXT_PUBLIC_WS_URL,
+// MOLECULE_ENV, etc.) lives at the monorepo root, gitignored, shared
+// by the Go platform binary. Without this, the canvas falls back to
+// `window.location` (`ws://localhost:3000/ws`) and the WS pill stays
+// "Reconnecting" forever because Next.js dev doesn't serve /ws.
+//
+// Mirrors workspace-server/cmd/server/dotenv.go's monorepo-rooted .env
+// loader. Both processes look for the SAME marker (`workspace-server/
+// go.mod`) so a developer renaming or relocating the repo only has to
+// update one heuristic. Production is unaffected: `output: "standalone"`
+// bakes resolved env into the build, and the marker file isn't shipped.
+loadMonorepoEnv();
 
 const nextConfig: NextConfig = {
   output: "standalone",
 };
 
 export default nextConfig;
+
+function loadMonorepoEnv() {
+  const root = findMonorepoRoot(__dirname);
+  if (!root) return;
+  const envPath = join(root, ".env");
+  if (!existsSync(envPath)) return;
+  const body = readFileSync(envPath, "utf8");
+  let loaded = 0;
+  let skipped = 0;
+  for (const line of body.split(/\r?\n/)) {
+    const kv = parseLine(line);
+    if (!kv) continue;
+    const [k, v] = kv;
+    // Existing env wins. NOTE: an explicitly-set empty string
+    // (`KEY=` exported from a parent shell, where Node represents it
+    // as `""` not `undefined`) counts as "set" — we keep the empty
+    // value rather than backfilling from the file. Matches Go's
+    // os.LookupEnv check in workspace-server/cmd/server/dotenv.go so
+    // both processes treat the same input identically. Operators who
+    // want the file value to win must `unset KEY` in the launching
+    // shell.
+    if (process.env[k] !== undefined) {
+      skipped++;
+      continue;
+    }
+    process.env[k] = v;
+    loaded++;
+  }
+  // eslint-disable-next-line no-console
+  console.log(
+    `[next.config] loaded ${loaded} vars from ${envPath} (${skipped} already set in env)`,
+  );
+}
+
+function findMonorepoRoot(start: string): string | null {
+  let dir = start;
+  for (let i = 0; i < 6; i++) {
+    if (existsSync(join(dir, "workspace-server", "go.mod"))) return dir;
+    const parent = dirname(dir);
+    if (parent === dir) break;
+    dir = parent;
+  }
+  return null;
+}
+
+// Mirror of workspace-server/cmd/server/dotenv.go's parseDotEnvLine
+// — same rules so the two loaders agree on every line in the shared
+// .env. If you change one parser, change the other.
+function parseLine(raw: string): [string, string] | null {
+  let line = raw.replace(/^﻿/, "").trim();
+  if (line === "" || line.startsWith("#")) return null;
+  // `export ` prefix uses a literal space — `export\tFOO=bar` with a
+  // tab is intentionally rejected, matching the Go mirror in
+  // workspace-server/cmd/server/dotenv.go. Shells emit the prefix
+  // with a space; tabs would only appear in hand-mangled files.
+  if (line.startsWith("export ")) line = line.slice("export ".length).trimStart();
+  const eq = line.indexOf("=");
+  if (eq <= 0) return null;
+  const k = line.slice(0, eq).trim();
+  let v = line.slice(eq + 1).replace(/^[ \t]+/, "");
+  if (v.length >= 2 && (v[0] === '"' || v[0] === "'")) {
+    const quote = v[0];
+    const end = v.indexOf(quote, 1);
+    if (end >= 0) return [k, v.slice(1, end)];
+    // unterminated — fall through to bare-value handling
+  }
+  for (let i = 0; i < v.length; i++) {
+    if (v[i] !== "#") continue;
+    if (i === 0 || v[i - 1] === " " || v[i - 1] === "\t") {
+      v = v.slice(0, i);
+      break;
+    }
+  }
+  return [k, v.trim()];
+}
diff --git a/canvas/src/app/globals.css b/canvas/src/app/globals.css
index a88ce30a..ee39b125 100644
--- a/canvas/src/app/globals.css
+++ b/canvas/src/app/globals.css
@@ -1,5 +1,9 @@
 @import "xterm/css/xterm.css";
+/* Theme tokens MUST load before any feature stylesheet that
+   references them so custom properties are in scope. */
+@import "../styles/theme-tokens.css";
 @import "../styles/settings-panel.css";
+@import "../styles/org-deploy.css";
 
 @tailwind base;
 @tailwind components;
@@ -38,7 +42,20 @@ body {
 }
 
 .react-flow__node {
-  transition: box-shadow 0.2s ease;
+  /* Transform transition drives the "spawn from parent" motion —
+     org-deploy sets the node's initial position to the parent's
+     absolute coords, then repositions to the real slot, and this
+     transition interpolates the translate() in between.
+     Non-deploy workspace moves (drag, nest) get the same smoothing
+     for free. */
+  transition:
+    box-shadow var(--mol-duration-fast) ease,
+    transform var(--mol-duration-spawn) var(--mol-easing-bounce-out);
+}
+/* Drag events must feel instant — React Flow adds this class
+   for the lifetime of the gesture. */
+.react-flow__node.dragging {
+  transition: box-shadow var(--mol-duration-fast) ease;
 }
 
 /* Scrollbar styling */
diff --git a/canvas/src/app/page.tsx b/canvas/src/app/page.tsx
index 8b79ef83..666923eb 100644
--- a/canvas/src/app/page.tsx
+++ b/canvas/src/app/page.tsx
@@ -7,13 +7,19 @@ import { CommunicationOverlay } from "@/components/CommunicationOverlay";
 import { Spinner } from "@/components/Spinner";
 import { connectSocket, disconnectSocket } from "@/store/socket";
 import { useCanvasStore } from "@/store/canvas";
-import { api } from "@/lib/api";
+import { api, PlatformUnavailableError } from "@/lib/api";
 import type { WorkspaceData } from "@/store/socket";
 
 export default function Home() {
   const hydrationError = useCanvasStore((s) => s.hydrationError);
   const setHydrationError = useCanvasStore((s) => s.setHydrationError);
   const [hydrating, setHydrating] = useState(true);
+  // Distinct from hydrationError: platform-down is its own UX path
+  // (different copy, different action — the user's next step is to
+  // check local services, not to retry the API call). Tracked
+  // separately rather than encoded into hydrationError so the
+  // generic-error branch can stay simple.
+  const [platformDown, setPlatformDown] = useState(false);
 
   useEffect(() => {
     connectSocket();
@@ -28,8 +34,11 @@ export default function Home() {
         useCanvasStore.getState().setViewport(viewport);
       }
     }).catch((err) => {
-      // Initial hydration failed — show error banner to user
       console.error("Canvas: initial hydration failed", err);
+      if (err instanceof PlatformUnavailableError) {
+        setPlatformDown(true);
+        return;
+      }
       useCanvasStore.getState().setHydrationError(
         err instanceof Error && err.message ? err.message : "Failed to load canvas"
       );
@@ -53,6 +62,10 @@ export default function Home() {
     );
   }
 
+  if (platformDown) {
+    return <PlatformDownDiagnostic />;
+  }
+
   return (
     <>
       <Canvas />
@@ -83,3 +96,43 @@ export default function Home() {
     </>
   );
 }
+
+/**
+ * Dedicated diagnostic for the case where the platform reported its
+ * datastore (Postgres / Redis) is unreachable. Distinct from the
+ * generic API-error overlay: the user's next action is to check
+ * local services, not to retry the API call. Includes the exact
+ * commands for the common dev-host setup.
+ */
+function PlatformDownDiagnostic() {
+  return (
+    <div
+      role="alert"
+      className="fixed inset-0 flex flex-col items-center justify-center bg-zinc-950 text-zinc-300 gap-5 z-[9999] px-6"
+    >
+      <div className="text-amber-400 text-sm font-semibold uppercase tracking-wider">
+        Platform infrastructure unreachable
+      </div>
+      <p className="text-zinc-400 text-sm max-w-lg text-center leading-relaxed">
+        The platform server returned <code className="font-mono text-amber-300">503 platform_unavailable</code>.
+        That means it can&apos;t reach Postgres or Redis to validate your session.
+        Most common cause on a dev host: one of those services stopped.
+      </p>
+      <div className="bg-zinc-900/80 border border-zinc-700/50 rounded-lg px-4 py-3 max-w-lg w-full">
+        <div className="text-[10px] uppercase tracking-wider text-zinc-500 mb-2">Try first</div>
+        <pre className="text-[12px] text-zinc-300 font-mono whitespace-pre-wrap leading-relaxed">{`brew services start postgresql@14
+brew services start redis`}</pre>
+      </div>
+      <p className="text-[11px] text-zinc-500 max-w-lg text-center">
+        If both are running, check <code className="font-mono">/tmp/molecule-server.log</code> for
+        the underlying error. If you&apos;re on hosted SaaS, this is a platform incident — try again in a moment.
+      </p>
+      <button
+        onClick={() => window.location.reload()}
+        className="px-4 py-2 bg-blue-600 hover:bg-blue-500 text-white rounded-md text-sm mt-2"
+      >
+        Reload
+      </button>
+    </div>
+  );
+}
diff --git a/canvas/src/components/A2ATopologyOverlay.tsx b/canvas/src/components/A2ATopologyOverlay.tsx
index 4a35e638..efd4f0ff 100644
--- a/canvas/src/components/A2ATopologyOverlay.tsx
+++ b/canvas/src/components/A2ATopologyOverlay.tsx
@@ -74,7 +74,11 @@ export function buildA2AEdges(
     });
   }
 
-  // 3. Build React Flow Edge objects
+  // 3. Build React Flow Edge objects. We tag every overlay edge with
+  //    type: "a2a" so React Flow renders it via our custom A2AEdge
+  //    component (canvas/A2AEdge.tsx). The custom component portals
+  //    its label out of the SVG layer so it (a) doesn't get hidden
+  //    behind workspace cards and (b) is clickable.
   return Array.from(map.values()).map(({ source, target, count, lastAt }) => {
     const isHot = now - lastAt < A2A_HOT_MS;
     const stroke = isHot ? "#8b5cf6" : "#3b82f6"; // violet-500 : blue-500
@@ -84,6 +88,7 @@ export function buildA2AEdges(
 
     return {
       id: `a2a-${source}-${target}`,
+      type: "a2a",
       source,
       target,
       animated: isHot,
@@ -96,22 +101,22 @@ export function buildA2AEdges(
       style: {
         stroke,
         strokeWidth: 2,
-        // Non-blocking: label overlay never intercepts pointer events
+        // Path itself stays non-interactive so node drags through
+        // the line still work. The clickable target is the label
+        // pill, which sets pointerEvents: all on its own div.
         pointerEvents: "none" as React.CSSProperties["pointerEvents"],
       },
+      // `label` keeps the same string for back-compat with any test
+      // that asserts on it (e.g. buildA2AEdges output shape). Custom
+      // edge reads the rich data from `data` so the label visual is
+      // not constrained to a string anymore.
       label,
-      labelStyle: {
-        fill: "#a1a1aa",   // zinc-400
-        fontSize: 10,
-        pointerEvents: "none" as React.CSSProperties["pointerEvents"],
+      data: {
+        count,
+        lastAt,
+        isHot,
+        label,
       },
-      labelBgStyle: {
-        fill: "#18181b",   // zinc-900
-        fillOpacity: 0.9,
-        pointerEvents: "none" as React.CSSProperties["pointerEvents"],
-      },
-      labelBgPadding: [4, 6] as [number, number],
-      labelBgBorderRadius: 4,
     };
   });
 }
diff --git a/canvas/src/components/Canvas.tsx b/canvas/src/components/Canvas.tsx
index 16c299cb..0d793df5 100644
--- a/canvas/src/components/Canvas.tsx
+++ b/canvas/src/components/Canvas.tsx
@@ -36,11 +36,22 @@ import { DropTargetBadge } from "./canvas/DropTargetBadge";
 import { useDragHandlers } from "./canvas/useDragHandlers";
 import { useKeyboardShortcuts } from "./canvas/useKeyboardShortcuts";
 import { useCanvasViewport } from "./canvas/useCanvasViewport";
+import { A2AEdge } from "./canvas/A2AEdge";
 
 const nodeTypes = {
   workspaceNode: WorkspaceNode,
 };
 
+// Custom edge types. The default React Flow edge renders its label
+// inside the SVG group (always under nodes) with pointerEvents: none
+// inherited from the path. A2AEdge portals the label to a sibling
+// DOM layer so it renders above nodes and accepts clicks. Keep the
+// reference stable (module-scope const) so React Flow doesn't see a
+// new edgeTypes object on every render and warn about prop churn.
+const edgeTypes = {
+  a2a: A2AEdge,
+};
+
 const defaultEdgeOptions: Partial<Edge> = {
   animated: true,
   style: {
@@ -58,14 +69,95 @@ export function Canvas() {
 }
 
 function CanvasInner() {
-  const nodes = useCanvasStore((s) => s.nodes);
+  const rawNodes = useCanvasStore((s) => s.nodes);
   const edges = useCanvasStore((s) => s.edges);
   const a2aEdges = useCanvasStore((s) => s.a2aEdges);
   const showA2AEdges = useCanvasStore((s) => s.showA2AEdges);
+  const deletingIds = useCanvasStore((s) => s.deletingIds);
   const allEdges = useMemo(
     () => (showA2AEdges ? [...edges, ...a2aEdges] : edges),
     [edges, a2aEdges, showA2AEdges],
   );
+  // Drag-lock during a system-owned operation (deploy OR delete).
+  // React Flow respects Node.draggable, which stops the gesture
+  // before it starts — preventDefault() on the drag-start callback
+  // isn't authoritative in v12. We project `draggable: false` onto
+  // each locked node before handing the array to ReactFlow; the
+  // drag-start handler in useDragHandlers remains as a belt-and-
+  // braces check.
+  //
+  // Perf: short-circuit when nothing is provisioning so the memo
+  // passes rawNodes through unchanged (identity-stable → RF
+  // reconciles nothing). When a deploy IS active, build an O(n)
+  // root index once and re-use it. Critically, do NOT spread every
+  // node — only mutate the locked ones — so unmodified nodes keep
+  // their object identity and RF's per-node memo short-circuits.
+  const nodes = useMemo(() => {
+    const anyProvisioning = rawNodes.some((n) => n.data.status === "provisioning");
+    const anyDeleting = deletingIds.size > 0;
+    if (!anyProvisioning && !anyDeleting) return rawNodes;
+
+    const byId = new Map<string, typeof rawNodes[number]>();
+    for (const n of rawNodes) byId.set(n.id, n);
+    const rootOf = new Map<string, string>();
+    const resolveRoot = (id: string): string => {
+      // Iterative walk guards against a pathological cycle (hostile
+      // data) — recursion would hit the stack limit on a deep tree.
+      const visited = new Set<string>();
+      let cursor: string | null = id;
+      while (cursor) {
+        if (visited.has(cursor)) break;
+        visited.add(cursor);
+        const cached = rootOf.get(cursor);
+        if (cached) {
+          for (const seenId of visited) rootOf.set(seenId, cached);
+          return cached;
+        }
+        const n = byId.get(cursor);
+        if (!n) break;
+        if (!n.data.parentId) {
+          for (const seenId of visited) rootOf.set(seenId, cursor);
+          return cursor;
+        }
+        cursor = n.data.parentId;
+      }
+      return id;
+    };
+
+    const provisioningByRoot = new Map<string, number>();
+    for (const n of rawNodes) {
+      if (n.data.status !== "provisioning") continue;
+      const rootId = resolveRoot(n.id);
+      provisioningByRoot.set(rootId, (provisioningByRoot.get(rootId) ?? 0) + 1);
+    }
+
+    let touched = false;
+    const next = rawNodes.map((n) => {
+      const rootId = resolveRoot(n.id);
+      const deployLocked = n.id !== rootId && (provisioningByRoot.get(rootId) ?? 0) > 0;
+      // Delete-locked: nothing in a subtree whose DELETE is in
+      // flight should be draggable, INCLUDING the root of that
+      // subtree (unlike deploy, there's no cancel — the delete
+      // is irrevocable at this point).
+      const deleteLocked = deletingIds.has(n.id);
+      const shouldLock = deployLocked || deleteLocked;
+      if (shouldLock && n.draggable !== false) {
+        touched = true;
+        return { ...n, draggable: false };
+      }
+      if (!shouldLock && n.draggable === false) {
+        // Node was locked in a prior render; deploy cancelled /
+        // completed, or delete failed and was reverted. Restore
+        // default dragability.
+        touched = true;
+        const { draggable: _d, ...rest } = n;
+        void _d;
+        return rest as typeof n;
+      }
+      return n; // identity-preserved
+    });
+    return touched ? next : rawNodes;
+  }, [rawNodes, deletingIds]);
   const onNodesChange = useCanvasStore((s) => s.onNodesChange);
   const selectNode = useCanvasStore((s) => s.selectNode);
   const selectedNodeId = useCanvasStore((s) => s.selectedNodeId);
@@ -91,18 +183,45 @@ function CanvasInner() {
   // outside-click handler.
   const pendingDelete = useCanvasStore((s) => s.pendingDelete);
   const setPendingDelete = useCanvasStore((s) => s.setPendingDelete);
-  const removeNode = useCanvasStore((s) => s.removeNode);
+  const removeSubtree = useCanvasStore((s) => s.removeSubtree);
   const confirmDelete = useCallback(async () => {
     if (!pendingDelete) return;
     const { id } = pendingDelete;
     setPendingDelete(null);
+    // Compute the full subtree and mark it as "deleting" so every
+    // node in the chain renders dim + non-draggable during the
+    // network round-trip + the server-side cascade. Matches the
+    // deploy-lock UX: once a system-initiated operation owns this
+    // subtree, the user shouldn't be able to move its pieces
+    // around until it resolves.
+    const state = useCanvasStore.getState();
+    const subtree = new Set<string>();
+    const stack = [id];
+    while (stack.length) {
+      const nid = stack.pop()!;
+      subtree.add(nid);
+      for (const n of state.nodes) {
+        if (n.data.parentId === nid) stack.push(n.id);
+      }
+    }
+    state.beginDelete(subtree);
     try {
       await api.del(`/workspaces/${id}?confirm=true`);
-      removeNode(id);
+      // Mirror the server-side cascade locally — drop the parent AND
+      // every descendant in one atomic update. The per-descendant
+      // WORKSPACE_REMOVED WS events still arrive (and are no-ops
+      // because the nodes are already gone), but we no longer depend
+      // on them: a wedged WS used to leave orphan child cards on the
+      // canvas until the user refreshed the page.
+      removeSubtree(id);
+      state.endDelete(subtree);
     } catch (e) {
+      // Network or server error — restore the subtree to normal
+      // interaction and surface the error.
+      state.endDelete(subtree);
       showToast(e instanceof Error ? e.message : "Delete failed", "error");
     }
-  }, [pendingDelete, setPendingDelete, removeNode]);
+  }, [pendingDelete, setPendingDelete, removeSubtree]);
 
   const onPaneClick = useCallback(() => {
     selectNode(null);
@@ -141,6 +260,7 @@ function CanvasInner() {
           onPaneClick={onPaneClick}
           onMoveEnd={onMoveEnd}
           nodeTypes={nodeTypes}
+          edgeTypes={edgeTypes}
           defaultEdgeOptions={defaultEdgeOptions}
           defaultViewport={defaultViewport}
           fitView={viewport.x === 0 && viewport.y === 0 && viewport.zoom === 1}
diff --git a/canvas/src/components/EmptyState.tsx b/canvas/src/components/EmptyState.tsx
index bca64869..43e66665 100644
--- a/canvas/src/components/EmptyState.tsx
+++ b/canvas/src/components/EmptyState.tsx
@@ -1,27 +1,19 @@
 "use client";
 
-import { useState, useEffect } from "react";
+import { useState, useEffect, useCallback } from "react";
 import { api } from "@/lib/api";
 import { useCanvasStore } from "@/store/canvas";
 import { OrgTemplatesSection } from "./TemplatePalette";
+import { type Template } from "@/lib/deploy-preflight";
+import { useTemplateDeploy } from "@/hooks/useTemplateDeploy";
 import { Spinner } from "./Spinner";
 import { TIER_CONFIG } from "@/lib/design-tokens";
 
-interface Template {
-  id: string;
-  name: string;
-  description: string;
-  tier: number;
-  model: string;
-  skills: string[];
-  skill_count: number;
-}
-
 export function EmptyState() {
   const [templates, setTemplates] = useState<Template[]>([]);
   const [loading, setLoading] = useState(true);
-  const [deploying, setDeploying] = useState<string | null>(null);
-  const [error, setError] = useState<string | null>(null);
+  const [blankCreating, setBlankCreating] = useState(false);
+  const [blankError, setBlankError] = useState<string | null>(null);
 
   useEffect(() => {
     api
@@ -31,48 +23,56 @@ export function EmptyState() {
       .finally(() => setLoading(false));
   }, []);
 
-  const deploy = async (template: Template) => {
-    setDeploying(template.id);
-    setError(null);
-    try {
-      const ws = await api.post<{ id: string }>("/workspaces", {
-        name: template.name,
-        template: template.id,
-        tier: template.tier,
-        canvas: { x: 200, y: 150 },
-      });
-      // Auto-select the new workspace and open chat
-      setTimeout(() => {
-        useCanvasStore.getState().selectNode(ws.id);
-        useCanvasStore.getState().setPanelTab("chat");
-      }, 500);
-    } catch (e) {
-      setError(e instanceof Error ? e.message : "Deploy failed");
-    } finally {
-      setDeploying(null);
-    }
-  };
+  // Canvas fills in a visible "center-ish" spot on a fresh tenant so
+  // the user doesn't have to pan to find their new workspace. Fixed
+  // (200, 150) instead of the sidebar's random placement because the
+  // canvas is guaranteed empty when this component mounts.
+  const firstDeployCoords = useCallback(() => ({ x: 200, y: 150 }), []);
 
+  // After the POST succeeds, auto-select the new workspace and flip
+  // the panel to Chat. This is a UX flourish that only makes sense
+  // on first deploy (the canvas is empty so the selection can't
+  // surprise anyone); the sidebar intentionally skips this step.
+  // 500 ms delay so React Flow has a frame to render the new node
+  // before it receives focus.
+  const handleDeployed = useCallback((workspaceId: string) => {
+    setTimeout(() => {
+      useCanvasStore.getState().selectNode(workspaceId);
+      useCanvasStore.getState().setPanelTab("chat");
+    }, 500);
+  }, []);
+
+  const { deploy, deploying, error, modal } = useTemplateDeploy({
+    canvasCoords: firstDeployCoords,
+    onDeployed: handleDeployed,
+  });
+
+  // "Create blank" bypasses templates entirely — no preflight, no
+  // modal, just POST /workspaces with a default name and tier.
+  // Deliberately NOT routed through useTemplateDeploy because it
+  // has no `template.id` to deploy against.
   const createBlank = async () => {
-    setDeploying("blank");
-    setError(null);
+    setBlankCreating(true);
+    setBlankError(null);
     try {
       const ws = await api.post<{ id: string }>("/workspaces", {
         name: "My First Agent",
         tier: 2,
-        canvas: { x: 200, y: 150 },
+        canvas: firstDeployCoords(),
       });
-      setTimeout(() => {
-        useCanvasStore.getState().selectNode(ws.id);
-        useCanvasStore.getState().setPanelTab("chat");
-      }, 500);
+      handleDeployed(ws.id);
     } catch (e) {
-      setError(e instanceof Error ? e.message : "Create failed");
+      setBlankError(e instanceof Error ? e.message : "Create failed");
     } finally {
-      setDeploying(null);
+      setBlankCreating(false);
     }
   };
 
+  // Any active gesture locks every button so the user can't fire a
+  // second POST while the first is still in flight.
+  const anyDeploying = !!deploying || blankCreating;
+  const displayError = error ?? blankError;
+
   return (
     <div className="absolute inset-0 flex items-start justify-center pointer-events-none z-[1] overflow-y-auto py-8">
       <div className="relative max-w-2xl w-full rounded-3xl border border-zinc-800/70 bg-zinc-950/80 backdrop-blur-xl px-8 py-8 text-center shadow-2xl shadow-black/40 pointer-events-auto mx-4">
@@ -112,8 +112,8 @@ export function EmptyState() {
                 <button
                   type="button"
                   key={t.id}
-                  onClick={() => deploy(t)}
-                  disabled={!!deploying}
+                  onClick={() => void deploy(t)}
+                  disabled={anyDeploying}
                   className="group rounded-xl border border-zinc-800/60 bg-zinc-900/50 px-3.5 py-3 hover:border-blue-500/40 hover:bg-zinc-900/80 transition-all disabled:opacity-50 disabled:cursor-not-allowed disabled:hover:border-zinc-800/60 disabled:hover:bg-zinc-900/50 text-left focus:outline-none focus-visible:ring-2 focus-visible:ring-blue-500/70"
                 >
                   <div className="flex items-center gap-2 mb-1">
@@ -143,10 +143,10 @@ export function EmptyState() {
         <button
           type="button"
           onClick={createBlank}
-          disabled={!!deploying}
+          disabled={anyDeploying}
           className="w-full rounded-xl border border-dashed border-zinc-700/60 bg-zinc-900/30 px-4 py-3 text-sm text-zinc-400 hover:text-zinc-200 hover:border-zinc-600 hover:bg-zinc-900/50 transition-all disabled:opacity-50 disabled:cursor-not-allowed disabled:hover:text-zinc-400 disabled:hover:border-zinc-700/60 focus:outline-none focus-visible:ring-2 focus-visible:ring-blue-500/70"
         >
-          {deploying === "blank" ? "Creating..." : "+ Create blank workspace"}
+          {blankCreating ? "Creating..." : "+ Create blank workspace"}
         </button>
 
         {/* Org templates — instantiate a whole team in one click */}
@@ -154,12 +154,17 @@ export function EmptyState() {
           <OrgTemplatesSection />
         </div>
 
-        {error && (
+        {displayError && (
           <div role="alert" className="mt-3 px-3 py-2 bg-red-950/40 border border-red-800/50 rounded-lg text-xs text-red-400">
-            {error}
+            {displayError}
           </div>
         )}
 
+        {/* Missing-keys preflight modal — owned by useTemplateDeploy,
+            shared with TemplatePalette. Rendered inline here so it
+            overlays this card naturally. */}
+        {modal}
+
         {/* Tips */}
         <div className="mt-5 pt-4 border-t border-zinc-800/50">
           <div className="flex items-center justify-center gap-6 text-[10px] text-zinc-400">
diff --git a/canvas/src/components/Legend.tsx b/canvas/src/components/Legend.tsx
index 10964fd3..0e578972 100644
--- a/canvas/src/components/Legend.tsx
+++ b/canvas/src/components/Legend.tsx
@@ -1,19 +1,92 @@
 "use client";
 
+import { useEffect, useState } from "react";
 import { STATUS_CONFIG } from "@/lib/design-tokens";
 import { useCanvasStore } from "@/store/canvas";
 
 const LEGEND_STATUSES = ["online", "provisioning", "degraded", "failed", "paused", "offline"] as const;
 
+// Persist the user's choice across sessions. Default is "open" so
+// first-time users still see the symbol key; once dismissed we
+// respect that until they explicitly reopen via the floating pill.
+const STORAGE_KEY = "molecule.legend.open";
+
+function readStoredOpen(): boolean {
+  if (typeof window === "undefined") return true;
+  try {
+    const v = window.localStorage.getItem(STORAGE_KEY);
+    if (v === null) return true;
+    return v === "1";
+  } catch {
+    return true;
+  }
+}
+
+function writeStoredOpen(open: boolean) {
+  if (typeof window === "undefined") return;
+  try {
+    window.localStorage.setItem(STORAGE_KEY, open ? "1" : "0");
+  } catch {
+    // localStorage can throw in private mode / quota / disabled
+    // contexts. Silent fallback — the in-memory state still works
+    // for the current session.
+  }
+}
+
 export function Legend() {
   // TemplatePalette (when open) is fixed top-0 left-0 w-[280px] — the
   // default bottom-6 left-4 position of this legend would sit under it.
   // Shift past the 280 px palette + a 16 px gap when the palette is open.
   const paletteOpen = useCanvasStore((s) => s.templatePaletteOpen);
   const leftClass = paletteOpen ? "left-[296px]" : "left-4";
+
+  // SSR-safe pattern: mount with the default (true) so first paint
+  // matches the server output, then hydrate the persisted value
+  // after mount. Avoids a hydration mismatch warning when the user
+  // had previously closed the legend.
+  const [open, setOpen] = useState(true);
+  useEffect(() => {
+    setOpen(readStoredOpen());
+  }, []);
+
+  const closeLegend = () => {
+    setOpen(false);
+    writeStoredOpen(false);
+  };
+  const openLegend = () => {
+    setOpen(true);
+    writeStoredOpen(true);
+  };
+
+  if (!open) {
+    return (
+      <button
+        type="button"
+        onClick={openLegend}
+        aria-label="Show legend"
+        title="Show legend"
+        className={`fixed bottom-6 ${leftClass} z-30 flex items-center gap-1.5 rounded-full bg-zinc-900/95 border border-zinc-700/50 px-3 py-1.5 text-[11px] font-semibold text-zinc-400 uppercase tracking-wider shadow-xl shadow-black/30 backdrop-blur-sm hover:text-zinc-200 hover:border-zinc-600 transition-[left,colors] duration-200`}
+      >
+        <span aria-hidden="true" className="text-[10px]">ⓘ</span>
+        Legend
+      </button>
+    );
+  }
+
   return (
     <div className={`fixed bottom-6 ${leftClass} z-30 bg-zinc-900/95 border border-zinc-700/50 rounded-xl px-4 py-3 shadow-xl shadow-black/30 backdrop-blur-sm max-w-[280px] transition-[left] duration-200`}>
-      <div className="text-[11px] font-semibold text-zinc-400 uppercase tracking-wider mb-2">Legend</div>
+      <div className="flex items-start justify-between mb-2">
+        <div className="text-[11px] font-semibold text-zinc-400 uppercase tracking-wider">Legend</div>
+        <button
+          type="button"
+          onClick={closeLegend}
+          aria-label="Hide legend"
+          title="Hide legend"
+          className="-mt-0.5 -mr-1 px-1.5 text-[14px] leading-none text-zinc-500 hover:text-zinc-200 transition-colors"
+        >
+          ×
+        </button>
+      </div>
 
       {/* Status */}
       <div className="mb-2">
diff --git a/canvas/src/components/MissingKeysModal.tsx b/canvas/src/components/MissingKeysModal.tsx
index e80ab58a..318ecef7 100644
--- a/canvas/src/components/MissingKeysModal.tsx
+++ b/canvas/src/components/MissingKeysModal.tsx
@@ -1,6 +1,7 @@
 "use client";
 
 import { useState, useEffect, useCallback, useRef, useMemo } from "react";
+import { createPortal } from "react-dom";
 import { api } from "@/lib/api";
 import { getKeyLabel, type ProviderChoice } from "@/lib/deploy-preflight";
 
@@ -196,6 +197,12 @@ function ProviderPickerModal({
   );
 
   if (!open) return null;
+  // Portal to document.body for the same reason as
+  // OrgImportPreflightModal — several callers (TemplatePalette,
+  // EmptyState) render the modal inside their own fixed+filtered
+  // containers, which re-anchor the "fixed" positioning to the
+  // wrapper's bounds instead of the viewport.
+  if (typeof document === "undefined") return null;
 
   const allSaved = entries.length > 0 && entries.every((e) => e.saved);
   const anySaving = entries.some((e) => e.saving);
@@ -203,8 +210,14 @@ function ProviderPickerModal({
     .replace(/[-_]/g, " ")
     .replace(/\b\w/g, (c) => c.toUpperCase());
 
-  return (
-    <div className="fixed inset-0 z-50 flex items-center justify-center">
+  return createPortal(
+    // z-[60] so this stacks ABOVE OrgImportPreflightModal (z-50).
+    // Both can be on screen at once during an org import: the org-
+    // preflight is open while the user clicks a per-workspace deploy
+    // that triggers MissingKeys. Without the explicit z-order the
+    // backdrop click might dismiss the wrong modal depending on
+    // React's commit ordering.
+    <div className="fixed inset-0 z-[60] flex items-center justify-center">
       <div
         aria-hidden="true"
         className="absolute inset-0 bg-black/70 backdrop-blur-sm"
@@ -215,7 +228,7 @@ function ProviderPickerModal({
         role="dialog"
         aria-modal="true"
         aria-labelledby="missing-keys-title"
-        className="relative bg-zinc-900 border border-zinc-700 rounded-xl shadow-2xl shadow-black/50 max-w-[480px] w-full mx-4 overflow-hidden"
+        className="relative bg-zinc-900 border border-zinc-700 rounded-xl shadow-2xl shadow-black/50 max-w-[480px] w-full mx-4 max-h-[80vh] overflow-auto"
       >
         <div className="px-5 py-4 border-b border-zinc-800">
           <div className="flex items-center gap-2 mb-1">
@@ -360,7 +373,8 @@ function ProviderPickerModal({
           </div>
         </div>
       </div>
-    </div>
+    </div>,
+    document.body,
   );
 }
 
@@ -474,6 +488,7 @@ function AllKeysModal({
   }, [open]);
 
   if (!open) return null;
+  if (typeof document === "undefined") return null;
 
   const allSaved = entries.length > 0 && entries.every((e) => e.saved);
   const anySaving = entries.some((e) => e.saving);
@@ -481,8 +496,14 @@ function AllKeysModal({
     .replace(/[-_]/g, " ")
     .replace(/\b\w/g, (c) => c.toUpperCase());
 
-  return (
-    <div className="fixed inset-0 z-50 flex items-center justify-center">
+  return createPortal(
+    // z-[60] so this stacks ABOVE OrgImportPreflightModal (z-50).
+    // Both can be on screen at once during an org import: the org-
+    // preflight is open while the user clicks a per-workspace deploy
+    // that triggers MissingKeys. Without the explicit z-order the
+    // backdrop click might dismiss the wrong modal depending on
+    // React's commit ordering.
+    <div className="fixed inset-0 z-[60] flex items-center justify-center">
       <div
         className="absolute inset-0 bg-black/70 backdrop-blur-sm"
         aria-hidden="true"
@@ -493,7 +514,7 @@ function AllKeysModal({
         role="dialog"
         aria-modal="true"
         aria-labelledby="missing-keys-title"
-        className="relative bg-zinc-900 border border-zinc-700 rounded-xl shadow-2xl shadow-black/50 max-w-[440px] w-full mx-4 overflow-hidden"
+        className="relative bg-zinc-900 border border-zinc-700 rounded-xl shadow-2xl shadow-black/50 max-w-[440px] w-full mx-4 max-h-[80vh] overflow-auto"
       >
         <div className="px-5 py-4 border-b border-zinc-800">
           <div className="flex items-center gap-2 mb-1">
@@ -608,6 +629,7 @@ function AllKeysModal({
           </div>
         </div>
       </div>
-    </div>
+    </div>,
+    document.body,
   );
 }
diff --git a/canvas/src/components/OrgImportPreflightModal.tsx b/canvas/src/components/OrgImportPreflightModal.tsx
new file mode 100644
index 00000000..51d61c82
--- /dev/null
+++ b/canvas/src/components/OrgImportPreflightModal.tsx
@@ -0,0 +1,540 @@
+"use client";
+
+import { useCallback, useEffect, useMemo, useRef, useState } from "react";
+import { createPortal } from "react-dom";
+import { createSecret } from "@/lib/api/secrets";
+
+/**
+ * One entry from the server's preflight `required_env` / `recommended_env`.
+ *
+ *   - A plain string is a STRICT requirement: that exact env var must be
+ *     configured.
+ *   - A `{any_of: [...]}` object is an OR group: at least one member
+ *     must be configured to satisfy it. Lets a template say "either
+ *     ANTHROPIC_API_KEY or CLAUDE_CODE_OAUTH_TOKEN" without forcing
+ *     both.
+ *
+ * Matches the Go `EnvRequirement` type's JSON shape (MarshalJSON in
+ * workspace-server/internal/handlers/org.go). The union is written so
+ * that a narrow check — `typeof e === "string"` — distinguishes cleanly.
+ */
+export type EnvRequirement = string | { any_of: string[] };
+
+/** Flat member list for a requirement. */
+export function envReqMembers(r: EnvRequirement): string[] {
+  return typeof r === "string" ? [r] : r.any_of;
+}
+
+/** True if any member is present in `configured`. */
+export function envReqSatisfied(r: EnvRequirement, configured: Set<string>): boolean {
+  if (typeof r === "string") return configured.has(r);
+  return r.any_of.some((m) => configured.has(m));
+}
+
+/** Stable react-key / dedup key for a requirement. Sorted for groups so
+ *  reordered-member variants still collapse to one entry. */
+export function envReqKey(r: EnvRequirement): string {
+  if (typeof r === "string") return r;
+  return [...r.any_of].sort().join("|");
+}
+
+interface Props {
+  open: boolean;
+  /** Display name of the org template — headline only. */
+  orgName: string;
+  /** Total workspace count so the header can read "12 workspaces". */
+  workspaceCount: number;
+  /** Env vars the server has declared MUST be set as global secrets.
+   *  Import is disabled until every entry here is configured. Entries
+   *  are either a single key name or an any-of group. */
+  requiredEnv: EnvRequirement[];
+  /** Env vars the server suggests — import can proceed without them,
+   *  but the user sees them listed so they can decide. Same union
+   *  shape as `requiredEnv`. */
+  recommendedEnv: EnvRequirement[];
+  /** Names of env vars already configured globally. Used to strike
+   *  through entries the user has already set up in another
+   *  session. Passed in rather than queried inside the modal so the
+   *  parent can refresh after each save without prop-driven effects. */
+  configuredKeys: Set<string>;
+  /** Called after a successful secret save so the parent can refresh
+   *  `configuredKeys`. */
+  onSecretSaved: () => void;
+  /** User clicked Import with all required envs satisfied. */
+  onProceed: () => void;
+  /** User dismissed the modal. Import is NOT fired. */
+  onCancel: () => void;
+}
+
+interface DraftEntry {
+  key: string;
+  value: string;
+  saving: boolean;
+  error: string | null;
+}
+
+/**
+ * OrgImportPreflightModal
+ * -----------------------
+ * Two-tier env preflight before POST /org/import:
+ *
+ *   - REQUIRED section (red, blocking) — every entry MUST be configured
+ *     globally before the Import button enables. Matches the server-
+ *     side preflight that would 412 the import anyway.
+ *
+ *   - RECOMMENDED section (yellow, non-blocking) — listed so the user
+ *     can add them if they want the full experience, but the Import
+ *     button stays enabled regardless.
+ *
+ * Saving goes to the GLOBAL secrets endpoint (PUT /settings/secrets)
+ * because org-level templates deploy shared resources. Per-workspace
+ * overrides still work via the Config tab on an individual node
+ * after import. The modal does NOT enable Import the moment a key is
+ * typed — only after it saves successfully (so a half-entered token
+ * can't proceed and then fail at container-start time instead).
+ */
+export function OrgImportPreflightModal({
+  open,
+  orgName,
+  workspaceCount,
+  requiredEnv,
+  recommendedEnv,
+  configuredKeys,
+  onSecretSaved,
+  onProceed,
+  onCancel,
+}: Props) {
+  const [drafts, setDrafts] = useState<Record<string, DraftEntry>>({});
+
+  // Flatten the union-shaped requirement lists to the set of every key
+  // that could ever appear as an input row. Used purely to seed the
+  // drafts map — satisfaction semantics still read from the grouped
+  // EnvRequirement entries (a group can be satisfied by any one
+  // member).
+  const allMemberKeys = useMemo(() => {
+    const keys: string[] = [];
+    for (const r of requiredEnv) keys.push(...envReqMembers(r));
+    for (const r of recommendedEnv) keys.push(...envReqMembers(r));
+    return keys;
+  }, [requiredEnv, recommendedEnv]);
+
+  // Seed a draft entry per declared key the first time the modal
+  // opens. Entries persist across `configuredKeys` changes so a mid-
+  // save recheck doesn't wipe what the user typed.
+  //
+  // Dep: derive a STABLE string from the env-name lists rather than
+  // the array refs themselves. The parent computes
+  // `preflight.org.required_env ?? []`, which produces a fresh []
+  // identity on every re-render (e.g. when refreshConfiguredKeys
+  // bumps state); depending on the array refs would re-fire the
+  // effect on every parent render and mask any future edit that
+  // drops the `if (!next[k])` guard as a silent input-reset bug.
+  const envKeysSignature = useMemo(
+    () => [...allMemberKeys].sort().join("|"),
+    [allMemberKeys],
+  );
+  useEffect(() => {
+    if (!open) return;
+    setDrafts((prev) => {
+      const next = { ...prev };
+      for (const k of allMemberKeys) {
+        if (!next[k]) {
+          next[k] = { key: k, value: "", saving: false, error: null };
+        }
+      }
+      return next;
+    });
+    // eslint-disable-next-line react-hooks/exhaustive-deps
+  }, [open, envKeysSignature]);
+
+  const missingRequired = useMemo(
+    () => requiredEnv.filter((r) => !envReqSatisfied(r, configuredKeys)),
+    [requiredEnv, configuredKeys],
+  );
+  const missingRecommended = useMemo(
+    () => recommendedEnv.filter((r) => !envReqSatisfied(r, configuredKeys)),
+    [recommendedEnv, configuredKeys],
+  );
+  const canProceed = missingRequired.length === 0;
+
+  // Synchronous in-flight gate. A ref (not state) so two clicks
+  // dispatched in the SAME microtask both see the gate flip — state
+  // commits don't help here because setState is async. The previous
+  // closure-based `current.saving` gate worked under React Testing
+  // Library's act() flushing but failed for true microtask-level
+  // double-fires (programmatic clicks, dblclick events, Enter-spam
+  // before React commits). Set is keyed by env var name so different
+  // rows can save concurrently.
+  const inFlightRef = useRef<Set<string>>(new Set());
+
+  // Latest-drafts ref so saveOne can read the current input value
+  // without taking `drafts` as a useCallback dep — that dep would
+  // re-create saveOne on every keystroke and re-bind every Save
+  // button's onClick handler, churn that scales with row count.
+  const draftsRef = useRef(drafts);
+  useEffect(() => {
+    draftsRef.current = drafts;
+  }, [drafts]);
+
+  const saveOne = useCallback(
+    async (key: string) => {
+      // Microtask-safe gate: claim the slot synchronously BEFORE any
+      // await so a second click in the same tick bounces immediately.
+      if (inFlightRef.current.has(key)) return;
+      const current = draftsRef.current[key];
+      if (!current || !current.value.trim()) return;
+      inFlightRef.current.add(key);
+
+      const startValue = current.value;
+      setDrafts((d) => ({
+        ...d,
+        [key]: { ...d[key], saving: true, error: null },
+      }));
+      try {
+        await createSecret("global", key, startValue);
+        setDrafts((d) => ({
+          ...d,
+          [key]: { ...d[key], value: "", saving: false, error: null },
+        }));
+        // Let the parent refresh configuredKeys so the strike-through
+        // updates and canProceed recomputes.
+        onSecretSaved();
+      } catch (e) {
+        setDrafts((d) => ({
+          ...d,
+          [key]: {
+            ...d[key],
+            saving: false,
+            error: e instanceof Error ? e.message : "Save failed",
+          },
+        }));
+      } finally {
+        inFlightRef.current.delete(key);
+      }
+    },
+    [onSecretSaved],
+  );
+
+  if (!open) return null;
+
+  // Portal the dialog to document.body so it escapes any ancestor
+  // containing block. TemplatePalette renders this modal inside a
+  // sidebar whose `fixed` container plus backdrop-filter together
+  // re-anchor descendants' `position: fixed` to the sidebar's own
+  // bounds instead of the viewport — the modal ends up glued to the
+  // sidebar's scrollable region and only becomes visible after the
+  // user scrolls the sidebar. Portal dodges that class of issue
+  // once and for all, regardless of what future wrappers do.
+  //
+  // SSR-safe guard: `document` is undefined on the server. Since
+  // the modal is gated by `if (!open) return null` above, this
+  // effectively only runs after open flips true on the client.
+  if (typeof document === "undefined") return null;
+
+  return createPortal(
+    <div
+      role="dialog"
+      aria-modal="true"
+      aria-labelledby="org-preflight-title"
+      className="fixed inset-0 z-50 flex items-center justify-center bg-black/70"
+      onClick={onCancel}
+    >
+      <div
+        className="w-[560px] max-h-[80vh] overflow-auto rounded-xl bg-zinc-900 border border-zinc-700 shadow-2xl"
+        onClick={(e) => e.stopPropagation()}
+      >
+        <header className="px-5 py-4 border-b border-zinc-800">
+          <h2 id="org-preflight-title" className="text-sm font-semibold text-zinc-100">
+            Deploy {orgName}
+          </h2>
+          <p className="mt-0.5 text-[11px] text-zinc-500">
+            {workspaceCount} workspace{workspaceCount === 1 ? "" : "s"}.
+            Review the credentials needed before import.
+          </p>
+        </header>
+
+        <section className="p-5 space-y-5">
+          {requiredEnv.length > 0 && (
+            <EnvList
+              tone="required"
+              title="Required"
+              subtitle="Import is blocked until every key below is saved globally."
+              entries={requiredEnv}
+              configuredKeys={configuredKeys}
+              drafts={drafts}
+              onChange={(key, value) =>
+                setDrafts((d) => ({ ...d, [key]: { ...d[key], value } }))
+              }
+              onSave={saveOne}
+            />
+          )}
+          {recommendedEnv.length > 0 && (
+            <EnvList
+              tone="recommended"
+              title="Recommended"
+              subtitle="Not required, but some features degrade without them. Add them now for the best experience."
+              entries={recommendedEnv}
+              configuredKeys={configuredKeys}
+              drafts={drafts}
+              onChange={(key, value) =>
+                setDrafts((d) => ({ ...d, [key]: { ...d[key], value } }))
+              }
+              onSave={saveOne}
+            />
+          )}
+          {requiredEnv.length === 0 && recommendedEnv.length === 0 && (
+            <p className="text-[12px] text-zinc-400">
+              No additional credentials required for this template.
+            </p>
+          )}
+        </section>
+
+        <footer className="px-5 py-3 border-t border-zinc-800 flex items-center justify-between">
+          <button
+            type="button"
+            onClick={onCancel}
+            className="px-3 py-1.5 text-[11px] rounded bg-zinc-800 hover:bg-zinc-700 text-zinc-300"
+          >
+            Cancel
+          </button>
+          <div className="flex items-center gap-2">
+            {missingRecommended.length > 0 && canProceed && (
+              <span className="text-[10px] text-amber-400/90">
+                {missingRecommended.length} recommended key
+                {missingRecommended.length === 1 ? "" : "s"} still unset
+              </span>
+            )}
+            <button
+              type="button"
+              onClick={onProceed}
+              disabled={!canProceed}
+              className="px-4 py-1.5 text-[11px] font-semibold rounded bg-blue-600 hover:bg-blue-500 text-white disabled:bg-zinc-700 disabled:text-zinc-500 disabled:cursor-not-allowed"
+            >
+              Import
+            </button>
+          </div>
+        </footer>
+      </div>
+    </div>,
+    document.body,
+  );
+}
+
+interface EnvListProps {
+  tone: "required" | "recommended";
+  title: string;
+  subtitle: string;
+  entries: EnvRequirement[];
+  configuredKeys: Set<string>;
+  drafts: Record<string, DraftEntry>;
+  onChange: (key: string, value: string) => void;
+  onSave: (key: string) => void;
+}
+
+function EnvList({
+  tone,
+  title,
+  subtitle,
+  entries,
+  configuredKeys,
+  drafts,
+  onChange,
+  onSave,
+}: EnvListProps) {
+  const accent =
+    tone === "required"
+      ? "border-red-800/60 bg-red-950/20"
+      : "border-amber-800/50 bg-amber-950/15";
+  const headerColor =
+    tone === "required" ? "text-red-300" : "text-amber-300";
+
+  return (
+    <div className={`rounded-lg border ${accent} p-3`}>
+      <h3 className={`text-[11px] font-semibold uppercase tracking-wide ${headerColor}`}>
+        {title}
+      </h3>
+      <p className="mt-0.5 mb-2 text-[10px] text-zinc-400">{subtitle}</p>
+      <ul className="space-y-2">
+        {entries.map((entry) =>
+          typeof entry === "string" ? (
+            <StrictEnvRow
+              key={envReqKey(entry)}
+              envKey={entry}
+              configured={configuredKeys.has(entry)}
+              draft={drafts[entry]}
+              onChange={onChange}
+              onSave={onSave}
+            />
+          ) : (
+            <AnyOfEnvGroup
+              key={envReqKey(entry)}
+              members={entry.any_of}
+              configuredKeys={configuredKeys}
+              drafts={drafts}
+              onChange={onChange}
+              onSave={onSave}
+            />
+          ),
+        )}
+      </ul>
+    </div>
+  );
+}
+
+interface StrictEnvRowProps {
+  envKey: string;
+  configured: boolean;
+  draft: DraftEntry | undefined;
+  onChange: (key: string, value: string) => void;
+  onSave: (key: string) => void;
+}
+
+function StrictEnvRow({
+  envKey,
+  configured,
+  draft: d,
+  onChange,
+  onSave,
+}: StrictEnvRowProps) {
+  return (
+    <li className="flex items-center gap-2 rounded bg-zinc-900/70 border border-zinc-800 px-2 py-1.5">
+      <code
+        className={`text-[11px] font-mono flex-1 ${
+          configured ? "text-zinc-500 line-through" : "text-zinc-200"
+        }`}
+      >
+        {envKey}
+      </code>
+      {configured ? (
+        <span className="text-[10px] text-emerald-400">✓ set</span>
+      ) : (
+        <>
+          <input
+            type="password"
+            aria-label={`Value for ${envKey}`}
+            placeholder="paste value"
+            value={d?.value ?? ""}
+            onChange={(e) => onChange(envKey, e.target.value)}
+            onKeyDown={(e) => {
+              if (e.key === "Enter") {
+                e.preventDefault();
+                onSave(envKey);
+              }
+            }}
+            disabled={d?.saving}
+            className="flex-1 px-2 py-1 rounded bg-zinc-800 border border-zinc-700 text-[11px] text-zinc-200 focus:outline-none focus:border-blue-500 disabled:opacity-50"
+          />
+          <button
+            type="button"
+            onClick={() => onSave(envKey)}
+            disabled={d?.saving || !d?.value.trim()}
+            className="px-2 py-1 text-[10px] rounded bg-blue-600 hover:bg-blue-500 text-white disabled:opacity-40 disabled:cursor-not-allowed"
+          >
+            {d?.saving ? "…" : "Save"}
+          </button>
+        </>
+      )}
+      {d?.error && (
+        <span className="text-[9px] text-red-400 basis-full pl-1">
+          {d.error}
+        </span>
+      )}
+    </li>
+  );
+}
+
+interface AnyOfEnvGroupProps {
+  members: string[];
+  configuredKeys: Set<string>;
+  drafts: Record<string, DraftEntry>;
+  onChange: (key: string, value: string) => void;
+  onSave: (key: string) => void;
+}
+
+/**
+ * Renders an OR group: the user only needs to configure ONE of the
+ * members to satisfy the requirement. Once any member is configured
+ * the group shows a green banner identifying the satisfying key; the
+ * other inputs remain visible but muted so the user can still switch
+ * providers if they want (uncommon but cheap to support).
+ */
+function AnyOfEnvGroup({
+  members,
+  configuredKeys,
+  drafts,
+  onChange,
+  onSave,
+}: AnyOfEnvGroupProps) {
+  const satisfiedBy = members.find((m) => configuredKeys.has(m));
+  return (
+    <li className="rounded border border-zinc-800 bg-zinc-900/50 px-2.5 py-2">
+      <div className="flex items-center justify-between mb-1.5">
+        <span className="text-[10px] uppercase tracking-wide text-zinc-400">
+          Configure any one
+        </span>
+        {satisfiedBy && (
+          <span className="text-[10px] text-emerald-400">
+            ✓ using <code className="font-mono">{satisfiedBy}</code>
+          </span>
+        )}
+      </div>
+      <ul className="space-y-1.5">
+        {members.map((m) => {
+          const isConfigured = configuredKeys.has(m);
+          const d = drafts[m];
+          const dimmed = !!satisfiedBy && !isConfigured;
+          return (
+            <li
+              key={m}
+              className={`flex items-center gap-2 rounded bg-zinc-900/70 border border-zinc-800 px-2 py-1 ${
+                dimmed ? "opacity-50" : ""
+              }`}
+            >
+              <code
+                className={`text-[11px] font-mono flex-1 ${
+                  isConfigured ? "text-zinc-500 line-through" : "text-zinc-200"
+                }`}
+              >
+                {m}
+              </code>
+              {isConfigured ? (
+                <span className="text-[10px] text-emerald-400">✓ set</span>
+              ) : (
+                <>
+                  <input
+                    type="password"
+                    aria-label={`Value for ${m}`}
+                    placeholder="paste value"
+                    value={d?.value ?? ""}
+                    onChange={(e) => onChange(m, e.target.value)}
+                    onKeyDown={(e) => {
+                      if (e.key === "Enter") {
+                        e.preventDefault();
+                        onSave(m);
+                      }
+                    }}
+                    disabled={d?.saving}
+                    className="flex-1 px-2 py-1 rounded bg-zinc-800 border border-zinc-700 text-[11px] text-zinc-200 focus:outline-none focus:border-blue-500 disabled:opacity-50"
+                  />
+                  <button
+                    type="button"
+                    onClick={() => onSave(m)}
+                    disabled={d?.saving || !d?.value.trim()}
+                    className="px-2 py-1 text-[10px] rounded bg-blue-600 hover:bg-blue-500 text-white disabled:opacity-40 disabled:cursor-not-allowed"
+                  >
+                    {d?.saving ? "…" : "Save"}
+                  </button>
+                </>
+              )}
+              {d?.error && (
+                <span className="text-[9px] text-red-400 basis-full pl-1">
+                  {d.error}
+                </span>
+              )}
+            </li>
+          );
+        })}
+      </ul>
+    </li>
+  );
+}
diff --git a/canvas/src/components/ProvisioningTimeout.tsx b/canvas/src/components/ProvisioningTimeout.tsx
index 2f2ee564..92af73f0 100644
--- a/canvas/src/components/ProvisioningTimeout.tsx
+++ b/canvas/src/components/ProvisioningTimeout.tsx
@@ -65,6 +65,12 @@ export function ProvisioningTimeout({
   // banner even if they stay in provisioning. Cleared when the
   // workspace leaves provisioning (status changes).
   const [dismissed, setDismissed] = useState<Set<string>>(new Set());
+  // Watch the live WS health. While it's not "connected", local node
+  // status reflects the last event we received before the drop —
+  // workspaces may have actually transitioned to online minutes ago.
+  // Suppress the banner until WS recovers + rehydrate confirms each
+  // workspace is genuinely still provisioning.
+  const wsStatus = useCanvasStore((s) => s.wsStatus);
 
   // Subscribe to provisioning nodes — use shallow compare to avoid infinite re-render
   // (filter+map creates new array reference on every store update).
@@ -273,8 +279,11 @@ export function ProvisioningTimeout({
   }, []);
 
   const visibleTimedOut = useMemo(
-    () => timedOut.filter((e) => !dismissed.has(e.workspaceId)),
-    [timedOut, dismissed],
+    () =>
+      wsStatus === "connected"
+        ? timedOut.filter((e) => !dismissed.has(e.workspaceId))
+        : [],
+    [timedOut, dismissed, wsStatus],
   );
 
   if (visibleTimedOut.length === 0) return null;
diff --git a/canvas/src/components/SidePanel.tsx b/canvas/src/components/SidePanel.tsx
index 35ba5c8f..44a32940 100644
--- a/canvas/src/components/SidePanel.tsx
+++ b/canvas/src/components/SidePanel.tsx
@@ -29,7 +29,7 @@ const TABS: { id: PanelTab; label: string; icon: string }[] = [
   { id: "chat", label: "Chat", icon: "◈" },
   { id: "activity", label: "Activity", icon: "⊙" },
   { id: "details", label: "Details", icon: "◉" },
-  { id: "skills", label: "Skills", icon: "✦" },
+  { id: "skills", label: "Plugins", icon: "✦" },
   { id: "terminal", label: "Terminal", icon: "▸" },
   { id: "config", label: "Config", icon: "⚙" },
   { id: "schedule", label: "Schedule", icon: "⏲" },
@@ -280,7 +280,7 @@ export function SidePanel() {
         className="flex-1 overflow-y-auto focus:outline-none"
       >
         {panelTab === "details" && <DetailsTab key={selectedNodeId} workspaceId={selectedNodeId} data={node.data} />}
-        {panelTab === "skills" && <SkillsTab key={selectedNodeId} data={node.data} />}
+        {panelTab === "skills" && <SkillsTab key={selectedNodeId} workspaceId={selectedNodeId} data={node.data} />}
         {panelTab === "activity" && <ActivityTab key={selectedNodeId} workspaceId={selectedNodeId} />}
         {panelTab === "chat" && <ChatTab key={selectedNodeId} workspaceId={selectedNodeId} data={node.data} />}
         {panelTab === "terminal" && <TerminalTab key={selectedNodeId} workspaceId={selectedNodeId} />}
diff --git a/canvas/src/components/TemplatePalette.tsx b/canvas/src/components/TemplatePalette.tsx
index 48b94156..f3b9044b 100644
--- a/canvas/src/components/TemplatePalette.tsx
+++ b/canvas/src/components/TemplatePalette.tsx
@@ -1,35 +1,48 @@
 "use client";
 
 import { useState, useEffect, useCallback, useRef } from "react";
+import { flushSync } from "react-dom";
 import { api } from "@/lib/api";
 import { useCanvasStore } from "@/store/canvas";
 import type { WorkspaceData } from "@/store/socket";
-import { checkDeploySecrets, type PreflightResult, type ModelSpec } from "@/lib/deploy-preflight";
-import { MissingKeysModal } from "./MissingKeysModal";
+import { type Template } from "@/lib/deploy-preflight";
+import { useTemplateDeploy } from "@/hooks/useTemplateDeploy";
+import {
+  OrgImportPreflightModal,
+  type EnvRequirement,
+} from "./OrgImportPreflightModal";
 import { ConfirmDialog } from "./ConfirmDialog";
 import { Spinner } from "./Spinner";
 import { showToast } from "./Toaster";
 import { TIER_CONFIG } from "@/lib/design-tokens";
+import { listSecrets } from "@/lib/api/secrets";
 
-interface Template {
-  id: string;
-  name: string;
-  description: string;
-  tier: number;
-  runtime?: string;
-  model: string;
-  models?: ModelSpec[];
-  /** AND-required env vars declared at runtime_config.required_env. */
-  required_env?: string[];
-  skills: string[];
-  skill_count: number;
-}
-
+// `Template` type and `resolveRuntime` helper now live in
+// `@/lib/deploy-preflight` so EmptyState can import the same ones. Was
+// redeclared here + a narrower redeclaration in EmptyState; the
+// narrower one dropped `runtime`, `models`, `required_env`, which is
+// exactly the data the preflight needs. See reviewer's "runtime
+// fallback drift" note — single source of truth closes the drift.
 export interface OrgTemplate {
   dir: string;
   name: string;
   description: string;
   workspaces: number;
+  /** Env vars that MUST be set as global secrets before the org can
+   *  import. Server refuses the import with 412 if any are missing;
+   *  the canvas preflights against /secrets/list to avoid the round
+   *  trip. Aggregated from org-level + every workspace in the tree.
+   *
+   *  Each entry is either a key name (strict) or an `{any_of: [...]}`
+   *  group (any one of the listed members satisfies the requirement —
+   *  e.g. `ANTHROPIC_API_KEY` OR `CLAUDE_CODE_OAUTH_TOKEN`). */
+  required_env?: EnvRequirement[];
+  /** "Nice-to-have" tier. Import proceeds without them but features
+   *  may degrade — a channel's webhook posts get dropped, a fallback
+   *  LLM isn't available, etc. Surfaced to the user as a non-blocking
+   *  warning with an "add now" affordance. Same union shape as
+   *  `required_env`. */
+  recommended_env?: EnvRequirement[];
 }
 
 /** Fetch the list of org templates from the platform. Returns [] on error
@@ -91,6 +104,14 @@ export function OrgTemplatesSection() {
   const [loading, setLoading] = useState(false);
   const [importing, setImporting] = useState<string | null>(null);
   const [error, setError] = useState<string | null>(null);
+  // Preflight modal state. `preflight` is non-null when the user
+  // clicked Import on an org with declared required/recommended envs
+  // and we're waiting for them to confirm; null otherwise (direct
+  // import path for orgs with zero env requirements).
+  const [preflight, setPreflight] = useState<{
+    org: OrgTemplate;
+    configuredKeys: Set<string>;
+  } | null>(null);
   // Collapsed by default — org templates are multi-workspace imports
   // that most new users don't reach for first. Keeping them
   // expand-on-demand frees ~400 px of vertical space for the
@@ -109,21 +130,55 @@ export function OrgTemplatesSection() {
     loadOrgs();
   }, [loadOrgs]);
 
-  const handleImport = async (org: OrgTemplate) => {
+  /** Fetch the set of global secret KEYS that are already configured.
+   *  Used to strike through already-set entries in the preflight modal
+   *  and to decide whether the import needs the modal at all. */
+  const loadConfiguredKeys = useCallback(async (): Promise<Set<string>> => {
+    try {
+      const secrets = await listSecrets("global");
+      return new Set(secrets.map((s) => s.name));
+    } catch {
+      // Secrets endpoint unreachable → assume nothing configured.
+      // The server will refuse the import with 412 and the user
+      // retries; safer than letting the import fly blind.
+      return new Set();
+    }
+  }, []);
+
+  /** Actually run the import. Split out so both the "no preflight
+   *  needed" fast path and the "preflight modal approved" path can
+   *  share the fetch + hydrate + toast sequence. */
+  const doImport = useCallback(async (org: OrgTemplate) => {
     setImporting(org.dir);
     setError(null);
     try {
       await importOrgTemplate(org.dir);
-      // Refresh canvas inline — the WebSocket may be offline, in which case
-      // WORKSPACE_PROVISIONING broadcasts never arrive and the user sees
-      // no change from clicking "Import org". A direct fetch guarantees
-      // the new workspaces land on canvas regardless of WS state.
-      try {
-        const workspaces = await api.get<WorkspaceData[]>("/workspaces");
-        useCanvasStore.getState().hydrate(workspaces);
-      } catch {
-        // Rehydrate failure is non-fatal; WS (if alive) or the next
-        // health-check cycle will eventually pick the new workspaces up.
+      // Hydrate is the safety net for the "WS is offline" case —
+      // without live events the canvas stays empty. But calling it
+      // immediately wipes the org-deploy animation (hydrate rebuilds
+      // the node array from scratch, dropping the spawn / shimmer
+      // classes and position tweens). So:
+      //   1. If the number of nodes on the canvas already matches
+      //      (or exceeds) the template's workspace count, WS
+      //      delivered everything — skip hydrate.
+      //   2. Otherwise, wait a short window to let any in-flight WS
+      //      events land, then hydrate only if still behind.
+      const expectedCount = org.workspaces;
+      // Nodes transition through WORKSPACE_REMOVED which physically
+      // drops them from the store — there is no "removed" status in
+      // WorkspaceNodeData — so a simple length check is enough here.
+      const hasAll = () => useCanvasStore.getState().nodes.length >= expectedCount;
+      if (!hasAll()) {
+        await new Promise((r) => setTimeout(r, 1500));
+      }
+      if (!hasAll()) {
+        try {
+          const workspaces = await api.get<WorkspaceData[]>("/workspaces");
+          useCanvasStore.getState().hydrate(workspaces);
+        } catch {
+          // WS (if alive) or the next health-check cycle will
+          // eventually pick the new workspaces up.
+        }
       }
       showToast(`Imported "${org.name || org.dir}" (${org.workspaces} workspaces)`, "success");
     } catch (e) {
@@ -133,7 +188,45 @@ export function OrgTemplatesSection() {
     } finally {
       setImporting(null);
     }
-  };
+  }, []);
+
+  /** Entry point for the Import button. Two paths:
+   *
+   *   1. No env declared by the template (required_env + recommended_env
+   *      both empty) → fire doImport directly. Matches the pre-preflight
+   *      behaviour for existing templates.
+   *
+   *   2. Any env declared → load the configured-keys set and open the
+   *      preflight modal. doImport runs only when the user clicks
+   *      Import inside the modal, which is gated to "required envs all
+   *      configured" by the modal itself. */
+  const handleImport = useCallback(async (org: OrgTemplate) => {
+    const hasEnvDeclarations =
+      (org.required_env && org.required_env.length > 0) ||
+      (org.recommended_env && org.recommended_env.length > 0);
+    if (!hasEnvDeclarations) {
+      void doImport(org);
+      return;
+    }
+    // Flip the button to its "Importing…" state while the secrets
+    // lookup runs — on a tenant with 500+ global secrets the round
+    // trip can be > 200 ms and the user otherwise gets zero visual
+    // feedback after clicking. Cleared on modal close / error.
+    setImporting(org.dir);
+    try {
+      const configuredKeys = await loadConfiguredKeys();
+      setPreflight({ org, configuredKeys });
+    } finally {
+      setImporting(null);
+    }
+  }, [doImport, loadConfiguredKeys]);
+
+  /** Called by the preflight modal after a successful key save so the
+   *  strike-through re-renders and canProceed recomputes. */
+  const refreshConfiguredKeys = useCallback(async () => {
+    const keys = await loadConfiguredKeys();
+    setPreflight((prev) => (prev ? { ...prev, configuredKeys: keys } : prev));
+  }, [loadConfiguredKeys]);
 
   return (
     <div className="space-y-2" data-testid="org-templates-section">
@@ -222,6 +315,35 @@ export function OrgTemplatesSection() {
       })}
         </div>
       )}
+
+      {preflight && (
+        <OrgImportPreflightModal
+          open
+          orgName={preflight.org.name || preflight.org.dir}
+          workspaceCount={preflight.org.workspaces}
+          requiredEnv={preflight.org.required_env ?? []}
+          recommendedEnv={preflight.org.recommended_env ?? []}
+          configuredKeys={preflight.configuredKeys}
+          onSecretSaved={refreshConfiguredKeys}
+          onProceed={() => {
+            const org = preflight.org;
+            // flushSync guarantees the modal unmounts BEFORE we kick
+            // off the import network call. Without it, React batches
+            // setPreflight(null) with the setImporting(...) from
+            // doImport's synchronous prefix, both commit at the end
+            // of this handler, AND the await import() POST may yield
+            // a microtask before React schedules the paint. Net
+            // effect: the modal backdrop sat over the canvas during
+            // the first wave of WORKSPACE_PROVISIONING WS events,
+            // hiding the spawn animation. Force the close to land
+            // first so the user sees the canvas reveal + agents
+            // popping into place.
+            flushSync(() => setPreflight(null));
+            void doImport(org);
+          }}
+          onCancel={() => setPreflight(null)}
+        />
+      )}
     </div>
   );
 }
@@ -319,14 +441,6 @@ export function TemplatePalette() {
 
   const [templates, setTemplates] = useState<Template[]>([]);
   const [loading, setLoading] = useState(false);
-  const [creating, setCreating] = useState<string | null>(null);
-  const [error, setError] = useState<string | null>(null);
-
-  // Missing keys modal state
-  const [missingKeysInfo, setMissingKeysInfo] = useState<{
-    template: Template;
-    preflight: PreflightResult;
-  } | null>(null);
 
   const loadTemplates = useCallback(async () => {
     setLoading(true);
@@ -344,65 +458,15 @@ export function TemplatePalette() {
     if (open) loadTemplates();
   }, [open, loadTemplates]);
 
-  /** Resolve runtime from template ID (e.g., "langgraph", "claude-code-default" → "claude-code") */
-  const resolveRuntime = (templateId: string): string => {
-    const runtimeMap: Record<string, string> = {
-      langgraph: "langgraph",
-      "claude-code-default": "claude-code",
-      openclaw: "openclaw",
-      deepagents: "deepagents",
-      crewai: "crewai",
-      autogen: "autogen",
-    };
-    return runtimeMap[templateId] ?? templateId.replace(/-default$/, "");
-  };
-
-  /** Actually execute the deploy API call */
-  const executeDeploy = useCallback(async (template: Template) => {
-    setCreating(template.id);
-    setError(null);
-    try {
-      await api.post("/workspaces", {
-        name: template.name,
-        template: template.id,
-        tier: template.tier,
-        canvas: {
-          x: Math.random() * 400 + 100,
-          y: Math.random() * 300 + 100,
-        },
-      });
-      setCreating(null);
-    } catch (e) {
-      setError(e instanceof Error ? e.message : "Failed to deploy");
-      setCreating(null);
-    }
-  }, []);
-
-  /** Pre-deploy check: validate secrets before deploying */
-  const handleDeploy = async (template: Template) => {
-    setCreating(template.id);
-    setError(null);
-
-    // Prefer the runtime the Go /templates endpoint returned verbatim —
-    // resolveRuntime() is a legacy id→runtime fallback for installs whose
-    // template summary predates the `runtime` field.
-    const runtime = template.runtime ?? resolveRuntime(template.id);
-    const preflight = await checkDeploySecrets({
-      runtime,
-      models: template.models,
-      required_env: template.required_env,
-    });
-
-    if (!preflight.ok) {
-      // Missing keys — show the modal instead of deploying
-      setMissingKeysInfo({ template, preflight });
-      setCreating(null);
-      return;
-    }
-
-    // All keys present — deploy directly
-    await executeDeploy(template);
-  };
+  // Preflight + POST + modal wiring moved into useTemplateDeploy so
+  // this component and EmptyState use one implementation. The sidebar
+  // uses the hook's default random canvas placement (no override) —
+  // an already-populated canvas shouldn't have new deploys stacking on
+  // a single fixed point. No post-deploy side effect either: the
+  // palette is operator-triggered, so auto-selecting would yank
+  // focus off whatever the user was already looking at.
+  const { deploy: handleDeploy, deploying: creating, error, modal } =
+    useTemplateDeploy();
 
   return (
     <>
@@ -426,21 +490,9 @@ export function TemplatePalette() {
         </svg>
       </button>
 
-      {/* Missing Keys Modal */}
-      <MissingKeysModal
-        open={!!missingKeysInfo}
-        missingKeys={missingKeysInfo?.preflight.missingKeys ?? []}
-        providers={missingKeysInfo?.preflight.providers ?? []}
-        runtime={missingKeysInfo?.preflight.runtime ?? ""}
-        onKeysAdded={() => {
-          if (missingKeysInfo) {
-            const template = missingKeysInfo.template;
-            setMissingKeysInfo(null);
-            executeDeploy(template);
-          }
-        }}
-        onCancel={() => setMissingKeysInfo(null)}
-      />
+      {/* Missing-keys modal — rendered by the shared hook. Same
+          instance shape used by EmptyState. */}
+      {modal}
 
       {/* Sidebar */}
       {open && (
@@ -483,7 +535,7 @@ export function TemplatePalette() {
                 <button
                   type="button"
                   key={t.id}
-                  onClick={() => handleDeploy(t)}
+                  onClick={() => void handleDeploy(t)}
                   disabled={isDeploying}
                   className="w-full text-left bg-zinc-800/40 hover:bg-zinc-800/70 border border-zinc-700/40 hover:border-zinc-600/50 rounded-xl p-3 transition-all disabled:opacity-50 disabled:cursor-not-allowed disabled:hover:bg-zinc-800/40 disabled:hover:border-zinc-700/40 group focus:outline-none focus-visible:ring-2 focus-visible:ring-blue-500/70"
                 >
diff --git a/canvas/src/components/WorkspaceNode.tsx b/canvas/src/components/WorkspaceNode.tsx
index a2a8962f..fb5b5344 100644
--- a/canvas/src/components/WorkspaceNode.tsx
+++ b/canvas/src/components/WorkspaceNode.tsx
@@ -6,6 +6,8 @@ import { useCanvasStore, type WorkspaceNodeData } from "@/store/canvas";
 import { showToast } from "@/components/Toaster";
 import { Tooltip } from "@/components/Tooltip";
 import { STATUS_CONFIG, TIER_CONFIG } from "@/lib/design-tokens";
+import { useOrgDeployState } from "@/components/canvas/useOrgDeployState";
+import { OrgCancelButton } from "@/components/canvas/OrgCancelButton";
 
 /** Descendant count for the "N sub" badge — children are first-class nodes
  *  rendered as full cards inside this one via React Flow's native parentId,
@@ -35,6 +37,10 @@ function EjectIcon(props: React.SVGProps<SVGSVGElement>) {
 export function WorkspaceNode({ id, data }: NodeProps<Node<WorkspaceNodeData>>) {
   const statusCfg = STATUS_CONFIG[data.status] || STATUS_CONFIG.offline;
   const tierCfg = TIER_CONFIG[data.tier] || { label: `T${data.tier}`, color: "text-zinc-500 bg-zinc-800" };
+  // Org-deploy context — four derived flags off one store subscription.
+  // Drives the shimmer while provisioning, the dimmed/non-draggable
+  // treatment on locked descendants, and the Cancel pill on the root.
+  const deploy = useOrgDeployState(id);
   const selectedNodeId = useCanvasStore((s) => s.selectedNodeId);
   const selectNode = useCanvasStore((s) => s.selectNode);
   const openContextMenu = useCanvasStore((s) => s.openContextMenu);
@@ -138,8 +144,21 @@ export function WorkspaceNode({ id, data }: NodeProps<Node<WorkspaceNodeData>>)
         }
         backdrop-blur-sm
         focus:outline-none focus-visible:ring-2 focus-visible:ring-blue-500/70 focus-visible:ring-offset-1 focus-visible:ring-offset-zinc-950
+        ${deploy.isActivelyProvisioning ? "mol-deploy-shimmer" : ""}
+        ${deploy.isLockedChild ? "mol-deploy-locked" : ""}
       `}
     >
+      {/* Cancel-deployment pill — rendered on the root of a deploying
+          org only. Positioned absolute inside the card so it moves
+          with drag; class="nodrag" on the button stops React Flow
+          from treating clicks as a drag start. */}
+      {deploy.isDeployingRoot && (
+        <OrgCancelButton
+          rootId={id}
+          rootName={data.name}
+          workspaceCount={deploy.descendantProvisioningCount}
+        />
+      )}
       {/* Status gradient bar at top */}
       <div className={`absolute inset-x-0 top-0 h-8 bg-gradient-to-b ${statusCfg.bar} pointer-events-none`} />
 
diff --git a/canvas/src/components/__tests__/A2ATopologyOverlay.test.tsx b/canvas/src/components/__tests__/A2ATopologyOverlay.test.tsx
index ab0a3c4d..8d4dba00 100644
--- a/canvas/src/components/__tests__/A2ATopologyOverlay.test.tsx
+++ b/canvas/src/components/__tests__/A2ATopologyOverlay.test.tsx
@@ -175,9 +175,28 @@ describe("buildA2AEdges — edge properties", () => {
     expect((edge.style as React.CSSProperties).pointerEvents).toBe("none");
   });
 
-  it("sets pointerEvents: 'none' on labelStyle", () => {
+  it("tags the edge as type=a2a so React Flow renders the custom A2AEdge component", () => {
+    // The custom edge portals labels above the node layer and makes
+    // them clickable. Without type=a2a, RF falls back to the default
+    // edge whose label sits in the SVG group (hidden under nodes,
+    // pointerEvents:none). Regression guard for the hidden-label /
+    // unclickable-label bug observed 2026-04-25.
     const [edge] = buildA2AEdges([makeRow()], NOW);
-    expect((edge.labelStyle as React.CSSProperties).pointerEvents).toBe("none");
+    expect(edge.type).toBe("a2a");
+  });
+
+  it("populates edge.data with the fields the custom edge component reads", () => {
+    // A2AEdge reads count, lastAt, isHot, label from edge.data so the
+    // shape upstream must keep emitting them. A future buildA2AEdges
+    // refactor that drops any of these silently breaks the rendered
+    // pill (label disappears, hot/warm color swap fails, click handler
+    // can still fire but the label text vanishes).
+    const [edge] = buildA2AEdges([makeRow()], NOW);
+    const data = edge.data as Record<string, unknown>;
+    expect(data.count).toBe(1);
+    expect(typeof data.lastAt).toBe("number");
+    expect(typeof data.isHot).toBe("boolean");
+    expect(data.label).toMatch(/^1 call ·/);
   });
 
   it("label uses singular 'call' for count === 1", () => {
diff --git a/canvas/src/components/__tests__/Canvas.a11y.test.tsx b/canvas/src/components/__tests__/Canvas.a11y.test.tsx
index a8231eb3..341a2c7a 100644
--- a/canvas/src/components/__tests__/Canvas.a11y.test.tsx
+++ b/canvas/src/components/__tests__/Canvas.a11y.test.tsx
@@ -72,6 +72,7 @@ const mockStoreState = {
   selectedNodeIds: new Set<string>(),
   clearSelection: vi.fn(),
   toggleNodeSelection: vi.fn(),
+  deletingIds: new Set<string>(),
 };
 
 vi.mock("@/store/canvas", () => ({
diff --git a/canvas/src/components/__tests__/Canvas.pan-to-node.test.tsx b/canvas/src/components/__tests__/Canvas.pan-to-node.test.tsx
index 77ac6518..76d9be78 100644
--- a/canvas/src/components/__tests__/Canvas.pan-to-node.test.tsx
+++ b/canvas/src/components/__tests__/Canvas.pan-to-node.test.tsx
@@ -16,7 +16,9 @@ afterEach(() => {
 // ── Shared fitView spy — must be set up before vi.mock hoisting ──────────────
 const mockFitView = vi.fn();
 const mockFitBounds = vi.fn();
-const mockGetIntersectingNodes = vi.fn(() => []);
+const mockGetIntersectingNodes = vi.fn(
+  (): Array<{ id: string; position: { x: number; y: number } }> => [],
+);
 
 vi.mock("@xyflow/react", () => {
   const ReactFlow = ({
@@ -83,6 +85,12 @@ const mockStoreState = {
   selectedNodeIds: new Set<string>(),
   clearSelection: vi.fn(),
   toggleNodeSelection: vi.fn(),
+  // Cascade-delete / deploy animation state (added in the multilevel-
+  // layout-UX bundle). Canvas.tsx reads deletingIds.size to decide
+  // whether to apply the "locked during delete" class on each node;
+  // an empty Set mirrors the idle canvas and doesn't interact with
+  // any pan/fit behaviour under test here.
+  deletingIds: new Set<string>(),
 };
 
 vi.mock("@/store/canvas", () => ({
diff --git a/canvas/src/components/__tests__/OrgImportPreflightModal.test.tsx b/canvas/src/components/__tests__/OrgImportPreflightModal.test.tsx
new file mode 100644
index 00000000..73d62803
--- /dev/null
+++ b/canvas/src/components/__tests__/OrgImportPreflightModal.test.tsx
@@ -0,0 +1,225 @@
+// @vitest-environment jsdom
+import { describe, it, expect, vi, beforeEach, afterEach } from "vitest";
+import { render, screen, fireEvent, cleanup, waitFor } from "@testing-library/react";
+
+// Regression tests for the OrgImportPreflightModal's save path and
+// any-of group rendering. Guards two specific bugs caught in the
+// UX A/B Lab rollout (2026-04-24):
+//
+//   1. saveOne early-returned because it tried to read a local
+//      `startValue` reassigned inside a functional setDrafts
+//      updater. React did not always evaluate the updater
+//      synchronously, so the gate read "" and bailed while
+//      `saving:true` committed at next render, wedging the
+//      button on "…" without ever calling createSecret.
+//
+//   2. Double-click / Enter-spam could race past the disabled-
+//      button UI gate, firing createSecret twice. The production
+//      endpoint is idempotent so no data hazard, but the extra
+//      PUT is wasteful and harder to reason about.
+
+const createSecretMock = vi.fn().mockResolvedValue(undefined);
+
+vi.mock("@/lib/api/secrets", () => ({
+  createSecret: (...args: unknown[]) => createSecretMock(...args),
+}));
+
+import { OrgImportPreflightModal } from "../OrgImportPreflightModal";
+
+beforeEach(() => {
+  createSecretMock.mockClear();
+  createSecretMock.mockResolvedValue(undefined);
+});
+
+afterEach(() => {
+  cleanup();
+});
+
+describe("OrgImportPreflightModal — saveOne", () => {
+  it("calls createSecret exactly once when Save is clicked on an any-of member", async () => {
+    render(
+      <OrgImportPreflightModal
+        open
+        orgName="UX A/B Lab"
+        workspaceCount={7}
+        requiredEnv={[{ any_of: ["ANTHROPIC_API_KEY", "CLAUDE_CODE_OAUTH_TOKEN"] }]}
+        recommendedEnv={[]}
+        configuredKeys={new Set()}
+        onSecretSaved={() => {}}
+        onProceed={() => {}}
+        onCancel={() => {}}
+      />,
+    );
+
+    // Both any-of members render their own input + Save.
+    const input = screen.getByLabelText(/Value for ANTHROPIC_API_KEY/i);
+    fireEvent.change(input, { target: { value: "test-secret-value" } });
+
+    // The Save button adjacent to the changed input.
+    const saveButtons = screen
+      .getAllByRole("button")
+      .filter((b) => b.textContent === "Save");
+    // Two saves on screen (one per any-of member). First is ANTHROPIC.
+    fireEvent.click(saveButtons[0]);
+
+    await waitFor(() => {
+      expect(createSecretMock).toHaveBeenCalledTimes(1);
+    });
+    expect(createSecretMock).toHaveBeenCalledWith(
+      "global",
+      "ANTHROPIC_API_KEY",
+      "test-secret-value",
+    );
+  });
+
+  it("synchronous double-click on Save fires createSecret exactly once", async () => {
+    // Pause the first save so we can fire a second click while the
+    // first is still mid-await. The two clicks happen in the SAME
+    // tick — fireEvent runs synchronously through React's event
+    // system — so any guard that depends on a committed setState
+    // (e.g. `disabled={drafts[key].saving}` or a closure read of
+    // `drafts[key].saving`) loses the race: the second click sees
+    // saving=false because React hasn't committed yet. The fix is
+    // a useRef-based gate that flips synchronously before any await.
+    let resolveCreate!: () => void;
+    createSecretMock.mockImplementationOnce(
+      () => new Promise<void>((resolve) => {
+        resolveCreate = resolve;
+      }),
+    );
+
+    render(
+      <OrgImportPreflightModal
+        open
+        orgName="UX A/B Lab"
+        workspaceCount={7}
+        requiredEnv={[{ any_of: ["ANTHROPIC_API_KEY", "CLAUDE_CODE_OAUTH_TOKEN"] }]}
+        recommendedEnv={[]}
+        configuredKeys={new Set()}
+        onSecretSaved={() => {}}
+        onProceed={() => {}}
+        onCancel={() => {}}
+      />,
+    );
+
+    const input = screen.getByLabelText(/Value for ANTHROPIC_API_KEY/i);
+    fireEvent.change(input, { target: { value: "test-secret-value" } });
+
+    const saveButtons = screen
+      .getAllByRole("button")
+      .filter((b) => b.textContent === "Save");
+    // Pull the React-bound onClick once so both invocations close
+    // over the SAME callback — simulates a double-fire that happens
+    // before React reconciles between events. Without this, RTL
+    // flushes act() between fireEvent calls and the second click
+    // sees the post-commit state.
+    const saveBtn = saveButtons[0] as HTMLButtonElement;
+    saveBtn.click();
+    saveBtn.click();
+
+    // Give React a tick to process any queued state updates.
+    await waitFor(() => {
+      expect(createSecretMock).toHaveBeenCalledTimes(1);
+    });
+
+    resolveCreate();
+    await waitFor(() => {
+      // Post-save count must remain at exactly one.
+      expect(createSecretMock).toHaveBeenCalledTimes(1);
+    });
+  });
+
+  it("does not call createSecret when value is empty", async () => {
+    render(
+      <OrgImportPreflightModal
+        open
+        orgName="UX A/B Lab"
+        workspaceCount={7}
+        requiredEnv={[{ any_of: ["ANTHROPIC_API_KEY", "CLAUDE_CODE_OAUTH_TOKEN"] }]}
+        recommendedEnv={[]}
+        configuredKeys={new Set()}
+        onSecretSaved={() => {}}
+        onProceed={() => {}}
+        onCancel={() => {}}
+      />,
+    );
+
+    // Button is disabled when value is empty — clicking a disabled
+    // button still dispatches onClick in RTL (since fireEvent
+    // bypasses the disabled attribute), so this asserts the code-
+    // level gate catches it, not just the UI.
+    const saveButtons = screen
+      .getAllByRole("button")
+      .filter((b) => b.textContent === "Save");
+    fireEvent.click(saveButtons[0]);
+
+    // Small async wait to let any state updates settle.
+    await new Promise((r) => setTimeout(r, 50));
+    expect(createSecretMock).not.toHaveBeenCalled();
+  });
+});
+
+describe("OrgImportPreflightModal — any-of rendering", () => {
+  it("renders each any-of member as a separate input row", () => {
+    render(
+      <OrgImportPreflightModal
+        open
+        orgName="UX A/B Lab"
+        workspaceCount={7}
+        requiredEnv={[{ any_of: ["ANTHROPIC_API_KEY", "CLAUDE_CODE_OAUTH_TOKEN"] }]}
+        recommendedEnv={[]}
+        configuredKeys={new Set()}
+        onSecretSaved={() => {}}
+        onProceed={() => {}}
+        onCancel={() => {}}
+      />,
+    );
+
+    expect(screen.getByText("Configure any one")).toBeTruthy();
+    expect(screen.getByLabelText(/Value for ANTHROPIC_API_KEY/i)).toBeTruthy();
+    expect(screen.getByLabelText(/Value for CLAUDE_CODE_OAUTH_TOKEN/i)).toBeTruthy();
+  });
+
+  it("shows satisfied indicator when any member is configured, and enables Import", () => {
+    render(
+      <OrgImportPreflightModal
+        open
+        orgName="UX A/B Lab"
+        workspaceCount={7}
+        requiredEnv={[{ any_of: ["ANTHROPIC_API_KEY", "CLAUDE_CODE_OAUTH_TOKEN"] }]}
+        recommendedEnv={[]}
+        configuredKeys={new Set(["CLAUDE_CODE_OAUTH_TOKEN"])}
+        onSecretSaved={() => {}}
+        onProceed={() => {}}
+        onCancel={() => {}}
+      />,
+    );
+
+    // "✓ using CLAUDE_CODE_OAUTH_TOKEN" banner renders. Name appears
+    // twice (banner + member row) so use getAllByText.
+    expect(screen.getByText(/using/i)).toBeTruthy();
+    expect(screen.getAllByText("CLAUDE_CODE_OAUTH_TOKEN").length).toBeGreaterThanOrEqual(1);
+
+    const importBtn = screen.getByRole("button", { name: /^Import$/ });
+    expect(importBtn.hasAttribute("disabled")).toBe(false);
+  });
+
+  it("keeps Import disabled when no any-of member is configured", () => {
+    render(
+      <OrgImportPreflightModal
+        open
+        orgName="UX A/B Lab"
+        workspaceCount={7}
+        requiredEnv={[{ any_of: ["ANTHROPIC_API_KEY", "CLAUDE_CODE_OAUTH_TOKEN"] }]}
+        recommendedEnv={[]}
+        configuredKeys={new Set()}
+        onSecretSaved={() => {}}
+        onProceed={() => {}}
+        onCancel={() => {}}
+      />,
+    );
+
+    const importBtn = screen.getByRole("button", { name: /^Import$/ });
+    expect(importBtn.hasAttribute("disabled")).toBe(true);
+  });
+});
diff --git a/canvas/src/components/__tests__/SkillsTab.install.test.tsx b/canvas/src/components/__tests__/SkillsTab.install.test.tsx
new file mode 100644
index 00000000..24f48b7d
--- /dev/null
+++ b/canvas/src/components/__tests__/SkillsTab.install.test.tsx
@@ -0,0 +1,143 @@
+// @vitest-environment jsdom
+//
+// Behavioral coverage for the install flow. Two regressions to pin
+// down:
+//
+//  1. The install POST URL has to include the workspace id. A pre-fix
+//     bug routed it to /workspaces/undefined/plugins because the
+//     component read `data.id`, but `WorkspaceNodeData` has no `id`
+//     field — its `extends Record<string, unknown>` index signature
+//     hid the bad access from TS. The component now takes
+//     `workspaceId` as an explicit prop; this test asserts the URL.
+//
+//  2. The optimistic install update has to flip the registry row to
+//     "Installed" without waiting for the 15s reload timer (the
+//     PLUGIN_RELOAD_DELAY_MS gap). This test asserts the row's "Install"
+//     button is replaced by the green "Installed" tag synchronously
+//     after the POST resolves.
+import { describe, it, expect, vi, beforeEach, afterEach } from "vitest";
+import { render, screen, fireEvent, waitFor, cleanup } from "@testing-library/react";
+
+const mockApiGet = vi.fn();
+const mockApiPost = vi.fn();
+vi.mock("@/lib/api", () => ({
+  api: {
+    get: (...args: unknown[]) => mockApiGet(...args),
+    post: (...args: unknown[]) => mockApiPost(...args),
+    put: vi.fn().mockResolvedValue({}),
+    del: vi.fn().mockResolvedValue({}),
+    patch: vi.fn().mockResolvedValue({}),
+  },
+}));
+
+vi.mock("@/store/canvas", () => ({
+  useCanvasStore: Object.assign(
+    vi.fn((selector: (s: Record<string, unknown>) => unknown) =>
+      selector({ setPanelTab: vi.fn() } as Record<string, unknown>),
+    ),
+    { getState: () => ({ setPanelTab: vi.fn() }) },
+  ),
+  summarizeWorkspaceCapabilities: vi.fn(() => ({ skills: [], tools: [] })),
+}));
+
+vi.mock("../Toaster", () => ({ showToast: vi.fn() }));
+
+import { SkillsTab } from "../tabs/SkillsTab";
+
+function makeData() {
+  return {
+    name: "Test WS",
+    status: "online",
+    tier: 1,
+    agentCard: null,
+    activeTasks: 0,
+    collapsed: false,
+    role: "agent",
+    lastErrorRate: 0,
+    lastSampleError: "",
+    url: "http://localhost:9000",
+    parentId: null,
+    currentTask: "",
+    runtime: "langgraph",
+    needsRestart: false,
+    budgetLimit: null,
+  };
+}
+
+const REGISTRY = [
+  {
+    name: "browser-automation",
+    version: "1.1.0",
+    description: "Browser automation + testing",
+    author: "molecule",
+    tags: ["browser", "playwright"],
+    skills: [],
+    runtimes: ["claude-code"],
+  },
+];
+
+beforeEach(() => {
+  // Order matches the component's loadInstalled / loadRegistry
+  // /loadSourceSchemes calls. Schemes endpoint resolves with an
+  // empty list so the Install-from-source input doesn't blow up.
+  mockApiGet.mockReset();
+  mockApiPost.mockReset();
+  mockApiGet.mockImplementation((path: string) => {
+    if (path.endsWith("/plugins") && path.startsWith("/workspaces/")) {
+      return Promise.resolve([]); // installed
+    }
+    if (path === "/plugins") {
+      return Promise.resolve(REGISTRY); // registry
+    }
+    if (path === "/plugins/sources") {
+      return Promise.resolve({ schemes: ["github://", "local://"] });
+    }
+    return Promise.resolve(null);
+  });
+  mockApiPost.mockResolvedValue({ status: "installed", plugin: "browser-automation" });
+});
+
+afterEach(() => {
+  cleanup();
+  vi.clearAllMocks();
+});
+
+// Returns the registry row's Install button. The custom-source input
+// also renders an "Install" button, so `findByRole({name: /install/})`
+// throws on multiple matches; scope by the row's plugin-name text.
+async function findRowInstallButton() {
+  const nameNode = await screen.findByText("browser-automation");
+  const row = nameNode.closest("div.flex.items-center.justify-between") as HTMLElement;
+  if (!row) throw new Error("could not locate row container for browser-automation");
+  const buttons = row.querySelectorAll("button");
+  const install = Array.from(buttons).find((b) => b.textContent?.trim() === "Install");
+  if (!install) throw new Error("row has no Install button (already installed?)");
+  return install;
+}
+
+describe("SkillsTab install flow", () => {
+  it("POSTs to /workspaces/<workspaceId>/plugins (no `undefined` in URL)", async () => {
+    render(<SkillsTab workspaceId="ws-abc-123" data={makeData() as never} />);
+
+    fireEvent.click(await findRowInstallButton());
+
+    await waitFor(() => expect(mockApiPost).toHaveBeenCalled());
+    expect(mockApiPost).toHaveBeenCalledWith(
+      "/workspaces/ws-abc-123/plugins",
+      { source: "local://browser-automation" },
+    );
+  });
+
+  it("flips the registry row to 'Installed' synchronously after POST resolves (no 15s wait)", async () => {
+    render(<SkillsTab workspaceId="ws-abc-123" data={makeData() as never} />);
+
+    fireEvent.click(await findRowInstallButton());
+
+    // The "Installed" green tag must appear without advancing the
+    // reload timer — the optimistic update is the entire point of
+    // this fix. If this test ever regresses to needing fake timers
+    // + advanceTimersByTime, the optimistic path is broken.
+    const installedTag = await screen.findByText(/^Installed$/i);
+    expect(installedTag).toBeDefined();
+  });
+});
diff --git a/canvas/src/components/__tests__/tabs.a11y.test.tsx b/canvas/src/components/__tests__/tabs.a11y.test.tsx
index a7000917..be5446fe 100644
--- a/canvas/src/components/__tests__/tabs.a11y.test.tsx
+++ b/canvas/src/components/__tests__/tabs.a11y.test.tsx
@@ -123,7 +123,7 @@ describe("SkillsTab — aria-label on bare source input (WCAG 1.3.1)", () => {
   });
 
   it('install source input has aria-label="Install from source URL"', async () => {
-    render(<SkillsTab data={makeSkillsData() as never} />);
+    render(<SkillsTab workspaceId="ws-test-id" data={makeSkillsData() as never} />);
 
     // The source input is inside the registry section (showRegistry=false initially).
     // Click the "+ Install Plugin" button to reveal it.
@@ -138,7 +138,7 @@ describe("SkillsTab — aria-label on bare source input (WCAG 1.3.1)", () => {
   });
 
   it("install source input is a text input (not hidden)", async () => {
-    render(<SkillsTab data={makeSkillsData() as never} />);
+    render(<SkillsTab workspaceId="ws-test-id" data={makeSkillsData() as never} />);
 
     const installBtn = screen.getByRole("button", { name: /install plugin/i });
     fireEvent.click(installBtn);
diff --git a/canvas/src/components/canvas/A2AEdge.tsx b/canvas/src/components/canvas/A2AEdge.tsx
new file mode 100644
index 00000000..05900e7a
--- /dev/null
+++ b/canvas/src/components/canvas/A2AEdge.tsx
@@ -0,0 +1,133 @@
+"use client";
+
+import { memo } from "react";
+import {
+  BaseEdge,
+  EdgeLabelRenderer,
+  getBezierPath,
+  type EdgeProps,
+} from "@xyflow/react";
+import { useCanvasStore } from "@/store/canvas";
+
+/**
+ * Custom edge for the A2A topology overlay. Solves two problems with the
+ * default React Flow edge label rendering:
+ *
+ *   1. **Z-order.** The default `label` prop renders inside the edge's
+ *      SVG group, which always sits below node DOM in React Flow. When
+ *      a label happened to land underneath a workspace card, it was
+ *      hidden. EdgeLabelRenderer mounts label content in a separate
+ *      portal layer that we can pin above nodes via z-index.
+ *
+ *   2. **Clickability.** Default labels inherit `pointerEvents: none`
+ *      from the SVG path so the user can drag through them. The
+ *      portaled label is a regular HTML element with its own pointer
+ *      events — we set `pointerEvents: all` only on the label pill so
+ *      drags on the edge line still pass through to the canvas.
+ *
+ * On click: selects the source workspace and switches its side panel
+ * to Activity, where the user can inspect the underlying delegations.
+ */
+interface A2AEdgeData {
+  count: number;
+  lastAt: number;
+  isHot: boolean;
+  /** Pre-formatted "5 calls · 2m ago" — built upstream by buildA2AEdges
+   *  so the same string renders here and in any future tooltip layer. */
+  label: string;
+}
+
+function A2AEdgeImpl({
+  id,
+  source,
+  sourceX,
+  sourceY,
+  targetX,
+  targetY,
+  sourcePosition,
+  targetPosition,
+  data,
+  style = {},
+}: EdgeProps) {
+  const [edgePath, labelX, labelY] = getBezierPath({
+    sourceX,
+    sourceY,
+    sourcePosition,
+    targetX,
+    targetY,
+    targetPosition,
+  });
+
+  const selectNode = useCanvasStore((s) => s.selectNode);
+  const setPanelTab = useCanvasStore((s) => s.setPanelTab);
+
+  const edgeData = (data ?? {}) as Partial<A2AEdgeData>;
+  const labelText = edgeData.label ?? "";
+  const isHot = edgeData.isHot ?? false;
+  const count = edgeData.count ?? 0;
+
+  const handleClick = (e: React.MouseEvent) => {
+    e.stopPropagation();
+    // Select the source (the agent that initiated the delegations).
+    // The user's mental model when clicking the edge is "show me the
+    // calls FROM here" — that's the source's activity feed.
+    //
+    // Preserve the current tab when the user re-clicks the same edge
+    // (or another edge whose source is already selected). Yanking
+    // them back to Activity every click would surprise — they may
+    // have intentionally switched to Chat / Memory while looking at
+    // this peer. The first click that lands a *different* selection
+    // still routes them to Activity, which is the discovery affordance.
+    const alreadySelected =
+      useCanvasStore.getState().selectedNodeId === source;
+    selectNode(source);
+    if (!alreadySelected) {
+      setPanelTab("activity");
+    }
+  };
+
+  // The edge stroke color matches what buildA2AEdges sets on the SVG
+  // path style. Mirror it on the badge border so the visual identity
+  // (hot=violet vs warm=blue) carries to the clickable label.
+  const accent = isHot ? "border-violet-500/60" : "border-blue-500/60";
+  const accentText = isHot ? "text-violet-200" : "text-blue-200";
+  const ariaLabel = `${count} delegation${count === 1 ? "" : "s"} from ${
+    edgeData.label?.split(" · ")[1] ?? "recent"
+  }. Click to inspect.`;
+
+  return (
+    <>
+      <BaseEdge id={id} path={edgePath} style={style} markerEnd="url(#a2a-arrow)" />
+      {labelText && (
+        <EdgeLabelRenderer>
+          <div
+            // The label sits in a portal at the canvas root. position:
+            // absolute + the (labelX, labelY) translate places it at
+            // the edge midpoint. zIndex 5 wins against React Flow's
+            // node layer (default z=0) without fighting the controls
+            // strip (z=10).
+            style={{
+              position: "absolute",
+              transform: `translate(-50%, -50%) translate(${labelX}px, ${labelY}px)`,
+              pointerEvents: "all",
+              zIndex: 5,
+            }}
+            className="nodrag nopan"
+          >
+            <button
+              type="button"
+              onClick={handleClick}
+              aria-label={ariaLabel}
+              title="Open source workspace's activity feed"
+              className={`px-2 py-0.5 rounded-full bg-zinc-900/95 border ${accent} ${accentText} text-[10px] font-medium shadow-md shadow-black/40 backdrop-blur-sm hover:bg-zinc-800 hover:border-opacity-100 transition-colors cursor-pointer`}
+            >
+              {labelText}
+            </button>
+          </div>
+        </EdgeLabelRenderer>
+      )}
+    </>
+  );
+}
+
+export const A2AEdge = memo(A2AEdgeImpl);
diff --git a/canvas/src/components/canvas/OrgCancelButton.tsx b/canvas/src/components/canvas/OrgCancelButton.tsx
new file mode 100644
index 00000000..402b92d8
--- /dev/null
+++ b/canvas/src/components/canvas/OrgCancelButton.tsx
@@ -0,0 +1,165 @@
+"use client";
+
+import { useState } from "react";
+import { api } from "@/lib/api";
+import { useCanvasStore } from "@/store/canvas";
+import { showToast } from "@/components/Toaster";
+
+interface Props {
+  /** Root workspace of the org being deployed. The cancel action
+   *  cascades delete through workspace-server's existing recursive
+   *  delete handler, so we only need the root id. */
+  rootId: string;
+  rootName: string;
+  /** Count rendered in the pill label; updated live as children
+   *  come online (the useOrgDeployState hook recomputes on every
+   *  status change). */
+  workspaceCount: number;
+}
+
+/**
+ * Cancel-deployment pill attached to the root of a deploying org.
+ * One click → confirm dialog → DELETE /workspaces/:rootId?confirm=true
+ * which cascades through every descendant server-side.
+ *
+ * Rendered inside the root's WorkspaceNode card via an absolute-
+ * positioned overlay so it sits visually ON the card and moves with
+ * drag. `className="nodrag"` stops React Flow from interpreting
+ * clicks here as the start of a drag gesture.
+ *
+ * Deliberately uses only `.mol-deploy-cancel*` classes for styling —
+ * every color / easing comes from theme-tokens.css, so a future
+ * light-theme (or tenant-branded theme) inherits automatically.
+ */
+export function OrgCancelButton({ rootId, rootName, workspaceCount }: Props) {
+  const [confirming, setConfirming] = useState(false);
+  const [submitting, setSubmitting] = useState(false);
+
+  const handleCancel = async () => {
+    setSubmitting(true);
+    // Populate deletingIds with the subtree so every descendant
+    // (and the root) locks into the dim + non-draggable state for
+    // the duration of the network round-trip + server cascade —
+    // same treatment the regular delete gives. Otherwise the org
+    // looks interactive for the several seconds between click and
+    // the first WORKSPACE_REMOVED event.
+    const preState = useCanvasStore.getState();
+    const subtreeIds = new Set<string>();
+    const walkStack = [rootId];
+    while (walkStack.length) {
+      const nid = walkStack.pop()!;
+      subtreeIds.add(nid);
+      for (const n of preState.nodes) {
+        if (n.data.parentId === nid) walkStack.push(n.id);
+      }
+    }
+    preState.beginDelete(subtreeIds);
+    try {
+      await api.del<{ status: string }>(
+        `/workspaces/${rootId}?confirm=true`,
+      );
+      showToast(`Cancelled deployment of "${rootName}"`, "success");
+      // Optimistic local removal — workspace-server broadcasts
+      // WORKSPACE_REMOVED per node but the WS may lag; strip the
+      // subtree now so the user sees immediate feedback. Re-read
+      // the store AFTER the await: children may have landed (or
+      // already been removed by WS events) during the network
+      // round-trip. If the WS_REMOVED handler already dropped the
+      // root during the network call, bail out — the subtree walk
+      // would miss any now-orphaned descendants (handleCanvasEvent
+      // reparents children of a removed node upward, so they no
+      // longer share the original root's id as parentId).
+      const postDeleteState = useCanvasStore.getState();
+      if (!postDeleteState.nodes.some((n) => n.id === rootId)) {
+        return;
+      }
+      const subtree = new Set<string>();
+      const stack = [rootId];
+      while (stack.length) {
+        const id = stack.pop()!;
+        subtree.add(id);
+        for (const n of postDeleteState.nodes) {
+          if (n.data.parentId === id) stack.push(n.id);
+        }
+      }
+      useCanvasStore.setState({
+        nodes: postDeleteState.nodes.filter((n) => !subtree.has(n.id)),
+        edges: postDeleteState.edges.filter(
+          (e) => !subtree.has(e.source) && !subtree.has(e.target),
+        ),
+      });
+    } catch (e) {
+      // Undo the lock so the user can try again / interact with the
+      // still-deploying subtree.
+      useCanvasStore.getState().endDelete(subtreeIds);
+      showToast(
+        e instanceof Error ? `Cancel failed: ${e.message}` : "Cancel failed",
+        "error",
+      );
+    } finally {
+      // Success path's endDelete is covered implicitly — every node
+      // in the subtree is stripped by the optimistic local removal
+      // above, and any stragglers are removed by WORKSPACE_REMOVED
+      // WS events whose handler is a no-op on already-missing ids.
+      // The deletingIds set will naturally empty as endDelete runs
+      // in both paths below.
+      useCanvasStore.getState().endDelete(subtreeIds);
+      setSubmitting(false);
+      setConfirming(false);
+    }
+  };
+
+  if (confirming) {
+    return (
+      <div
+        className="nodrag absolute -top-10 right-0 z-20 flex items-center gap-1.5 rounded-lg bg-zinc-900/95 px-2 py-1 shadow-lg border border-red-800/60"
+        onClick={(e) => e.stopPropagation()}
+      >
+        <span className="text-[10px] text-zinc-300">
+          Delete {workspaceCount} workspace{workspaceCount === 1 ? "" : "s"}?
+        </span>
+        <button
+          type="button"
+          onClick={handleCancel}
+          disabled={submitting}
+          className="mol-deploy-cancel px-2 py-0.5 rounded text-[10px] font-semibold"
+        >
+          {submitting ? "Deleting…" : "Yes"}
+        </button>
+        <button
+          type="button"
+          onClick={() => setConfirming(false)}
+          disabled={submitting}
+          className="px-2 py-0.5 rounded bg-zinc-700/80 hover:bg-zinc-600 text-[10px] text-zinc-200"
+        >
+          No
+        </button>
+      </div>
+    );
+  }
+
+  return (
+    <button
+      type="button"
+      onClick={(e) => {
+        // Stop the click from bubbling to React Flow (selects the
+        // node) — the Cancel pill is a UI surface, not a node
+        // activation.
+        e.stopPropagation();
+        setConfirming(true);
+      }}
+      className="nodrag mol-deploy-cancel mol-deploy-cancel-pulse absolute -top-7 right-1 z-20 flex items-center gap-1 rounded-full px-2.5 py-0.5 text-[10px] font-semibold shadow-md"
+      aria-label={`Cancel deployment of ${rootName}`}
+    >
+      <svg width="10" height="10" viewBox="0 0 16 16" aria-hidden="true">
+        <path
+          d="M4 4l8 8M12 4l-8 8"
+          stroke="currentColor"
+          strokeWidth="2"
+          strokeLinecap="round"
+        />
+      </svg>
+      <span>Cancel ({workspaceCount})</span>
+    </button>
+  );
+}
diff --git a/canvas/src/components/canvas/__tests__/useCanvasViewport.test.ts b/canvas/src/components/canvas/__tests__/useCanvasViewport.test.ts
new file mode 100644
index 00000000..4d21ea91
--- /dev/null
+++ b/canvas/src/components/canvas/__tests__/useCanvasViewport.test.ts
@@ -0,0 +1,53 @@
+import { describe, it, expect } from "vitest";
+import { shouldFitGrowing } from "../useCanvasViewport";
+
+// Tests cover the auto-fit gate in isolation. The hook itself is
+// effects + refs + React Flow handles, awkward to exercise directly —
+// extracting the pure decision into shouldFitGrowing(...) lets us
+// pin down the regression-prone logic with unit tests instead.
+
+describe("shouldFitGrowing", () => {
+  it("fits the very first time (no prior snapshot)", () => {
+    expect(shouldFitGrowing(["a"], undefined, null, 0)).toBe(true);
+  });
+
+  it("fits when the prior snapshot is empty", () => {
+    expect(shouldFitGrowing(["a", "b"], new Set(), null, 0)).toBe(true);
+  });
+
+  it("fits when a brand-new id has been added since the last fit", () => {
+    const prev = new Set(["root", "a", "b"]);
+    expect(shouldFitGrowing(["root", "a", "b", "c"], prev, null, 0)).toBe(true);
+  });
+
+  it("respects user pan when the subtree hasn't grown", () => {
+    const prev = new Set(["root", "a", "b"]);
+    // Status update on existing node — same membership.
+    expect(shouldFitGrowing(["root", "a", "b"], prev, 5_000, 1_000)).toBe(false);
+  });
+
+  it("fits when the subtree hasn't grown but the user never panned", () => {
+    const prev = new Set(["root", "a", "b"]);
+    expect(shouldFitGrowing(["root", "a", "b"], prev, null, 1_000)).toBe(true);
+  });
+
+  it("fits when the subtree hasn't grown and the user panned BEFORE the last fit", () => {
+    const prev = new Set(["root", "a", "b"]);
+    expect(shouldFitGrowing(["root", "a", "b"], prev, 500, 1_000)).toBe(true);
+  });
+
+  it("forces fit on delete-then-add even when the count is unchanged", () => {
+    // Subtree was [root, a, b, c, d]. Then `d` got removed and a
+    // sibling `e` arrived. Same length, different membership — a
+    // length-only check would skip the fit and leave `e` off-screen.
+    const prev = new Set(["root", "a", "b", "c", "d"]);
+    expect(
+      shouldFitGrowing(["root", "a", "b", "c", "e"], prev, 5_000, 1_000),
+    ).toBe(true);
+  });
+
+  it("does NOT fit on shrink-only when the user has panned (deletion alone shouldn't override exploration)", () => {
+    const prev = new Set(["root", "a", "b", "c"]);
+    expect(shouldFitGrowing(["root", "a", "b"], prev, 5_000, 1_000)).toBe(false);
+  });
+});
diff --git a/canvas/src/components/canvas/useCanvasViewport.ts b/canvas/src/components/canvas/useCanvasViewport.ts
index 8ab916e5..db49cd20 100644
--- a/canvas/src/components/canvas/useCanvasViewport.ts
+++ b/canvas/src/components/canvas/useCanvasViewport.ts
@@ -3,11 +3,43 @@
 import { useCallback, useEffect, useRef } from "react";
 import { useReactFlow } from "@xyflow/react";
 import { useCanvasStore } from "@/store/canvas";
+import { appendClass, removeClass } from "@/store/classNames";
 import {
   CHILD_DEFAULT_HEIGHT,
   CHILD_DEFAULT_WIDTH,
 } from "@/store/canvas-topology";
 
+/**
+ * Decide whether the deploy-time auto-fit should run. Pure function so
+ * the gate logic is unit-testable in isolation — the surrounding
+ * useEffect tangle of refs, timers, and React Flow handles is awkward
+ * to exercise directly.
+ *
+ * Returns true when the auto-fit SHOULD fire:
+ *   - the subtree contains an id that wasn't in the previous snapshot
+ *     (a new node arrived → user has lost context, force the fit
+ *     through regardless of any user-pan in between), OR
+ *   - the user has not panned since the last successful fit (so the
+ *     auto-fit isn't fighting their override).
+ *
+ * `prevSubtreeIds === undefined` means no fit has ever run for this
+ * root — treat every id as "new" and fit. `userPannedAt === null`
+ * means the user has never panned at all in this session — fit.
+ */
+export function shouldFitGrowing(
+  currentSubtreeIds: readonly string[],
+  prevSubtreeIds: ReadonlySet<string> | undefined,
+  userPannedAt: number | null,
+  lastAutoFitAt: number,
+): boolean {
+  if (!prevSubtreeIds || prevSubtreeIds.size === 0) return true;
+  for (const id of currentSubtreeIds) {
+    if (!prevSubtreeIds.has(id)) return true;
+  }
+  if (userPannedAt === null) return true;
+  return userPannedAt <= lastAutoFitAt;
+}
+
 /**
  * Wires the two canvas-wide CustomEvent listeners and the viewport
  * save/restore bookkeeping so Canvas.tsx doesn't have to.
@@ -25,17 +57,79 @@ export function useCanvasViewport() {
   const saveViewport = useCanvasStore((s) => s.saveViewport);
   const saveTimerRef = useRef<ReturnType<typeof setTimeout>>(undefined);
   const panTimerRef = useRef<ReturnType<typeof setTimeout>>(undefined);
-  const autoFitTimerRef = useRef<ReturnType<typeof setTimeout>>(undefined);
+  // Two distinct fit timers — DO NOT collapse to one.
+  //   - settleFitTimerRef:   1200ms one-shot run by the
+  //     "transition from any-provisioning to none" effect (the deploy
+  //     just finished — settle on the whole org once).
+  //   - trackingFitTimerRef: 500ms debounced by the per-arrival
+  //     molecule:fit-deploying-org event handler (track the org's
+  //     bounds as children land during the deploy).
+  // They MUST NOT share a ref: the two effects fire interleaved
+  // (every WS event during a deploy resets the tracking timer; the
+  // settle timer arms the moment provisioning hits zero), and a
+  // shared ref made each effect silently clearTimeout the other's
+  // pending fit. Today's behavior happened to land in the right
+  // order out of luck; splitting the refs makes ordering independent
+  // of fire sequence.
+  const settleFitTimerRef = useRef<ReturnType<typeof setTimeout>>(undefined);
+  const trackingFitTimerRef = useRef<ReturnType<typeof setTimeout>>(undefined);
   // Tracks whether any workspace was provisioning on the previous
   // render so we can detect the boundary when the last one finishes
   // and auto-fit the viewport around the whole tree.
   const hadProvisioningRef = useRef(false);
+  // Respect-user-pan gate for the deploy-time auto-fit. Earlier
+  // revisions tried to detect user pans via `onMoveEnd`, but React
+  // Flow v12 fires that callback with a truthy event at the END of
+  // a programmatic fitView animation — so the first auto-fit we
+  // triggered would immediately look like a user pan and block
+  // every subsequent fit for the rest of the deploy, leaving the
+  // viewport stuck wherever the first fit landed. Now we stamp
+  // this ref ONLY on wheel / pointerdown / touchstart on the
+  // React Flow pane itself (see the effect below), which are
+  // unambiguous user-gesture signals.
+  const userPannedAtRef = useRef<number | null>(null);
+  const lastAutoFitAtRef = useRef(0);
 
   useEffect(() => {
     return () => {
       clearTimeout(saveTimerRef.current);
       clearTimeout(panTimerRef.current);
-      clearTimeout(autoFitTimerRef.current);
+      clearTimeout(settleFitTimerRef.current);
+      clearTimeout(trackingFitTimerRef.current);
+    };
+  }, []);
+
+  // User-gesture listeners for the respect-user-pan gate. Listens on
+  // `document` with capture phase and filters to events whose target
+  // lies inside the React Flow pane — this avoids a mount-order race
+  // (`.react-flow__pane` may not exist when the hook first runs if
+  // RF is behind a Suspense boundary) AND keeps clicks on the
+  // toolbar / modals / side panel from stamping user-pan-intent.
+  // Capture phase runs before target-phase `stopPropagation` so a
+  // handler elsewhere can't swallow the signal.
+  //
+  // Wheel only — NOT pointerdown. A pointerdown on the pane fires for
+  // ordinary clicks (deselect, click-near-a-card, modal-close-bubble)
+  // as well as the start of a drag-pan. Treating every pointerdown as
+  // "user wants to override auto-fit" meant a single accidental click
+  // before/during an org import locked out every subsequent fit, so
+  // the viewport stuck at whatever the first fit landed on while
+  // children kept materialising off-screen. Wheel is the canonical
+  // unambiguous gesture: scroll-to-pan and pinch-zoom both surface as
+  // wheel events. Drag-pans without an accompanying wheel are rare
+  // enough that letting them be overridden by a follow-up auto-fit is
+  // the right tradeoff.
+  useEffect(() => {
+    if (typeof window === "undefined") return;
+    const stamp = (e: Event) => {
+      const target = e.target as HTMLElement | null;
+      if (!target?.closest?.(".react-flow__pane")) return;
+      userPannedAtRef.current = Date.now();
+    };
+    const opts: AddEventListenerOptions = { passive: true, capture: true };
+    document.addEventListener("wheel", stamp, opts);
+    return () => {
+      document.removeEventListener("wheel", stamp, opts);
     };
   }, []);
 
@@ -55,20 +149,64 @@ export function useCanvasViewport() {
     hadProvisioningRef.current = hasProvisioning;
 
     if (wasProvisioning && !hasProvisioning && nodeCount > 0) {
-      clearTimeout(autoFitTimerRef.current);
+      // Root-complete moment — every root that has children just
+      // finished deploying. Pop + glow once (mol-deploy-root-complete)
+      // then auto-fit the viewport around the whole org. Leaf-only
+      // roots (single workspaces with no children) are skipped so the
+      // effect reads as "your org landed" not "random card flickered".
+      const state = useCanvasStore.getState();
+      const rootsWithChildren = new Set<string>();
+      for (const n of state.nodes) {
+        if (n.data.parentId) continue;
+        if (state.nodes.some((c) => c.data.parentId === n.id)) {
+          rootsWithChildren.add(n.id);
+        }
+      }
+      if (rootsWithChildren.size > 0) {
+        useCanvasStore.setState({
+          nodes: state.nodes.map((n) =>
+            rootsWithChildren.has(n.id)
+              ? { ...n, className: appendClass(n.className, "mol-deploy-root-complete") }
+              : n,
+          ),
+        });
+        // Strip the one-shot class after the keyframe ends so a later
+        // deploy on the same node can fire it again.
+        window.setTimeout(() => {
+          const s = useCanvasStore.getState();
+          useCanvasStore.setState({
+            nodes: s.nodes.map((n) =>
+              rootsWithChildren.has(n.id)
+                ? { ...n, className: removeClass(n.className, "mol-deploy-root-complete") }
+                : n,
+            ),
+          });
+        }, 800);
+      }
+
+      clearTimeout(settleFitTimerRef.current);
       // 1200ms settle delay: lets React Flow's DOM measurement pass
       // resize newly-online parents before we compute bounds.
       // Measuring too early gives us the pre-render skeleton bbox and
       // fitView zooms to that smaller-than-real rectangle.
-      autoFitTimerRef.current = setTimeout(() => {
+      settleFitTimerRef.current = setTimeout(() => {
         fitView({
+          // Deliberately SLOWER than the in-flight tracking fits
+          // (400ms). The asymmetry reads as "settling" on the
+          // finished org rather than "tracking" another arrival,
+          // which is the intended UX for the "deploy done" moment.
+          // Don't normalize these two durations to the same value.
           duration: 1200,
-          padding: 0.25,
+          // Match the deploy-time fit padding (0.45) so end-state
+          // and in-flight state use the same framing — otherwise
+          // the final zoom-out "jumps" relative to the intermediate
+          // fits and looks like a mis-layout.
+          padding: 0.45,
           // Cap zoom-in: a small tree (2-3 nodes) would otherwise end
           // up at the 2x maxZoom, visually implying "something is
-          // wrong". 0.8 reads like "here's your whole org" even when
-          // the tree is small.
-          maxZoom: 0.8,
+          // wrong". 0.65 reads like "here's your whole org" even when
+          // the tree is small — matches deploy-time cap.
+          maxZoom: 0.65,
           // Cap zoom-out: fitView would fall back to the component's
           // minZoom=0.1 on a sparse/outlier layout, leaving the user
           // staring at a postage-stamp canvas. 0.25 is the floor.
@@ -92,6 +230,115 @@ export function useCanvasViewport() {
     return () => window.removeEventListener("molecule:pan-to-node", handler);
   }, [fitView]);
 
+  // Auto pan+zoom to the whole deploying org after each child
+  // arrival — DEBOUNCED. Firing fitView on every event with a
+  // 600ms animation meant rapid sibling arrivals (server paces 2s
+  // apart, HMR bursts can land faster) made the viewport lurch
+  // continuously, which the user read as "parent flashing around".
+  // We now wait until the arrivals GO QUIET for 500ms, then run
+  // exactly one fit. The rootId we captured on the most recent
+  // event drives the fit bounds. Respect-user-pan still short-
+  // circuits: if the user moved after our last auto-fit, we never
+  // fit again this deploy.
+  const pendingFitRootRef = useRef<string | null>(null);
+  // Membership snapshot of the subtree at the moment of the last
+  // successful auto-fit, keyed by root id. When a new event arrives,
+  // we compute growth as "any id in the current subtree that wasn't
+  // in the snapshot". An id-set rather than just a count handles the
+  // delete-then-add case correctly: subtree of 6 → delete one → 5 →
+  // a different child arrives → 6 again. A length-only comparison
+  // would call this "no growth" and skip the fit even though a
+  // brand-new node landed off-screen. The id-set sees the new id
+  // wasn't in the snapshot and forces the fit.
+  //
+  // Map is keyed by root id and never pruned. Acceptable today because
+  // org roots are UUIDs (no collisions on retry / template re-import),
+  // canvas sessions are per-tab, and entries are tiny. Worth a sweep
+  // if long-lived sessions ever start importing hundreds of orgs.
+  const lastFitSubtreeIdsRef = useRef<Map<string, Set<string>>>(new Map());
+  useEffect(() => {
+    const runFit = () => {
+      const rootCandidate = pendingFitRootRef.current;
+      pendingFitRootRef.current = null;
+      if (!rootCandidate) return;
+      const state = useCanvasStore.getState();
+      // Climb to the true root — the event's rootId is the just-
+      // landed child's direct parent, which may itself be nested.
+      let topId = rootCandidate;
+      let cursor = state.nodes.find((n) => n.id === topId);
+      while (cursor?.data.parentId) {
+        const up = state.nodes.find((n) => n.id === cursor!.data.parentId);
+        if (!up) break;
+        cursor = up;
+        topId = up.id;
+      }
+      const subtree: string[] = [];
+      const stack = [topId];
+      while (stack.length) {
+        const id = stack.pop()!;
+        subtree.push(id);
+        for (const n of state.nodes) {
+          if (n.data.parentId === id) stack.push(n.id);
+        }
+      }
+      if (subtree.length === 0) return;
+
+      // Growth check: did any id in the current subtree NOT appear
+      // in the snapshot from the last fit? If yes, fit through
+      // regardless of the user-pan timestamp — the user has lost
+      // context, the new arrival is off-screen, and the deploy is
+      // the primary thing they want to watch. If no, fall back to
+      // the user-pan respect gate so post-deploy exploration isn't
+      // yanked back.
+      if (!shouldFitGrowing(
+        subtree,
+        lastFitSubtreeIdsRef.current.get(topId),
+        userPannedAtRef.current,
+        lastAutoFitAtRef.current,
+      )) {
+        return;
+      }
+      fitView({
+        nodes: subtree.map((id) => ({ id })),
+        // Short animation — server paces children ~2s apart, so a
+        // 400ms fit animation reads as "smoothly tracked" rather
+        // than "constantly lurching". Longer durations (the earlier
+        // 600ms) start to overlap if the user re-triggers deploys.
+        duration: 400,
+        // Generous padding so the right-hand Communications panel,
+        // bottom-left Legend, and bottom-right "New Workspace"
+        // button don't cover the outer cards. React Flow padding
+        // is a fraction of viewport dims, so 0.45 ≈ ~430px of
+        // margin on a 960-wide canvas — enough clearance for the
+        // two side panels (~300px + ~280px).
+        padding: 0.45,
+        // Lower maxZoom so small orgs (2-3 cards) still zoom out
+        // enough to show the parent frame + children clearly with
+        // the padded margins. 0.65 reads as "here's the whole org"
+        // without getting dragged to the maxZoom by fitView's
+        // "fill the viewport" default.
+        maxZoom: 0.65,
+        minZoom: 0.25,
+      });
+      lastAutoFitAtRef.current = Date.now();
+      lastFitSubtreeIdsRef.current.set(topId, new Set(subtree));
+    };
+    const handler = (e: Event) => {
+      const { rootId } = (e as CustomEvent<{ rootId: string }>).detail;
+      // Keep the most recently-requested root. Back-to-back imports
+      // on two different orgs (rare — user would have to click
+      // Import twice within 500ms) "later wins" the viewport rather
+      // than ping-ponging between them. If this becomes a real
+      // pattern we'd flush the pending fit synchronously when
+      // `rootId` changes, rather than resetting the timer.
+      pendingFitRootRef.current = rootId;
+      clearTimeout(trackingFitTimerRef.current);
+      trackingFitTimerRef.current = setTimeout(runFit, 500);
+    };
+    window.addEventListener("molecule:fit-deploying-org", handler);
+    return () => window.removeEventListener("molecule:fit-deploying-org", handler);
+  }, [fitView]);
+
   // Zoom to a team: fit the parent + its direct children in view.
   useEffect(() => {
     const handler = (e: Event) => {
@@ -129,6 +376,11 @@ export function useCanvasViewport() {
 
   const onMoveEnd = useCallback(
     (_event: unknown, vp: { x: number; y: number; zoom: number }) => {
+      // User-pan detection moved to the wheel/pointerdown listener
+      // above — onMoveEnd fires for programmatic fitView too, which
+      // made this callback an unreliable source for user-intent
+      // tracking. This now only handles the debounced viewport
+      // save so a reload lands the user back where they were.
       clearTimeout(saveTimerRef.current);
       saveTimerRef.current = setTimeout(() => {
         saveViewport(vp.x, vp.y, vp.zoom);
diff --git a/canvas/src/components/canvas/useDragHandlers.ts b/canvas/src/components/canvas/useDragHandlers.ts
index a0a38e77..aa8fa82c 100644
--- a/canvas/src/components/canvas/useDragHandlers.ts
+++ b/canvas/src/components/canvas/useDragHandlers.ts
@@ -113,6 +113,18 @@ export function useDragHandlers(): DragHandlers {
 
   const onNodeDragStart: OnNodeDrag<WorkspaceNode> = useCallback(
     (event, node) => {
+      // Belt-and-braces drag-lock: the primary mechanism is the
+      // `draggable: false` projection in Canvas.tsx — React Flow
+      // won't invoke this callback for locked nodes. But a future
+      // change to the projection that forgets a locked subtree
+      // would silently allow dragging, and locked drags mid-deploy
+      // corrupt the spawn animation. Fall through to a state-based
+      // check here so the invariant stays enforced in both places.
+      if (node.draggable === false) {
+        dragStartStateRef.current = null;
+        return;
+      }
+
       dragModifiersRef.current = {
         alt: event.altKey,
         meta: event.metaKey || event.ctrlKey,
diff --git a/canvas/src/components/canvas/useOrgDeployState.ts b/canvas/src/components/canvas/useOrgDeployState.ts
new file mode 100644
index 00000000..587643df
--- /dev/null
+++ b/canvas/src/components/canvas/useOrgDeployState.ts
@@ -0,0 +1,152 @@
+"use client";
+
+import { useMemo } from "react";
+import { useCanvasStore } from "@/store/canvas";
+
+/**
+ * Org-deploy state for a single workspace node. Computed from the
+ * current canvas store snapshot — no per-org status field on the
+ * backend is required (a root "is deploying" iff any descendant in
+ * its subtree still reports status === "provisioning").
+ *
+ * Performance note: the first version of this hook walked the entire
+ * nodes array per node render — O(n²) for a 50-node org. The current
+ * implementation computes ONE map of derived state for the whole
+ * canvas per nodes-array change, then each call site looks up its
+ * own id. The map is built inside useMemo against a cheap projection
+ * (id + parentId + status tuples via useShallow) so unrelated store
+ * mutations (drag, selection, viewport) don't re-run the walk.
+ */
+export interface OrgDeployState {
+  isActivelyProvisioning: boolean;
+  isDeployingRoot: boolean;
+  isLockedChild: boolean;
+  descendantProvisioningCount: number;
+}
+
+const EMPTY: OrgDeployState = {
+  isActivelyProvisioning: false,
+  isDeployingRoot: false,
+  isLockedChild: false,
+  descendantProvisioningCount: 0,
+};
+
+/** Projection used to drive the deploy-state computation. Shallow-
+ *  compared so re-renders only happen when one of these fields
+ *  actually changes across any node. */
+interface NodeProjection {
+  id: string;
+  parentId: string | null;
+  status: string;
+}
+
+function buildDeployMap(
+  projections: NodeProjection[],
+  deletingIds: ReadonlySet<string>,
+): Map<string, OrgDeployState> {
+  const byId = new Map<string, NodeProjection>();
+  const childrenBy = new Map<string, string[]>();
+  for (const p of projections) {
+    byId.set(p.id, p);
+    if (p.parentId) {
+      const arr = childrenBy.get(p.parentId) ?? [];
+      arr.push(p.id);
+      childrenBy.set(p.parentId, arr);
+    }
+  }
+
+  // Walk once from each node up to its root, memoising the root id.
+  // `rootOf.get(id)` short-circuits further walks on the same chain.
+  const rootOf = new Map<string, string>();
+  const findRoot = (id: string): string => {
+    const cached = rootOf.get(id);
+    if (cached) return cached;
+    let cursor: NodeProjection | undefined = byId.get(id);
+    let rootId = id;
+    while (cursor && cursor.parentId) {
+      const parent = byId.get(cursor.parentId);
+      if (!parent) break;
+      cursor = parent;
+      rootId = parent.id;
+      const alreadyKnown = rootOf.get(rootId);
+      if (alreadyKnown) {
+        rootId = alreadyKnown;
+        break;
+      }
+    }
+    rootOf.set(id, rootId);
+    return rootId;
+  };
+
+  // Count provisioning descendants per node. Also walk once per root
+  // using an iterative DFS so we don't stack-overflow on deep trees.
+  const countProvisioning = (rootId: string): number => {
+    let count = 0;
+    const stack = [rootId];
+    while (stack.length) {
+      const id = stack.pop()!;
+      const node = byId.get(id);
+      if (!node) continue;
+      if (node.status === "provisioning") count++;
+      const kids = childrenBy.get(id);
+      if (kids) stack.push(...kids);
+    }
+    return count;
+  };
+
+  // Per-root cache of subtree count so every descendant resolves in O(1).
+  const rootCount = new Map<string, number>();
+
+  const out = new Map<string, OrgDeployState>();
+  for (const p of projections) {
+    const rootId = findRoot(p.id);
+    let provCount = rootCount.get(rootId);
+    if (provCount === undefined) {
+      provCount = countProvisioning(rootId);
+      rootCount.set(rootId, provCount);
+    }
+    const rootIsDeploying = provCount > 0;
+    // A node being deleted gets the same visual + interaction lock
+    // as a deploying child. "The system owns this node right now,
+    // don't touch it" is the shared semantic — the user only cares
+    // that the card is dim and won't drag; they don't need to know
+    // whether it's coming up or going down.
+    const deleting = deletingIds.has(p.id);
+    out.set(p.id, {
+      isActivelyProvisioning: p.status === "provisioning",
+      isDeployingRoot: p.id === rootId && rootIsDeploying,
+      isLockedChild: deleting || (p.id !== rootId && rootIsDeploying),
+      descendantProvisioningCount:
+        p.id === rootId ? provCount : 0, // only roots display the count
+    });
+  }
+  return out;
+}
+
+/** Store-wide derived map. Recomputed whenever the `nodes` array
+ *  reference changes — which is on every store mutation that touches
+ *  nodes, including pure position tweens. The map build is O(n) so
+ *  a 50-node canvas costs ~50μs per tween frame; that's cheap enough
+ *  to not need a projection layer. (An earlier attempt to narrow the
+ *  subscription via `useShallow((s) => s.nodes.map(...))` triggered
+ *  React 18's "getSnapshot should be cached" loop because the
+ *  projection creates fresh object references each call — shallow
+ *  equality always sees "changed", which re-renders, which re-runs
+ *  the selector, ad infinitum.) */
+function useDeployMap(): Map<string, OrgDeployState> {
+  const nodes = useCanvasStore((s) => s.nodes);
+  const deletingIds = useCanvasStore((s) => s.deletingIds);
+  return useMemo(() => {
+    const projections = nodes.map((n) => ({
+      id: n.id,
+      parentId: n.data.parentId,
+      status: n.data.status,
+    }));
+    return buildDeployMap(projections, deletingIds);
+  }, [nodes, deletingIds]);
+}
+
+export function useOrgDeployState(nodeId: string): OrgDeployState {
+  const map = useDeployMap();
+  return map.get(nodeId) ?? EMPTY;
+}
diff --git a/canvas/src/components/tabs/ActivityTab.tsx b/canvas/src/components/tabs/ActivityTab.tsx
index fc857842..d0e31630 100644
--- a/canvas/src/components/tabs/ActivityTab.tsx
+++ b/canvas/src/components/tabs/ActivityTab.tsx
@@ -5,6 +5,7 @@ import { api } from "@/lib/api";
 import { ConversationTraceModal } from "@/components/ConversationTraceModal";
 import { type ActivityEntry } from "@/types/activity";
 import { useWorkspaceName } from "@/hooks/useWorkspaceName";
+import { inferA2AErrorHint } from "./chat/a2aErrorHint";
 
 interface Props {
   workspaceId: string;
@@ -286,6 +287,26 @@ function ActivityRow({
   );
 }
 
+const A2A_ERROR_PREFIX = "[A2A_ERROR]";
+
+/** Render a [A2A_ERROR]-prefixed response as a structured error block
+ *  with a stripped detail line + a cause hint. The previous raw render
+ *  ("[A2A_ERROR] " literal in the response area) gave the user no
+ *  signal to act on. */
+function A2AErrorPreview({ label, raw }: { label: string; raw: string }) {
+  const detail = raw.slice(A2A_ERROR_PREFIX.length).trim() || "(no detail provided)";
+  const hint = inferA2AErrorHint(detail);
+  return (
+    <div>
+      <div className="text-[8px] text-red-400/80 uppercase tracking-wider mb-1">{label} — delivery failed</div>
+      <div className="text-[10px] text-red-300 bg-red-950/30 border border-red-800/40 rounded p-2 space-y-1.5">
+        <div className="font-mono whitespace-pre-wrap break-words max-h-32 overflow-y-auto">{detail}</div>
+        <div className="text-[9px] text-red-300/70 leading-relaxed border-t border-red-800/30 pt-1.5">{hint}</div>
+      </div>
+    </div>
+  );
+}
+
 /** Extract human-readable text from A2A request/response JSON */
 function MessagePreview({ label, body }: { label: string; body: Record<string, unknown> }) {
   // Try to extract text from A2A message parts
@@ -295,6 +316,14 @@ function MessagePreview({ label, body }: { label: string; body: Record<string, u
     if (body.task && typeof body.task === "string") { text = body.task; }
     if (!text && body.result && typeof body.result === "string") { text = body.result; }
     if (text) {
+      // [A2A_ERROR]-prefixed responses get the structured error
+      // treatment. Bare text fallthrough renders a bland gray block
+      // — fine for normal replies, terrible for "[A2A_ERROR] " with
+      // no further context. Detect at the top of the rendering path
+      // so it short-circuits before the generic preview kicks in.
+      if (text.trimStart().startsWith(A2A_ERROR_PREFIX)) {
+        return <A2AErrorPreview label={label} raw={text.trimStart()} />;
+      }
       return (
         <div>
           <div className="text-[8px] text-zinc-500 uppercase tracking-wider mb-1">{label}</div>
diff --git a/canvas/src/components/tabs/ChatTab.tsx b/canvas/src/components/tabs/ChatTab.tsx
index 3762ffdc..68734f11 100644
--- a/canvas/src/components/tabs/ChatTab.tsx
+++ b/canvas/src/components/tabs/ChatTab.tsx
@@ -7,9 +7,12 @@ import { api } from "@/lib/api";
 import { useCanvasStore, type WorkspaceNodeData } from "@/store/canvas";
 import { WS_URL } from "@/store/socket";
 import { closeWebSocketGracefully } from "@/lib/ws-close";
-import { type ChatMessage, createMessage, appendMessageDeduped } from "./chat/types";
-import { extractResponseText, extractRequestText } from "./chat/message-parser";
+import { type ChatMessage, type ChatAttachment, createMessage, appendMessageDeduped } from "./chat/types";
+import { uploadChatFiles, downloadChatFile } from "./chat/uploads";
+import { AttachmentChip, PendingAttachmentPill } from "./chat/AttachmentViews";
+import { extractResponseText, extractRequestText, extractFilesFromTask } from "./chat/message-parser";
 import { AgentCommsPanel } from "./chat/AgentCommsPanel";
+import { appendActivityLine } from "./chat/activityLog";
 import { runtimeDisplayName } from "@/lib/runtime-names";
 import { ConfirmDialog } from "@/components/ConfirmDialog";
 
@@ -21,10 +24,18 @@ interface Props {
 type ChatSubTab = "my-chat" | "agent-comms";
 
 // A2A response shape (subset). The full schema is in @a2a-js/sdk but we only
-// need parts/artifacts text extraction for the synchronous fallback path.
+// need parts/artifacts text + file extraction for the synchronous fallback.
+interface A2AFileRef {
+  name?: string;
+  mimeType?: string;
+  uri?: string;
+  bytes?: string;
+  size?: number;
+}
 interface A2APart {
   kind: string;
-  text: string;
+  text?: string;
+  file?: A2AFileRef;
 }
 interface A2AResponse {
   result?: {
@@ -33,25 +44,81 @@ interface A2AResponse {
   };
 }
 
+/** Detect activity-log rows that the workspace's own runtime fired
+ *  against itself but were misclassified as canvas-source. The proper
+ *  fix is the X-Workspace-ID header from `self_source_headers()` in
+ *  workspace/platform_auth.py, which makes the platform record
+ *  source_id = workspace_id. But three failure modes still leak a
+ *  self-message into "My Chat":
+ *
+ *    1. Historical rows already in the DB with source_id=NULL.
+ *    2. Workspace containers running pre-fix heartbeat.py / main.py
+ *       (the fix only takes effect after an image rebuild + redeploy).
+ *    3. Future internal triggers added without the helper.
+ *
+ *  This client-side filter recognises the heartbeat trigger by its
+ *  exact prefix — the heartbeat assembles
+ *
+ *    "Delegation results are ready. Review them and take appropriate
+ *     action:\n" + summary_lines + report_instruction
+ *
+ *  in workspace/heartbeat.py. The prefix is template-fixed so a
+ *  string match is reliable. If the heartbeat copy ever changes,
+ *  update this constant in the same commit.
+ *
+ *  This is a backstop, not the primary defence — the X-Workspace-ID
+ *  header is. Filtering content is fragile to copy edits, so keep
+ *  the list narrow. */
+const INTERNAL_SELF_MESSAGE_PREFIXES = [
+  "Delegation results are ready. Review them and take appropriate action",
+];
+
+function isInternalSelfMessage(text: string): boolean {
+  return INTERNAL_SELF_MESSAGE_PREFIXES.some((p) => text.startsWith(p));
+}
+
 // extractReplyText pulls the agent's text reply out of an A2A response.
-// Mirrors the Go-side extractReplyText in workspace-server/internal/channels/manager.go.
+// Concatenates ALL text parts (joined with "\n") rather than returning
+// just the first. Claude Code and other runtimes commonly emit multi-
+// part text replies for long content (markdown tables, code blocks),
+// and the prior "first part wins" implementation silently truncated
+// the rest — observed on a 15k-char Wave 1 brief that rendered only
+// the table header. Mirrors extractTextsFromParts in message-parser.ts.
+//
+// Server-side counterpart in workspace-server/internal/channels/
+// manager.go has the same single-part bug; fix that too if/when a
+// channel-delivered reply (Slack, Lark, etc.) gets truncated.
 function extractReplyText(resp: A2AResponse): string {
+  const collect = (parts: A2APart[] | undefined): string => {
+    if (!parts) return "";
+    return parts
+      .filter((p) => p.kind === "text")
+      .map((p) => p.text ?? "")
+      .filter(Boolean)
+      .join("\n");
+  };
   const result = resp?.result;
-  if (result?.parts) {
-    for (const p of result.parts) {
-      if (p.kind === "text") return p.text;
-    }
-  }
+  const collected: string[] = [];
+  const fromParts = collect(result?.parts);
+  if (fromParts) collected.push(fromParts);
+  // Walk artifacts even if parts had text — some producers (Hermes
+  // tool calls) emit a summary in parts AND details in artifacts.
+  // Returning early on parts dropped the artifact body silently.
   if (result?.artifacts) {
     for (const a of result.artifacts) {
-      for (const p of a.parts || []) {
-        if (p.kind === "text") return p.text;
-      }
+      const t = collect(a.parts);
+      if (t) collected.push(t);
     }
   }
-  return "";
+  return collected.join("\n");
 }
 
+// Agent-returned files live on the same response shape as text —
+// delegated to extractFilesFromTask in message-parser.ts, which also
+// walks status.message.parts (that ChatTab's legacy text extractor
+// doesn't). Single source of truth for file-part parsing across
+// live chat, activity log replay, and any future consumers.
+
 /**
  * Load chat history from the activity_logs database via the platform API.
  * Uses source=canvas to only get user-initiated messages (not agent-to-agent).
@@ -71,16 +138,23 @@ async function loadMessagesFromDB(workspaceId: string): Promise<{ messages: Chat
     for (const a of [...activities].reverse()) {
       // Extract user message from request_body
       const userText = extractRequestText(a.request_body);
-      if (userText) {
+      if (userText && !isInternalSelfMessage(userText)) {
         messages.push(createMessage("user", userText));
       }
 
-      // Extract agent response
+      // Extract agent response — text AND any file attachments so a
+      // chat reload surfaces historical download chips, not just plain
+      // text. `result` is nested on successful A2A responses; some
+      // older rows stored the raw `result` payload at the top level,
+      // so fall back to the body itself when `.result` is absent.
       if (a.response_body) {
         const text = extractResponseText(a.response_body);
-        if (text) {
+        const attachments = extractFilesFromTask(
+          (a.response_body.result ?? a.response_body) as Record<string, unknown>,
+        );
+        if (text || attachments.length > 0) {
           const role = a.status === "error" || text.toLowerCase().startsWith("agent error") ? "system" : "agent";
-          messages.push({ ...createMessage(role, text), timestamp: a.created_at });
+          messages.push({ ...createMessage(role, text, attachments), timestamp: a.created_at });
         }
       }
     }
@@ -178,7 +252,16 @@ export function ChatTab({ workspaceId, data }: Props) {
 function MyChatPanel({ workspaceId, data }: Props) {
   const [messages, setMessages] = useState<ChatMessage[]>([]);
   const [input, setInput] = useState("");
-  const [sending, setSending] = useState(!!data.currentTask);
+  // `sending` is strictly the "this tab kicked off a send and hasn't
+  // seen the reply yet" signal. Previously this was initialized from
+  // data.currentTask to pick up in-flight agent work on mount, but
+  // that conflated agent-busy (workspace heartbeat) with user-
+  // in-flight (local send): when the WS dropped a TASK_COMPLETE event,
+  // currentTask lingered, the component re-mounted with sending=true,
+  // and the Send button stayed disabled forever even though nothing
+  // local was in flight. For the "agent is busy, show spinner" UX,
+  // use data.currentTask directly in the render path.
+  const [sending, setSending] = useState(false);
   const [thinkingElapsed, setThinkingElapsed] = useState(0);
   const [activityLog, setActivityLog] = useState<string[]>([]);
   const [loading, setLoading] = useState(true);
@@ -189,6 +272,17 @@ function MyChatPanel({ workspaceId, data }: Props) {
   const [error, setError] = useState<string | null>(null);
   const [confirmRestart, setConfirmRestart] = useState(false);
   const bottomRef = useRef<HTMLDivElement>(null);
+  // Files the user has picked but not yet sent. Cleared on send
+  // (upload success) or by the × on each pill.
+  const [pendingFiles, setPendingFiles] = useState<File[]>([]);
+  const [uploading, setUploading] = useState(false);
+  const fileInputRef = useRef<HTMLInputElement>(null);
+  // Guard against a double-click during the upload phase: React
+  // state updates from the click that started the upload haven't
+  // flushed yet, so the disabled-button logic sees `uploading=false`
+  // from the closure and lets a second `sendMessage` enter. A ref
+  // observes the latest value synchronously.
+  const sendInFlightRef = useRef(false);
 
   // Load chat history from database on mount
   useEffect(() => {
@@ -231,8 +325,10 @@ function MyChatPanel({ workspaceId, data }: Props) {
       // Dedupe in case the agent proactively pushed the same text the
       // HTTP /a2a response already delivered (observed with the Hermes
       // runtime, which emits both a reply body and a send_message_to_user
-      // push for the same content).
-      setMessages((prev) => appendMessageDeduped(prev, createMessage("agent", m.content)));
+      // push for the same content). Attachments ride along with the
+      // message so files returned by the A2A_RESPONSE WS path render
+      // their download chips.
+      setMessages((prev) => appendMessageDeduped(prev, createMessage("agent", m.content, m.attachments)));
     }
     if (sendingFromAPIRef.current && msgs.length > 0) {
       setSending(false);
@@ -277,12 +373,21 @@ function MyChatPanel({ workspaceId, data }: Props) {
       try {
         const msg = JSON.parse(event.data);
         if (msg.event === "ACTIVITY_LOGGED") {
+          // Filter to events for THIS workspace. The platform's
+          // BroadcastOnly fires to every connected client, and
+          // without this guard a sibling workspace's a2a_send would
+          // surface as "→ Delegating to X..." inside the wrong
+          // chat panel. (workspace_id on the WS envelope is the
+          // workspace whose activity_log row we just wrote.)
+          if (msg.workspace_id !== workspaceId) return;
+
           const p = msg.payload || {};
           const type = p.activity_type as string;
           const method = (p.method as string) || "";
           const status = (p.status as string) || "";
           const targetId = (p.target_id as string) || "";
           const durationMs = p.duration_ms as number | undefined;
+          const summary = (p.summary as string) || "";
 
           let line = "";
           if (type === "a2a_receive" && method === "message/send") {
@@ -313,17 +418,23 @@ function MyChatPanel({ workspaceId, data }: Props) {
             const targetName = resolveWorkspaceName(targetId);
             line = `→ Delegating to ${targetName}...`;
           } else if (type === "task_update") {
-            const summary = (p.summary as string) || "";
             if (summary) line = `⟳ ${summary}`;
+          } else if (type === "agent_log") {
+            // Per-tool-use telemetry from claude_sdk_executor's
+            // _report_tool_use. The summary already carries an icon
+            // + human-readable args (📄 Read /path, ⚡ Bash: …)
+            // so we render it verbatim. No icon prefix here — the
+            // emoji at the start of summary is the visual marker.
+            if (summary) line = summary;
           }
 
           if (line) {
-            setActivityLog((prev) => [...prev.slice(-8), line]);
+            setActivityLog((prev) => appendActivityLine(prev, line));
           }
         } else if (msg.event === "TASK_UPDATED" && msg.workspace_id === workspaceId) {
           const task = (msg.payload?.current_task as string) || "";
           if (task) {
-            setActivityLog((prev) => [...prev.slice(-8), `⟳ ${task}`]);
+            setActivityLog((prev) => appendActivityLine(prev, `⟳ ${task}`));
           }
         }
         // A2A_RESPONSE is already consumed by the store and its text is
@@ -339,10 +450,35 @@ function MyChatPanel({ workspaceId, data }: Props) {
 
   const sendMessage = async () => {
     const text = input.trim();
-    if (!text || !agentReachable || sending) return;
+    const filesToSend = pendingFiles;
+    // Allow sending if EITHER text OR attachments are present — a user
+    // can drop a file with no text and the agent still receives it.
+    if ((!text && filesToSend.length === 0) || !agentReachable || sending || uploading) return;
+    // Synchronous re-entry guard — see sendInFlightRef comment.
+    if (sendInFlightRef.current) return;
+    sendInFlightRef.current = true;
+
+    // Upload attachments first so we can include URIs in the A2A
+    // message parts. Sequential-before-send: a message with references
+    // to files not yet staged would fail agent-side; staging happens
+    // synchronously via /chat/uploads before message/send dispatch.
+    let uploaded: ChatAttachment[] = [];
+    if (filesToSend.length > 0) {
+      setUploading(true);
+      try {
+        uploaded = await uploadChatFiles(workspaceId, filesToSend);
+      } catch (e) {
+        setUploading(false);
+        sendInFlightRef.current = false;
+        setError(e instanceof Error ? `Upload failed: ${e.message}` : "Upload failed");
+        return;
+      }
+      setUploading(false);
+    }
 
     setInput("");
-    setMessages((prev) => [...prev, createMessage("user", text)]);
+    setPendingFiles([]);
+    setMessages((prev) => [...prev, createMessage("user", text, uploaded)]);
     setSending(true);
     sendingFromAPIRef.current = true;
     setError(null);
@@ -356,40 +492,228 @@ function MyChatPanel({ workspaceId, data }: Props) {
         parts: [{ kind: "text", text: m.content }],
       }));
 
+    // A2A parts: text part (if any) + file parts (per attachment). The
+    // agent sees both in a single turn, matching the A2A spec shape.
+    const parts: A2APart[] = [];
+    if (text) parts.push({ kind: "text", text });
+    for (const att of uploaded) {
+      parts.push({
+        kind: "file",
+        file: {
+          name: att.name,
+          mimeType: att.mimeType,
+          uri: att.uri,
+          size: att.size,
+        },
+      });
+    }
+
+    // A2A calls can legitimately take minutes — LLM latency +
+    // multi-turn tool use is common on slower providers (Hermes+minimax,
+    // Claude Code invoking bash/file tools, etc.). The 15s default
+    // would silently abort the fetch here, leaving the server to
+    // complete the reply and the user staring at
+    // "agent may be unreachable". Match the upload timeout (60s × 2)
+    // for the happy-path ceiling; anything longer is genuinely stuck.
     api.post<A2AResponse>(`/workspaces/${workspaceId}/a2a`, {
       method: "message/send",
       params: {
         message: {
           role: "user",
           messageId: crypto.randomUUID(),
-          parts: [{ kind: "text", text }],
+          parts,
         },
         metadata: { history },
       },
-    })
+    }, { timeoutMs: 120_000 })
       .then((resp) => {
         // Skip if the WS A2A_RESPONSE event already handled this response.
         // Both paths (WS + HTTP) check sendingFromAPIRef — whichever clears
         // it first wins, the other becomes a no-op (no duplicate messages).
         if (!sendingFromAPIRef.current) return;
         const replyText = extractReplyText(resp);
-        if (replyText) {
-          setMessages((prev) => appendMessageDeduped(prev, createMessage("agent", replyText)));
+        const replyFiles = extractFilesFromTask((resp?.result ?? {}) as Record<string, unknown>);
+        if (replyText || replyFiles.length > 0) {
+          setMessages((prev) =>
+            appendMessageDeduped(prev, createMessage("agent", replyText, replyFiles)),
+          );
         }
         setSending(false);
         sendingFromAPIRef.current = false;
+        sendInFlightRef.current = false;
       })
       .catch(() => {
+        // Same dedup guard as .then(): if a WS path (pendingAgentMsgs
+        // or ACTIVITY_LOGGED a2a_receive ok) already delivered the
+        // reply, sendingFromAPIRef is already false and there's
+        // nothing to roll back. Surfacing "Failed to send" here would
+        // contradict the agent reply the user is currently reading —
+        // exactly the false-positive observed when the HTTP request
+        // hung up (proxy idle / 502) after WS already won.
+        if (!sendingFromAPIRef.current) {
+          sendInFlightRef.current = false;
+          return;
+        }
         setSending(false);
         sendingFromAPIRef.current = false;
+        sendInFlightRef.current = false;
         setError("Failed to send message — agent may be unreachable");
       });
   };
 
+  const onFilesPicked = (fileList: FileList | null) => {
+    if (!fileList) return;
+    const picked = Array.from(fileList);
+    // Deduplicate against current pending set by name+size — user
+    // picking the same file twice shouldn't append it.
+    setPendingFiles((prev) => {
+      const keyed = new Set(prev.map((f) => `${f.name}:${f.size}`));
+      return [...prev, ...picked.filter((f) => !keyed.has(`${f.name}:${f.size}`))];
+    });
+    if (fileInputRef.current) fileInputRef.current.value = "";
+  };
+
+  const removePendingFile = (index: number) =>
+    setPendingFiles((prev) => prev.filter((_, i) => i !== index));
+
+  // Monotonic counter so two paste events within the same wall-clock
+  // second still produce distinct filenames. Without this, on
+  // Firefox (where pasted images have an empty `file.name`), two
+  // pastes ~100ms apart could yield identical synthetic names AND
+  // identical sizes, collapsing into one attachment via the
+  // `name:size` dedup in onFilesPicked.
+  const pasteCounterRef = useRef(0);
+
+  /** Paste-from-clipboard image attachment.
+   *
+   *  Browser clipboard image items arrive as `File`s whose `name` is
+   *  often a generic "image.png" (Chrome) or empty (Firefox/Safari),
+   *  so two consecutive screenshot pastes collide on the name+size
+   *  dedup the file-picker uses. Re-tag each pasted image with a
+   *  per-paste unique name so dedup keeps them apart and the upload
+   *  pipeline (which expects a non-empty filename) is happy.
+   *
+   *  Falls through to onFilesPicked via direct File[] (NOT through
+   *  the DataTransfer constructor — that throws on Safari < 14.1
+   *  and old Edge, silently aborting the paste).
+   *
+   *  Only intercepts the paste when the clipboard has at least one
+   *  image; text-only pastes fall through to the textarea's default
+   *  behaviour. */
+  const mimeToExt = (mime: string): string => {
+    // Avoid raw `mime.split("/")[1]` — that yields `"svg+xml"`,
+    // `"jpeg"`, `"webp"` etc. which produce ugly filenames and may
+    // trip server-side extension allowlists. Map known types
+    // explicitly; unknown falls back to a safe default.
+    if (mime === "image/svg+xml") return "svg";
+    if (mime === "image/jpeg") return "jpg";
+    if (mime === "image/png") return "png";
+    if (mime === "image/gif") return "gif";
+    if (mime === "image/webp") return "webp";
+    if (mime === "image/heic") return "heic";
+    return "png";
+  };
+
+  const onPasteIntoComposer = (e: React.ClipboardEvent<HTMLTextAreaElement>) => {
+    if (!dropEnabled) return;
+    const items = e.clipboardData?.items;
+    if (!items || items.length === 0) return;
+    const imageFiles: File[] = [];
+    for (let i = 0; i < items.length; i++) {
+      const item = items[i];
+      if (!item.type.startsWith("image/")) continue;
+      const file = item.getAsFile();
+      if (!file) continue;
+      const ext = mimeToExt(file.type);
+      const stamp = new Date()
+        .toISOString()
+        .replace(/[:.]/g, "-")
+        .slice(0, 19);
+      const seq = pasteCounterRef.current++;
+      const fname = `pasted-${stamp}-${seq}-${i}.${ext}`;
+      imageFiles.push(new File([file], fname, { type: file.type }));
+    }
+    if (imageFiles.length === 0) return;
+    e.preventDefault();
+    // Reuse the picker path so file-size guards, dedup, and pending-
+    // list state all run through the same code. Build a synthetic
+    // FileList-like object to avoid the DataTransfer constructor —
+    // that's missing on Safari < 14.1 / old Edge and would silently
+    // throw, leaving the paste a no-op.
+    addPastedFiles(imageFiles);
+  };
+
+  // Variant of onFilesPicked that accepts a File[] directly, sidestepping
+  // the DataTransfer-FileList round-trip. Same dedup + state shape.
+  const addPastedFiles = (files: File[]) => {
+    setPendingFiles((prev) => {
+      const keyed = new Set(prev.map((f) => `${f.name}:${f.size}`));
+      return [...prev, ...files.filter((f) => !keyed.has(`${f.name}:${f.size}`))];
+    });
+  };
+
+  // Drag-and-drop staging. dragDepthRef counts enter vs leave events so
+  // the overlay doesn't flicker when the cursor crosses nested children
+  // (textarea, buttons) — dragenter/dragleave fire for every boundary.
+  const [dragOver, setDragOver] = useState(false);
+  const dragDepthRef = useRef(0);
+  const dropEnabled = agentReachable && !sending && !uploading;
+  const isFileDrag = (e: React.DragEvent) =>
+    Array.from(e.dataTransfer.types || []).includes("Files");
+
+  const onDragEnter = (e: React.DragEvent) => {
+    if (!dropEnabled || !isFileDrag(e)) return;
+    e.preventDefault();
+    dragDepthRef.current += 1;
+    setDragOver(true);
+  };
+  const onDragOver = (e: React.DragEvent) => {
+    if (!dropEnabled || !isFileDrag(e)) return;
+    e.preventDefault();
+    e.dataTransfer.dropEffect = "copy";
+  };
+  const onDragLeave = (e: React.DragEvent) => {
+    if (!dropEnabled || !isFileDrag(e)) return;
+    dragDepthRef.current = Math.max(0, dragDepthRef.current - 1);
+    if (dragDepthRef.current === 0) setDragOver(false);
+  };
+  const onDrop = (e: React.DragEvent) => {
+    if (!dropEnabled || !isFileDrag(e)) return;
+    e.preventDefault();
+    dragDepthRef.current = 0;
+    setDragOver(false);
+    onFilesPicked(e.dataTransfer.files);
+  };
+
+  const downloadAttachment = (att: ChatAttachment) => {
+    // Errors here are rare but user-visible (401 on a revoked token,
+    // 404 if the agent deleted the file). Surface via the inline
+    // error banner — the message list itself stays untouched.
+    downloadChatFile(workspaceId, att).catch((e) => {
+      setError(e instanceof Error ? `Download failed: ${e.message}` : "Download failed");
+    });
+  };
+
   const isOnline = data.status === "online" || data.status === "degraded";
 
   return (
-    <div className="flex flex-col h-full">
+    <div
+      className="flex flex-col h-full relative"
+      onDragEnter={onDragEnter}
+      onDragOver={onDragOver}
+      onDragLeave={onDragLeave}
+      onDrop={onDrop}
+    >
+      {dragOver && (
+        <div
+          className="absolute inset-0 z-20 flex items-center justify-center bg-blue-500/10 border-2 border-dashed border-blue-400 rounded pointer-events-none"
+          aria-live="polite"
+        >
+          <div className="bg-zinc-900/90 border border-blue-400/50 rounded-lg px-4 py-2 text-xs text-blue-200">
+            Drop to attach
+          </div>
+        </div>
+      )}
       {/* Messages */}
       <div className="flex-1 overflow-y-auto p-3 space-y-3">
         {loading && (
@@ -435,9 +759,23 @@ function MyChatPanel({ workspaceId, data }: Props) {
                     : "bg-zinc-800/80 text-zinc-200 border border-zinc-700/30"
               }`}
             >
-              <div className="prose prose-sm prose-invert max-w-none [&>p]:mb-1 [&>p:last-child]:mb-0">
-                <ReactMarkdown remarkPlugins={[remarkGfm]}>{msg.content}</ReactMarkdown>
-              </div>
+              {msg.content && (
+                <div className="prose prose-sm prose-invert max-w-none [&>p]:mb-1 [&>p:last-child]:mb-0">
+                  <ReactMarkdown remarkPlugins={[remarkGfm]}>{msg.content}</ReactMarkdown>
+                </div>
+              )}
+              {msg.attachments && msg.attachments.length > 0 && (
+                <div className={`flex flex-wrap gap-1 ${msg.content ? "mt-1.5" : ""}`}>
+                  {msg.attachments.map((att, i) => (
+                    <AttachmentChip
+                      key={`${msg.id}-${i}`}
+                      attachment={att}
+                      onDownload={downloadAttachment}
+                      tone={msg.role === "user" ? "user" : "agent"}
+                    />
+                  ))}
+                </div>
+              )}
               <div className="text-[9px] text-zinc-500 mt-1">
                 {new Date(msg.timestamp).toLocaleTimeString()}
               </div>
@@ -445,8 +783,11 @@ function MyChatPanel({ workspaceId, data }: Props) {
           </div>
         ))}
 
-        {/* Thinking indicator */}
-        {sending && (
+        {/* Thinking indicator — shows when this tab is awaiting a reply
+           OR when the workspace heartbeat reports an in-flight task
+           (covers the "agent is already busy when I open the tab" case
+           without locking the Send button on a stale currentTask). */}
+        {(sending || !!data.currentTask) && (
           <div className="flex justify-start">
             <div className="bg-zinc-800/50 border border-zinc-700/30 rounded-lg px-3 py-2 max-w-[85%]">
               <div className="flex items-center gap-2 text-xs text-zinc-400">
@@ -490,7 +831,37 @@ function MyChatPanel({ workspaceId, data }: Props) {
 
       {/* Input */}
       <div className="p-3 border-t border-zinc-800">
-        <div className="flex gap-2">
+        {pendingFiles.length > 0 && (
+          <div className="flex flex-wrap gap-1.5 mb-2">
+            {pendingFiles.map((f, i) => (
+              <PendingAttachmentPill
+                key={`${f.name}-${f.size}-${i}`}
+                file={f}
+                onRemove={() => removePendingFile(i)}
+              />
+            ))}
+          </div>
+        )}
+        <div className="flex gap-2 items-end">
+          <input
+            ref={fileInputRef}
+            type="file"
+            multiple
+            className="hidden"
+            onChange={(e) => onFilesPicked(e.target.files)}
+            aria-hidden="true"
+          />
+          <button
+            onClick={() => fileInputRef.current?.click()}
+            disabled={!agentReachable || sending || uploading}
+            aria-label="Attach file"
+            title="Attach file"
+            className="p-2 bg-zinc-800 hover:bg-zinc-700 border border-zinc-700 rounded-lg text-zinc-400 hover:text-zinc-200 transition-colors shrink-0 disabled:opacity-40"
+          >
+            <svg width="14" height="14" viewBox="0 0 16 16" fill="none" aria-hidden="true">
+              <path d="M11 6.5 7 10.5a2 2 0 1 0 2.8 2.8l4-4a3.5 3.5 0 0 0-5-5l-4.5 4.5a5 5 0 0 0 7 7l4-4" stroke="currentColor" strokeWidth="1.4" strokeLinecap="round" strokeLinejoin="round" />
+            </svg>
+          </button>
           <textarea
             aria-label="Message to agent"
             value={input}
@@ -501,17 +872,18 @@ function MyChatPanel({ workspaceId, data }: Props) {
                 sendMessage();
               }
             }}
-            placeholder={agentReachable ? "Send a message... (Shift+Enter for new line)" : `Agent is ${data.status}`}
+            onPaste={onPasteIntoComposer}
+            placeholder={agentReachable ? "Send a message... (Shift+Enter for new line, paste images to attach)" : `Agent is ${data.status}`}
             disabled={!agentReachable || sending}
             rows={1}
             className="flex-1 bg-zinc-800 border border-zinc-700 rounded-lg px-3 py-2 text-xs text-zinc-200 placeholder-zinc-500 focus:outline-none focus:border-blue-500 resize-none disabled:opacity-50"
           />
           <button
             onClick={sendMessage}
-            disabled={!input.trim() || !agentReachable || sending}
+            disabled={(!input.trim() && pendingFiles.length === 0) || !agentReachable || sending || uploading}
             className="px-4 py-2 bg-blue-600 hover:bg-blue-500 text-xs font-medium rounded-lg text-white disabled:opacity-30 transition-colors shrink-0"
           >
-            Send
+            {uploading ? "Uploading…" : "Send"}
           </button>
         </div>
       </div>
diff --git a/canvas/src/components/tabs/ConfigTab.tsx b/canvas/src/components/tabs/ConfigTab.tsx
index 9ef1f0d4..3fa14679 100644
--- a/canvas/src/components/tabs/ConfigTab.tsx
+++ b/canvas/src/components/tabs/ConfigTab.tsx
@@ -105,12 +105,17 @@ interface RuntimeOption {
 // Fallback used when /templates can't be fetched (offline, older backend).
 // Keep in sync with manifest.json workspace_templates as a defensive default.
 // Model + env suggestions only flow when the backend is reachable.
+//
 // Runtimes that manage their own config outside the platform's config.yaml
-// template. For these, a missing config.yaml is expected — the user manages
-// config via the runtime's own mechanism (e.g. hermes edits
-// ~/.hermes/config.yaml on the workspace EC2 via the Terminal tab or its
-// own CLI). Showing a "No config.yaml found" error for these is misleading.
-const RUNTIMES_WITH_OWN_CONFIG = new Set<string>(["hermes", "external"]);
+// template. For these, a missing config.yaml is expected and the form
+// genuinely can't edit the runtime's settings (there's no platform file
+// to write). Hermes is NOT on this list: it DOES ship a platform
+// config.yaml via workspace-configs-templates/hermes that controls model,
+// runtime_config, required_env, etc. Editing it through this form is
+// exactly the point of the platform adaptor. The deep `~/.hermes/
+// config.yaml` on the container is a separate runtime-internal file,
+// not this one.
+const RUNTIMES_WITH_OWN_CONFIG = new Set<string>(["external"]);
 
 const FALLBACK_RUNTIME_OPTIONS: RuntimeOption[] = [
   { value: "", label: "LangGraph (default)", models: [] },
@@ -152,9 +157,11 @@ export function ConfigTab({ workspaceId }: Props) {
     // default `LangGraph`. See GH #1894.
     let wsMetadataRuntime = "";
     let wsMetadataModel = "";
+    let wsMetadataTier: number | null = null;
     try {
-      const ws = await api.get<{ runtime?: string }>(`/workspaces/${workspaceId}`);
+      const ws = await api.get<{ runtime?: string; tier?: number }>(`/workspaces/${workspaceId}`);
       wsMetadataRuntime = (ws.runtime || "").trim();
+      if (typeof ws.tier === "number") wsMetadataTier = ws.tier;
     } catch { /* fall back to config.yaml */ }
     try {
       const m = await api.get<{ model?: string }>(`/workspaces/${workspaceId}/model`);
@@ -166,11 +173,15 @@ export function ConfigTab({ workspaceId }: Props) {
       const parsed = parseYaml(res.content);
       setOriginalYaml(res.content);
       setRawDraft(res.content);
-      // Merge: config.yaml wins for fields it declares, but workspace metadata
-      // wins for runtime + model when config.yaml doesn't set them.
+      // Merge: workspace-row metadata is authoritative for the DB-backed
+      // fields (tier, runtime, model). config.yaml often lags — handleSave
+      // PATCHes tier/runtime directly and a template snapshot in the
+      // container can differ from the live row. Show the DB value so the
+      // form doesn't contradict the node badge (issue: badge=T3, form=T2).
       const merged = { ...DEFAULT_CONFIG, ...parsed } as ConfigData;
-      if (!merged.runtime && wsMetadataRuntime) merged.runtime = wsMetadataRuntime;
-      if (!merged.model && wsMetadataModel) merged.model = wsMetadataModel;
+      if (wsMetadataRuntime) merged.runtime = wsMetadataRuntime;
+      if (wsMetadataModel) merged.model = wsMetadataModel;
+      if (wsMetadataTier !== null) merged.tier = wsMetadataTier;
       setConfig(merged);
     } catch {
       // No platform-managed config.yaml. Some runtimes (hermes, external)
@@ -185,6 +196,7 @@ export function ConfigTab({ workspaceId }: Props) {
         ...DEFAULT_CONFIG,
         runtime: wsMetadataRuntime,
         model: wsMetadataModel,
+        ...(wsMetadataTier !== null ? { tier: wsMetadataTier } : {}),
       } as ConfigData);
     } finally {
       setLoading(false);
diff --git a/canvas/src/components/tabs/DetailsTab.tsx b/canvas/src/components/tabs/DetailsTab.tsx
index 8bd0d69a..211b1637 100644
--- a/canvas/src/components/tabs/DetailsTab.tsx
+++ b/canvas/src/components/tabs/DetailsTab.tsx
@@ -36,7 +36,7 @@ export function DetailsTab({ workspaceId, data }: Props) {
   const [restartError, setRestartError] = useState<string | null>(null);
   const [consoleOpen, setConsoleOpen] = useState(false);
   const updateNodeData = useCanvasStore((s) => s.updateNodeData);
-  const removeNode = useCanvasStore((s) => s.removeNode);
+  const removeSubtree = useCanvasStore((s) => s.removeSubtree);
   const selectNode = useCanvasStore((s) => s.selectNode);
   // Ref for the "Delete Workspace" trigger — Cancel returns focus here
   const deleteButtonRef = useRef<HTMLButtonElement>(null);
@@ -94,7 +94,11 @@ export function DetailsTab({ workspaceId, data }: Props) {
     setDeleteError(null);
     try {
       await api.del(`/workspaces/${workspaceId}?confirm=true`);
-      removeNode(workspaceId);
+      // Mirror the server-side cascade — drop the row + every
+      // descendant locally so the canvas reflects the deletion
+      // immediately, even when the WS is dead and the per-descendant
+      // WORKSPACE_REMOVED events never arrive.
+      removeSubtree(workspaceId);
       selectNode(null);
     } catch (e) {
       setDeleteError(e instanceof Error ? e.message : "Failed to delete");
diff --git a/canvas/src/components/tabs/SkillsTab.tsx b/canvas/src/components/tabs/SkillsTab.tsx
index d046f070..83207e10 100644
--- a/canvas/src/components/tabs/SkillsTab.tsx
+++ b/canvas/src/components/tabs/SkillsTab.tsx
@@ -6,6 +6,14 @@ import { useCanvasStore, summarizeWorkspaceCapabilities, type WorkspaceNodeData
 import { showToast } from "../Toaster";
 
 interface Props {
+  // The workspace's id is NOT a field on WorkspaceNodeData — that
+  // interface is the React Flow `node.data` blob, while the id lives
+  // on `node.id`. Pass it explicitly (matches every other tab in
+  // SidePanel) so the install/uninstall API calls don't end up
+  // POSTing to /workspaces/undefined/plugins. The interface extending
+  // Record<string, unknown> meant TypeScript silently typed
+  // `data.id` as `unknown` instead of erroring — easy to miss.
+  workspaceId: string;
   data: WorkspaceNodeData;
 }
 
@@ -40,7 +48,7 @@ interface SourceSchemesResponse {
 // Delay before reloading installed plugins after install/uninstall (workspace restarts)
 const PLUGIN_RELOAD_DELAY_MS = 15_000;
 
-export function SkillsTab({ data }: Props) {
+export function SkillsTab({ workspaceId, data }: Props) {
   const capability = summarizeWorkspaceCapabilities(data);
   const skills = useMemo(() => extractSkills(data.agentCard), [data.agentCard]);
   const setPanelTab = useCanvasStore((s) => s.setPanelTab);
@@ -57,32 +65,115 @@ export function SkillsTab({ data }: Props) {
   const reloadTimerRef = useRef<ReturnType<typeof setTimeout>>(undefined);
 
   useEffect(() => {
+    // Re-init `mountedRef.current = true` on every mount. React 18
+    // StrictMode (Next.js dev) double-invokes effects: mount →
+    // cleanup → mount. Without this re-init, the first cleanup sets
+    // mountedRef.current = false, the re-mount runs the effect body
+    // again but never restores the flag, so every subsequent
+    // `if (mountedRef.current) setX(...)` guard skips and the
+    // component appears wedged: fetches complete, state never
+    // updates, "Loading…" sits forever. Production doesn't double-
+    // invoke so the bug only surfaces in dev — but dev is where we
+    // see it, and the cost of being explicit is one assignment.
+    mountedRef.current = true;
     return () => {
       mountedRef.current = false;
       clearTimeout(reloadTimerRef.current);
     };
   }, []);
 
-  const workspaceId = data.id;
+  // Tracks whether loadInstalled has completed at least once (success
+  // or empty-array success — NOT failure). Without this the auto-
+  // expand effect below would fire on the initial render where
+  // `installed.length === 0` simply because the fetch hasn't returned
+  // yet, and worse, would also fire if the fetch throws (network
+  // blip, auth failure) — both cases falsely look like "no plugins
+  // installed". Gating on a separate "loaded" flag avoids the false
+  // positive.
+  const [installedLoaded, setInstalledLoaded] = useState(false);
 
   const loadInstalled = useCallback(async () => {
     try {
       const result = await api.get<PluginInfo[]>(`/workspaces/${workspaceId}/plugins`);
-      if (mountedRef.current) setInstalled(Array.isArray(result) ? result : []);
+      if (mountedRef.current) {
+        setInstalled(Array.isArray(result) ? result : []);
+        setInstalledLoaded(true);
+      }
     } catch (e) {
       console.warn("SkillsTab: installed plugins load failed", e);
     }
   }, [workspaceId]);
 
-  const loadRegistry = useCallback(async () => {
+  // registry-load lifecycle so the UI can show "Loading…" / error /
+  // retry instead of an indistinguishable "No plugins in registry"
+  // banner whether the fetch is in-flight, errored, or genuinely
+  // returned []. The previous silent console.warn-only path made
+  // an auth failure or CORS blip look identical to an empty
+  // registry — exactly the diagnosis dead-end observed when the
+  // server returned 20 plugins via curl but the canvas showed 0.
+  const [registryLoading, setRegistryLoading] = useState(false);
+  const [registryError, setRegistryError] = useState<string | null>(null);
+
+  // Synchronous gate against concurrent loadRegistry runs. Refs survive
+  // Fast Refresh re-renders (ref objects persist across re-runs of
+  // the function body), so a previously-stranded fetch can pin this
+  // ref at true and block every subsequent loadRegistry call. The
+  // `force` parameter on loadRegistry below provides the user-driven
+  // escape hatch for that wedge.
+  const registryFetchInFlight = useRef(false);
+
+  // Reset the in-flight gate on unmount so a Fast Refresh that
+  // tears down + recreates the component without a full page reload
+  // doesn't carry the stuck-true value into the new instance via
+  // dev-server-preserved module state.
+  useEffect(() => {
+    return () => {
+      registryFetchInFlight.current = false;
+    };
+  }, []);
+
+  const loadRegistry = useCallback(async (force = false) => {
+    // Default callers (mount effect, button while not loading) honour
+    // the gate. Explicit force=true callers (Retry button) bypass it
+    // — the user is signalling "forget whatever you thought was in
+    // flight, fetch again now".
+    if (!force && registryFetchInFlight.current) return;
+    registryFetchInFlight.current = true;
+    setRegistryLoading(true);
+    setRegistryError(null);
     try {
-      const result = await api.get<PluginInfo[]>("/plugins");
+      // 10s timeout — tighter than the 15s default. Plugin registry
+      // is local-disk-backed on the platform host (server reads
+      // pluginsDir entries) so a 10s budget is generous. Without
+      // an explicit timeout the UI's "Loading registry…" can sit
+      // for the full 15s + any browser hop time when a Fast
+      // Refresh strands an in-flight promise.
+      const result = await api.get<PluginInfo[]>("/plugins", { timeoutMs: 10_000 });
       if (mountedRef.current) setRegistry(Array.isArray(result) ? result : []);
     } catch (e) {
-      // Registry is the AVAILABLE PLUGINS list. Silent failure here
-      // left the user seeing "No plugins in registry" with no clue
-      // it was a fetch error — log it so devtools shows the cause.
       console.warn("SkillsTab: registry load failed", e);
+      if (mountedRef.current) {
+        // Detect timeout/abort by DOMException.name first — that's
+        // the canonical signal across browsers. Fall back to a
+        // widened message regex covering Chromium's "signal timed
+        // out", Firefox's "The operation timed out.", Safari's
+        // "Aborted". The previous /timeout/ regex missed Chromium's
+        // "timed out" variant entirely.
+        const name = (e as { name?: string })?.name ?? "";
+        const msg = e instanceof Error ? e.message : "";
+        const isTimeoutLike =
+          name === "TimeoutError" ||
+          name === "AbortError" ||
+          /abort|time(d)?\s*out/i.test(msg);
+        setRegistryError(
+          isTimeoutLike
+            ? "Registry fetch timed out (10s). The platform server may be slow or unreachable."
+            : msg || "Failed to load registry",
+        );
+      }
+    } finally {
+      registryFetchInFlight.current = false;
+      if (mountedRef.current) setRegistryLoading(false);
     }
   }, []);
 
@@ -102,17 +193,73 @@ export function SkillsTab({ data }: Props) {
     loadSourceSchemes();
   }, [loadInstalled, loadRegistry, loadSourceSchemes]);
 
+  // First-time experience: if the workspace has zero plugins
+  // installed but the platform's registry has options to choose
+  // from, expand the registry by default so the user sees what's
+  // available without an extra click. Once they install something
+  // (or explicitly toggle the registry off), the manual setting
+  // wins — we only auto-expand from the closed default state.
+  const hasAutoExpandedRef = useRef(false);
+  useEffect(() => {
+    if (hasAutoExpandedRef.current) return;
+    if (installedLoaded && installed.length === 0 && registry.length > 0) {
+      setShowRegistry(true);
+      hasAutoExpandedRef.current = true;
+    }
+  }, [installedLoaded, installed.length, registry.length]);
+
   const installedNames = useMemo(() => new Set(installed.map((p) => p.name)), [installed]);
 
   // Install always goes through the source-based API. For registry
   // plugins we build the local:// source on the fly; custom sources
   // (github://, clawhub://, …) are typed into the input below.
-  const installFromSource = async (source: string, labelOverride?: string) => {
+  //
+  // Optional `optimistic` parameter mirrors the uninstall flow's local
+  // state mutation. Without it, the user sees the button revert from
+  // "Installing..." → "Install" the instant the POST returns, and the
+  // green "Installed" tag doesn't appear for ~15s while we wait out
+  // PLUGIN_RELOAD_DELAY_MS for the workspace restart before refetching.
+  // 15s of staring at the same button feels broken. Pushing the
+  // registry entry into `installed` immediately makes the UI reflect
+  // the install instantly; the delayed loadInstalled() reconciles
+  // anything we got wrong (or any server-side filtering we don't
+  // know about locally).
+  const installFromSource = async (
+    source: string,
+    labelOverride?: string,
+    optimistic?: PluginInfo,
+  ) => {
     const label = labelOverride ?? source;
     setInstalling(label);
     try {
       await api.post(`/workspaces/${workspaceId}/plugins`, { source });
       showToast(`Installed ${label} — restarting workspace`, "success");
+      if (optimistic && mountedRef.current) {
+        // Push with `supported_on_runtime` left undefined — the
+        // server's ListInstalled annotates the real value (true /
+        // false) at refetch time. Forcing `true` here would hide the
+        // "inert on this runtime" badge for 15s if the user
+        // installed a plugin that doesn't actually support the
+        // workspace's runtime; the badge only renders on `=== false`,
+        // so undefined keeps it neutral until reconciliation arrives.
+        setInstalled((prev) =>
+          prev.some((p) => p.name === optimistic.name)
+            ? prev
+            : [...prev, { ...optimistic, supported_on_runtime: undefined }],
+        );
+        // Note: we intentionally do NOT set `installedLoaded` here.
+        // That flag means "the initial GET has succeeded at least
+        // once" and gates the auto-expand-registry effect. A fast
+        // optimistic install BEFORE the initial fetch returns must
+        // not flip the gate, or the auto-expand never fires and a
+        // followup loadInstalled racing with the optimistic write
+        // could overwrite our entry with [] mid-restart.
+      }
+      // Drop any prior reload timer before scheduling a new one —
+      // back-to-back installs within PLUGIN_RELOAD_DELAY_MS would
+      // otherwise queue multiple loadInstalled() calls and the
+      // unmount cleanup only clears the latest handle.
+      clearTimeout(reloadTimerRef.current);
       reloadTimerRef.current = setTimeout(() => loadInstalled(), PLUGIN_RELOAD_DELAY_MS);
     } catch (e) {
       showToast(e instanceof Error ? e.message : "Install failed", "error");
@@ -121,7 +268,10 @@ export function SkillsTab({ data }: Props) {
     }
   };
 
-  const handleInstall = (pluginName: string) => installFromSource(`local://${pluginName}`, pluginName);
+  const handleInstall = (pluginName: string) => {
+    const entry = registry.find((p) => p.name === pluginName);
+    return installFromSource(`local://${pluginName}`, pluginName, entry);
+  };
 
   const handleInstallCustom = async () => {
     const source = customSource.trim();
@@ -133,9 +283,12 @@ export function SkillsTab({ data }: Props) {
   const handleUninstall = async (pluginName: string) => {
     setUninstalling(pluginName);
     try {
-      await api.del(`/workspaces/${data.id}/plugins/${pluginName}`);
+      await api.del(`/workspaces/${workspaceId}/plugins/${pluginName}`);
       showToast(`Removed ${pluginName} — restarting workspace`, "success");
       setInstalled((prev) => prev.filter((p) => p.name !== pluginName));
+      // Drop any prior reload timer (see installFromSource for the
+      // back-to-back-action leak rationale).
+      clearTimeout(reloadTimerRef.current);
       reloadTimerRef.current = setTimeout(() => loadInstalled(), PLUGIN_RELOAD_DELAY_MS);
     } catch (e) {
       showToast(e instanceof Error ? e.message : "Uninstall failed", "error");
@@ -264,9 +417,53 @@ export function SkillsTab({ data }: Props) {
                 Local registry plugins below; paste any scheme URL above for GitHub or other sources.
               </div>
             </div>
-            <div className="text-[10px] uppercase tracking-[0.2em] text-zinc-600 mb-2">Available plugins</div>
-            {registry.length === 0 ? (
-              <div className="text-[10px] text-zinc-600">No plugins in registry</div>
+            <div className="flex items-center justify-between mb-2">
+              <div className="text-[10px] uppercase tracking-[0.2em] text-zinc-600">Available plugins</div>
+              {/* Retry visible whenever registry is empty — including
+                  the loading state — so a stuck fetch (Fast Refresh
+                  stranded promise, slow server, browser quirk) has a
+                  user-driven escape hatch. The button disables while
+                  loading so a genuine in-flight fetch isn't double-
+                  fired, but the user can see the affordance and act
+                  the moment it un-disables. */}
+              {registry.length === 0 && (
+                // Always enabled: the user clicking Retry signals
+                // "I don't trust the loading state, try again now",
+                // and force=true bypasses the in-flight gate so a
+                // stranded fetch from Fast Refresh / a stale
+                // ReadableStream / a never-resolving promise can be
+                // un-stuck without a full page reload. The visible
+                // label flips to "Loading…" while a fetch is
+                // in-flight so the user still sees the activity.
+                <button
+                  type="button"
+                  onClick={() => loadRegistry(true)}
+                  className="text-[10px] text-violet-300 hover:text-violet-200 underline-offset-2 hover:underline"
+                >
+                  {registryLoading ? "Loading… click to retry" : "Retry"}
+                </button>
+              )}
+            </div>
+            {registryLoading && registry.length === 0 ? (
+              <div className="text-[10px] text-zinc-500">Loading registry…</div>
+            ) : registryError ? (
+              <div className="rounded-lg border border-red-800/40 bg-red-950/20 px-2 py-1.5">
+                <div className="text-[10px] text-red-300 font-semibold mb-0.5">
+                  Couldn't load the plugin registry
+                </div>
+                <div className="text-[10px] text-red-400/80">{registryError}</div>
+                <div className="mt-1 text-[10px] text-zinc-500">
+                  Check the platform server is reachable at /plugins. The Retry button is in the header above.
+                </div>
+              </div>
+            ) : registry.length === 0 ? (
+              <div className="rounded-lg border border-zinc-800/40 bg-zinc-950/40 px-2 py-1.5">
+                <div className="text-[10px] text-zinc-400 mb-0.5">Registry returned 0 plugins.</div>
+                <div className="text-[10px] text-zinc-600">
+                  This usually means the platform's plugins/ directory is empty.
+                  Run scripts/clone-manifest.sh to populate it from the standalone repos.
+                </div>
+              </div>
             ) : (
               <div className="space-y-1.5">
                 {registry.map((p) => {
diff --git a/canvas/src/components/tabs/__tests__/ConfigTab.hermes.test.tsx b/canvas/src/components/tabs/__tests__/ConfigTab.hermes.test.tsx
index a685a2f3..79f6a36e 100644
--- a/canvas/src/components/tabs/__tests__/ConfigTab.hermes.test.tsx
+++ b/canvas/src/components/tabs/__tests__/ConfigTab.hermes.test.tsx
@@ -128,7 +128,13 @@ describe("ConfigTab — hermes workspace", () => {
     });
   });
 
-  it("shows hermes-specific info banner pointing to Terminal tab (#1894)", async () => {
+  it("does NOT show the hermes-specific info banner (removed in #2061)", async () => {
+    // Banner-text inversion: the multilevel-layout-UX PR drops "hermes"
+    // from RUNTIMES_WITH_OWN_CONFIG (now {"external"} only). Hermes now
+    // shows the normal Config form — the banner "Hermes manages its own
+    // config" is reserved for the "external" runtime, not hermes itself.
+    // If this ever flips back, revisit the banner/error UX before
+    // unpinning this assertion.
     wireApi({
       workspaceRuntime: "hermes",
       configYamlContent: null,
@@ -137,9 +143,11 @@ describe("ConfigTab — hermes workspace", () => {
 
     render(<ConfigTab workspaceId="ws-test" />);
 
-    await waitFor(() => {
-      expect(screen.getByText(/Hermes manages its own config/i)).toBeTruthy();
-    });
+    // Wait for the render+loads to settle (template list drives the runtime combobox).
+    await waitFor(() =>
+      screen.getByRole("combobox", { name: /runtime/i }),
+    );
+    expect(screen.queryByText(/Hermes manages its own config/i)).toBeNull();
   });
 
   it("DOES show 'No config.yaml found' error for langgraph workspace (default runtime)", async () => {
@@ -161,14 +169,28 @@ describe("ConfigTab — hermes workspace", () => {
 });
 
 describe("ConfigTab — config.yaml on disk", () => {
-  it("config.yaml runtime/model wins when present, workspace metadata is fallback", async () => {
-    // If the workspace DB has runtime=langgraph but config.yaml declares
-    // runtime: crewai, the form should show crewai (config.yaml wins).
-    // Prevents silent runtime drift across reads.
+  it("workspace metadata (DB) wins over config.yaml when both are present (#2061)", async () => {
+    // Priority inversion in #2061: previously config.yaml overrode DB, so
+    // the tier-on-node badge and runtime-in-form could drift when the
+    // user edited config.yaml on disk. The multilevel-layout-UX PR made
+    // the DB authoritative — config.yaml is read for non-DB keys (tools,
+    // MCP server list, etc.) but runtime/model/tier come from the
+    // workspace row so the node badge matches the form.
+    //
+    // Scenario: DB says "hermes", config.yaml says "crewai". The form
+    // must show hermes (DB wins).
+    //
+    // We pick hermes (not langgraph) on the DB side because "langgraph"
+    // is collapsed to the empty-string "LangGraph (default)" option in
+    // the runtime dropdown — so a "langgraph" DB value would render as
+    // the empty-valued option and obscure whether the DB-wins logic
+    // actually fired. Hermes has its own non-empty option value and
+    // gives the assertion a clean signal.
     wireApi({
-      workspaceRuntime: "langgraph", // DB
+      workspaceRuntime: "hermes", // DB — authoritative
       configYamlContent: 'runtime: crewai\nmodel: "claude-opus"\n',
       templates: [
+        { id: "t-hermes", name: "Hermes", runtime: "hermes", models: [] },
         { id: "t-crewai", name: "CrewAI", runtime: "crewai", models: [] },
       ],
     });
@@ -176,6 +198,6 @@ describe("ConfigTab — config.yaml on disk", () => {
     render(<ConfigTab workspaceId="ws-test" />);
 
     const select = await waitFor(() => screen.getByRole("combobox", { name: /runtime/i }));
-    expect((select as HTMLSelectElement).value).toBe("crewai");
+    expect((select as HTMLSelectElement).value).toBe("hermes");
   });
 });
diff --git a/canvas/src/components/tabs/chat/AgentCommsPanel.tsx b/canvas/src/components/tabs/chat/AgentCommsPanel.tsx
index 7315e7be..909616bf 100644
--- a/canvas/src/components/tabs/chat/AgentCommsPanel.tsx
+++ b/canvas/src/components/tabs/chat/AgentCommsPanel.tsx
@@ -1,13 +1,17 @@
 "use client";
 
 import { useState, useEffect, useRef } from "react";
+import ReactMarkdown from "react-markdown";
+import remarkGfm from "remark-gfm";
 import { api } from "@/lib/api";
 import { useCanvasStore, type WorkspaceNodeData } from "@/store/canvas";
 import { WS_URL } from "@/store/socket";
 import { closeWebSocketGracefully } from "@/lib/ws-close";
+import { showToast } from "../../Toaster";
 import { extractResponseText, extractRequestText } from "./message-parser";
+import { inferA2AErrorHint } from "./a2aErrorHint";
 
-interface ActivityEntry {
+export interface ActivityEntry {
   id: string;
   activity_type: string;
   source_id: string | null;
@@ -22,11 +26,29 @@ interface ActivityEntry {
 
 interface CommMessage {
   id: string;
-  direction: "in" | "out";
+  /** UI-facing flow from THIS workspace's point of view:
+   *
+   *    "out" — this workspace either initiated the call (a2a_send)
+   *            OR self-logged the reply from a peer it had called
+   *            (a2a_receive with source_id == workspaceId).
+   *    "in"  — a peer initiated the call to us (a2a_receive with
+   *            source_id != workspaceId).
+   *
+   *  Distinct from activity_type because the agent runtime self-
+   *  logs its outbound calls' replies as `a2a_receive` rows; without
+   *  this normalisation the UI labels would render those as
+   *  incoming ("← From X") and right-justify them on the wrong
+   *  side, even though from the user's perspective the call WAS
+   *  outgoing. See toCommMessage for the resolution rules. */
+  flow: "in" | "out";
   peerName: string;
   peerId: string;
   text: string;
   responseText: string | null;
+  /** "ok" | "error" — surfaces failed deliveries with their own
+   *  visual treatment + recovery actions instead of an opaque
+   *  "[A2A_ERROR]" body the user can't act on. */
+  status: string;
   timestamp: string;
 }
 
@@ -36,9 +58,31 @@ function resolveName(id: string): string {
   return (node?.data as WorkspaceNodeData)?.name || id.slice(0, 8);
 }
 
-function toCommMessage(entry: ActivityEntry, workspaceId: string): CommMessage | null {
-  const isOutgoing = entry.activity_type === "a2a_send";
-  const peerId = isOutgoing ? (entry.target_id || "") : (entry.source_id || "");
+export function toCommMessage(entry: ActivityEntry, workspaceId: string): CommMessage | null {
+  // a2a_receive activity rows come in two shapes:
+  //
+  //   1. Real incoming call (a peer called us): source_id = the peer,
+  //      target_id = us. peerId is source_id, flow is "in".
+  //
+  //   2. Self-logged response to an outbound call (the workspace's own
+  //      runtime calls report_activity("a2a_receive", ...) after
+  //      delegating; see workspace/a2a_tools.py:181). source_id =
+  //      our own workspace_id, target_id = the peer that replied.
+  //      peerId must come from target_id (otherwise the peer-name
+  //      resolves to "us" and Restart would target THIS workspace),
+  //      and flow is "out" — from the user's perspective this row
+  //      belongs to the outbound thread, not an incoming one.
+  //
+  // a2a_send rows are always outbound from us: source_id = us,
+  // target_id = the peer.
+  const isSendActivity = entry.activity_type === "a2a_send";
+  const isSelfLoggedReceive =
+    entry.activity_type === "a2a_receive" && entry.source_id === workspaceId;
+  const flow: "in" | "out" = isSendActivity || isSelfLoggedReceive ? "out" : "in";
+  const peerId =
+    isSendActivity || isSelfLoggedReceive
+      ? entry.target_id || ""
+      : entry.source_id || "";
   if (!peerId) return null;
 
   const text = extractRequestText(entry.request_body) || entry.summary || "";
@@ -46,15 +90,35 @@ function toCommMessage(entry: ActivityEntry, workspaceId: string): CommMessage |
 
   return {
     id: entry.id,
-    direction: isOutgoing ? "out" : "in",
+    flow,
     peerName: resolveName(peerId),
     peerId,
     text,
     responseText,
+    status: entry.status || "ok",
     timestamp: entry.created_at,
   };
 }
 
+/** Strip the [A2A_ERROR] sentinel prefix the workspace runtime adds
+ *  to failed delegation responses, so the UI can render the underlying
+ *  message (or fall back to a generic explanation when the inner text
+ *  is empty — currently common because httpx exceptions often
+ *  stringify as ""). */
+const A2A_ERROR_PREFIX = "[A2A_ERROR]";
+
+function unwrapErrorText(raw: string | null): string {
+  if (!raw) return "";
+  const trimmed = raw.trim();
+  if (trimmed.startsWith(A2A_ERROR_PREFIX)) {
+    return trimmed.slice(A2A_ERROR_PREFIX.length).trim();
+  }
+  return trimmed;
+}
+
+// inferA2AErrorHint moved to ./a2aErrorHint so the Activity tab and
+// this panel render identical hints for the same symptom.
+
 export function AgentCommsPanel({ workspaceId }: { workspaceId: string }) {
   const [messages, setMessages] = useState<CommMessage[]>([]);
   const [loading, setLoading] = useState(true);
@@ -67,22 +131,45 @@ export function AgentCommsPanel({ workspaceId }: { workspaceId: string }) {
     setLoading(true);
     api.get<ActivityEntry[]>(`/workspaces/${workspaceId}/activity?source=agent&limit=50`)
       .then((entries) => {
-        const filtered = entries
+        const filtered = (entries ?? [])
           .filter((e) => e.activity_type === "a2a_send" || e.activity_type === "a2a_receive")
           .reverse();
         const msgs: CommMessage[] = [];
         for (const e of filtered) {
-          const m = toCommMessage(e, workspaceId);
-          if (m) {
-            const key = `${m.timestamp}:${m.direction}:${m.peerId}`;
-            msgs.push(m);
-            seenKeys.current.add(key);
+          // Per-row try/catch so a single malformed activity row
+          // (e.g. unexpected request_body shape) doesn't kill the
+          // batch — the previous code threw out of the for-loop and
+          // setMessages([3 items]) never ran, leaving the panel
+          // stuck on the empty state with no diagnostic in the
+          // console because the outer .catch silently swallowed
+          // everything.
+          try {
+            const m = toCommMessage(e, workspaceId);
+            if (m) {
+              const key = `${m.timestamp}:${m.flow}:${m.peerId}`;
+              msgs.push(m);
+              seenKeys.current.add(key);
+            }
+          } catch (rowErr) {
+            console.warn(
+              "AgentCommsPanel: failed to map activity row",
+              { id: e.id, type: e.activity_type, err: rowErr },
+            );
           }
         }
         setMessages(msgs);
         setLoading(false);
       })
-      .catch(() => setLoading(false));
+      .catch((err) => {
+        // Surface the failure in the console so a stuck panel is
+        // diagnosable without a debugger. Previous bare
+        // `.catch(() => setLoading(false))` swallowed every load
+        // failure (network errors, JSON parse errors, throws inside
+        // the .then body) — the panel just sat on the empty state
+        // with zero signal.
+        console.warn("AgentCommsPanel: load activity failed", err);
+        setLoading(false);
+      });
   }, [workspaceId]);
 
   // Live updates via WebSocket
@@ -115,7 +202,7 @@ export function AgentCommsPanel({ workspaceId }: { workspaceId: string }) {
           };
           const m = toCommMessage(entry, workspaceId);
           if (m) {
-            const key = `${m.timestamp}:${m.direction}:${m.peerId}`;
+            const key = `${m.timestamp}:${m.flow}:${m.peerId}`;
             if (seenKeys.current.has(key)) return;
             seenKeys.current.add(key);
             setMessages((prev) => [...prev, m]);
@@ -148,31 +235,177 @@ export function AgentCommsPanel({ workspaceId }: { workspaceId: string }) {
 
   return (
     <div className="flex-1 overflow-y-auto p-3 space-y-2">
-      {messages.map((msg) => (
-        <div key={msg.id} className={`flex ${msg.direction === "out" ? "justify-end" : "justify-start"}`}>
-          <div
-            className={`max-w-[85%] rounded-lg px-3 py-2 text-xs ${
-              msg.direction === "out"
-                ? "bg-cyan-900/30 text-cyan-100 border border-cyan-700/20"
-                : "bg-zinc-800/80 text-zinc-200 border border-zinc-700/30"
-            }`}
-          >
-            <div className="text-[9px] text-zinc-500 mb-1">
-              {msg.direction === "out" ? `→ To ${msg.peerName}` : `← From ${msg.peerName}`}
-            </div>
-            <div className="text-zinc-300">{msg.text || "(no message text)"}</div>
-            {msg.responseText && (
-              <div className="mt-1.5 pt-1.5 border-t border-zinc-700/30 text-zinc-400">
-                {msg.responseText}
-              </div>
-            )}
-            <div className="text-[9px] text-zinc-500 mt-1">
-              {new Date(msg.timestamp).toLocaleTimeString()}
-            </div>
-          </div>
-        </div>
-      ))}
+      {messages.map((msg) =>
+        msg.status === "error" ? (
+          <ErrorMessage key={msg.id} msg={msg} />
+        ) : (
+          <NormalMessage key={msg.id} msg={msg} />
+        ),
+      )}
       <div ref={bottomRef} />
     </div>
   );
 }
+
+function NormalMessage({ msg }: { msg: CommMessage }) {
+  return (
+    <div className={`flex ${msg.flow === "out" ? "justify-end" : "justify-start"}`}>
+      <div
+        className={`max-w-[85%] rounded-lg px-3 py-2 text-xs ${
+          msg.flow === "out"
+            ? "bg-cyan-900/30 text-cyan-100 border border-cyan-700/20"
+            : "bg-zinc-800/80 text-zinc-200 border border-zinc-700/30"
+        }`}
+      >
+        <div className="text-[9px] text-zinc-500 mb-1">
+          {msg.flow === "out" ? `→ To ${msg.peerName}` : `← From ${msg.peerName}`}
+        </div>
+        {msg.text ? (
+          <MarkdownBody className="text-zinc-300">{msg.text}</MarkdownBody>
+        ) : (
+          <div className="text-zinc-300">(no message text)</div>
+        )}
+        {msg.responseText && (
+          <MarkdownBody className="mt-1.5 pt-1.5 border-t border-zinc-700/30 text-zinc-400">
+            {msg.responseText}
+          </MarkdownBody>
+        )}
+        <div className="text-[9px] text-zinc-500 mt-1">
+          {new Date(msg.timestamp).toLocaleTimeString()}
+        </div>
+      </div>
+    </div>
+  );
+}
+
+/** Failure-state row. Replaces the unactionable "X failed [A2A_ERROR]"
+ *  bubble with: a clear banner naming the peer, the underlying
+ *  error text (if any), an inferred cause hint, and recovery
+ *  actions — Restart workspace, Open workspace.
+ *
+ *  Recovery actions show on BOTH directions because both target the
+ *  same peer (toCommMessage now resolves peerId to the peer in
+ *  either case): an outbound delivery failure ("we called X and it
+ *  errored"), an inbound runtime failure ("X called us and our
+ *  reply errored" — rare), or the agent-self-logged "I called X and
+ *  got an error back" pattern that is the most common shape. The
+ *  user always wants to restart or inspect the failing peer. */
+function ErrorMessage({ msg }: { msg: CommMessage }) {
+  const selectNode = useCanvasStore((s) => s.selectNode);
+  const [restarting, setRestarting] = useState(false);
+  const errorText = unwrapErrorText(msg.responseText);
+  const hint = inferA2AErrorHint(errorText);
+
+  // Guard against acting on a peer whose workspace has been deleted
+  // since this row was logged. Without the guard, restart 404s
+  // surface as a generic toast and Open silently sets a dangling
+  // selection that renders nothing in the side panel.
+  const peerExists = (): boolean => {
+    return useCanvasStore.getState().nodes.some((n) => n.id === msg.peerId);
+  };
+
+  const handleRestart = async () => {
+    if (restarting) return;
+    if (!peerExists()) {
+      showToast(`${msg.peerName} no longer exists`, "error");
+      return;
+    }
+    setRestarting(true);
+    try {
+      await api.post(`/workspaces/${msg.peerId}/restart`, {});
+      showToast(`Restarting ${msg.peerName}…`, "success");
+    } catch (e) {
+      showToast(
+        `Restart failed: ${e instanceof Error ? e.message : "unknown error"}`,
+        "error",
+      );
+    } finally {
+      setRestarting(false);
+    }
+  };
+
+  const handleOpen = () => {
+    if (!peerExists()) {
+      showToast(`${msg.peerName} no longer exists`, "error");
+      return;
+    }
+    selectNode(msg.peerId);
+  };
+
+  return (
+    <div className={`flex ${msg.flow === "out" ? "justify-end" : "justify-start"}`}>
+      <div className="max-w-[85%] rounded-lg border border-red-800/50 bg-red-950/30 px-3 py-2 text-xs">
+        <div className="flex items-center gap-1.5 text-[10px] text-red-300 font-semibold uppercase tracking-wide mb-1.5">
+          <span aria-hidden="true">⚠</span>
+          {msg.flow === "out"
+            ? `Failed to deliver to ${msg.peerName}`
+            : `${msg.peerName} returned an error`}
+        </div>
+
+        {msg.text && (
+          <div className="text-[10px] text-zinc-500 mb-1.5">
+            <span className="uppercase tracking-wide">Task</span>
+            <MarkdownBody className="text-zinc-400">{msg.text}</MarkdownBody>
+          </div>
+        )}
+
+        <div className="rounded bg-zinc-950/60 border border-red-900/40 px-2 py-1.5 mb-1.5">
+          <div className="text-[9px] uppercase tracking-wide text-red-400 mb-0.5">
+            Underlying error
+          </div>
+          <code className="text-[11px] font-mono text-red-200 whitespace-pre-wrap break-words">
+            {errorText || "(no detail returned)"}
+          </code>
+        </div>
+
+        <p className="text-[10px] text-zinc-400 leading-snug mb-2">{hint}</p>
+
+        {msg.peerId && (
+          <div className="flex flex-wrap items-center gap-1.5">
+            <button
+              type="button"
+              onClick={handleRestart}
+              disabled={restarting}
+              className="px-2 py-0.5 rounded bg-red-900/50 hover:bg-red-800/60 border border-red-700/40 text-[10px] text-red-200 disabled:opacity-50 transition-colors"
+            >
+              {restarting ? "Restarting…" : `Restart ${msg.peerName}`}
+            </button>
+            <button
+              type="button"
+              onClick={handleOpen}
+              className="px-2 py-0.5 rounded bg-zinc-800 hover:bg-zinc-700 border border-zinc-700/50 text-[10px] text-zinc-300 transition-colors"
+            >
+              Open {msg.peerName}
+            </button>
+          </div>
+        )}
+
+        <div className="text-[9px] text-zinc-500 mt-1.5">
+          {new Date(msg.timestamp).toLocaleTimeString()}
+        </div>
+      </div>
+    </div>
+  );
+}
+
+/** Tiny markdown wrapper matching ChatTab's My Chat styling. Same
+ *  remark-gfm pipeline (tables, strikethrough, task lists) plus the
+ *  prose tweaks that keep paragraphs tight inside a small bubble.
+ *  Code blocks get an `overflow-x-auto` so a long line of code doesn't
+ *  blow out the bubble's max-width — agent-to-agent replies routinely
+ *  ship code samples and JSON. */
+function MarkdownBody({
+  children,
+  className,
+}: {
+  children: string;
+  className?: string;
+}) {
+  return (
+    <div
+      className={`prose prose-sm prose-invert max-w-none [&>p]:mb-1 [&>p:last-child]:mb-0 [&_pre]:overflow-x-auto [&_table]:block [&_table]:overflow-x-auto ${className ?? ""}`}
+    >
+      <ReactMarkdown remarkPlugins={[remarkGfm]}>{children}</ReactMarkdown>
+    </div>
+  );
+}
diff --git a/canvas/src/components/tabs/chat/AttachmentViews.tsx b/canvas/src/components/tabs/chat/AttachmentViews.tsx
new file mode 100644
index 00000000..fdf7fb6c
--- /dev/null
+++ b/canvas/src/components/tabs/chat/AttachmentViews.tsx
@@ -0,0 +1,94 @@
+"use client";
+
+// Small presentational components for chat attachments. Kept in a
+// separate file so ChatTab.tsx stays focused on state + send/receive
+// orchestration. Both variants share the file-icon + name + size
+// layout; the only difference is the trailing action (remove for
+// pending, download for completed).
+
+import type { ChatAttachment } from "./types";
+
+function formatSize(bytes: number | undefined): string {
+  if (bytes == null) return "";
+  if (bytes < 1024) return `${bytes} B`;
+  if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(0)} KB`;
+  return `${(bytes / (1024 * 1024)).toFixed(1)} MB`;
+}
+
+/** Inline pill for a file that the user has picked but not yet sent.
+ *  Renders above the textarea; clicking × pops it from the pending
+ *  list without uploading. */
+export function PendingAttachmentPill({
+  file,
+  onRemove,
+}: {
+  file: File;
+  onRemove: () => void;
+}) {
+  return (
+    <div className="flex items-center gap-1.5 rounded-md border border-zinc-700/60 bg-zinc-800/80 px-2 py-1 text-[10px] text-zinc-300 max-w-[200px]">
+      <FileGlyph className="text-zinc-400 shrink-0" />
+      <span className="truncate" title={file.name}>{file.name}</span>
+      <span className="text-zinc-500 shrink-0 tabular-nums">{formatSize(file.size)}</span>
+      <button
+        onClick={onRemove}
+        aria-label={`Remove ${file.name}`}
+        className="ml-0.5 text-zinc-500 hover:text-zinc-200 transition-colors shrink-0"
+      >
+        <svg width="10" height="10" viewBox="0 0 16 16" fill="none" aria-hidden="true">
+          <path d="M4 4l8 8M12 4l-8 8" stroke="currentColor" strokeWidth="1.6" strokeLinecap="round" />
+        </svg>
+      </button>
+    </div>
+  );
+}
+
+/** Chip rendered inside a message bubble for a sent/received file.
+ *  Clicking triggers the download via the passed onDownload callback
+ *  so the parent controls workspace-scoped URL resolution. */
+export function AttachmentChip({
+  attachment,
+  onDownload,
+  tone,
+}: {
+  attachment: ChatAttachment;
+  onDownload: (a: ChatAttachment) => void;
+  tone: "user" | "agent";
+}) {
+  const toneClasses =
+    tone === "user"
+      ? "border-blue-400/30 bg-blue-600/20 hover:bg-blue-600/30 text-blue-100"
+      : "border-zinc-600/50 bg-zinc-700/40 hover:bg-zinc-600/50 text-zinc-100";
+  return (
+    <button
+      onClick={() => onDownload(attachment)}
+      title={`Download ${attachment.name}`}
+      className={`flex items-center gap-1.5 rounded-md border px-2 py-1 text-[10px] transition-colors max-w-full ${toneClasses}`}
+    >
+      <FileGlyph className="shrink-0 opacity-70" />
+      <span className="truncate">{attachment.name}</span>
+      {attachment.size != null && (
+        <span className="opacity-60 shrink-0 tabular-nums">{formatSize(attachment.size)}</span>
+      )}
+      <DownloadGlyph className="opacity-70 shrink-0" />
+    </button>
+  );
+}
+
+function FileGlyph({ className }: { className?: string }) {
+  return (
+    <svg width="10" height="10" viewBox="0 0 16 16" fill="none" className={className} aria-hidden="true">
+      <path d="M4 2h5l3 3v9a1 1 0 0 1-1 1H4a1 1 0 0 1-1-1V3a1 1 0 0 1 1-1Z" stroke="currentColor" strokeWidth="1.3" strokeLinejoin="round" />
+      <path d="M9 2v3h3" stroke="currentColor" strokeWidth="1.3" strokeLinejoin="round" />
+    </svg>
+  );
+}
+
+function DownloadGlyph({ className }: { className?: string }) {
+  return (
+    <svg width="10" height="10" viewBox="0 0 16 16" fill="none" className={className} aria-hidden="true">
+      <path d="M8 2v9M4 7l4 4 4-4" stroke="currentColor" strokeWidth="1.4" strokeLinecap="round" strokeLinejoin="round" />
+      <path d="M3 13h10" stroke="currentColor" strokeWidth="1.4" strokeLinecap="round" />
+    </svg>
+  );
+}
diff --git a/canvas/src/components/tabs/chat/__tests__/AgentCommsPanel.test.ts b/canvas/src/components/tabs/chat/__tests__/AgentCommsPanel.test.ts
new file mode 100644
index 00000000..fc2eb70f
--- /dev/null
+++ b/canvas/src/components/tabs/chat/__tests__/AgentCommsPanel.test.ts
@@ -0,0 +1,113 @@
+// @vitest-environment jsdom
+import { describe, it, expect, vi } from "vitest";
+
+// Stub the canvas store before importing the SUT — toCommMessage calls
+// useCanvasStore.getState() inside resolveName to look up peer names,
+// which would otherwise hit the real Zustand store.
+vi.mock("@/store/canvas", () => ({
+  useCanvasStore: {
+    getState: () => ({
+      nodes: [
+        { id: "ws-self", data: { name: "Self" } },
+        { id: "ws-peer", data: { name: "Peer Agent" } },
+      ],
+    }),
+  },
+}));
+
+import { toCommMessage, type ActivityEntry } from "../AgentCommsPanel";
+
+const SELF = "ws-self";
+const PEER = "ws-peer";
+
+function makeEntry(overrides: Partial<ActivityEntry> = {}): ActivityEntry {
+  return {
+    id: "act-1",
+    activity_type: "a2a_send",
+    source_id: SELF,
+    target_id: PEER,
+    method: "message/send",
+    summary: "Delegating to Peer Agent",
+    request_body: null,
+    response_body: null,
+    status: "ok",
+    created_at: "2026-04-25T18:00:00Z",
+    ...overrides,
+  };
+}
+
+describe("toCommMessage — flow derivation", () => {
+  it("a2a_send is always outbound (flow=out, peer=target)", () => {
+    const m = toCommMessage(
+      makeEntry({ activity_type: "a2a_send", source_id: SELF, target_id: PEER }),
+      SELF,
+    );
+    expect(m).toBeTruthy();
+    expect(m!.flow).toBe("out");
+    expect(m!.peerId).toBe(PEER);
+    expect(m!.peerName).toBe("Peer Agent");
+  });
+
+  it("a2a_receive from a peer (peer-initiated call) is inbound", () => {
+    // Real incoming call: source = peer, target = us.
+    const m = toCommMessage(
+      makeEntry({
+        activity_type: "a2a_receive",
+        source_id: PEER,
+        target_id: SELF,
+      }),
+      SELF,
+    );
+    expect(m!.flow).toBe("in");
+    expect(m!.peerId).toBe(PEER);
+    expect(m!.peerName).toBe("Peer Agent");
+  });
+
+  it("a2a_receive self-logged by our runtime AFTER an outbound call is OUTBOUND from the user's POV", () => {
+    // workspace/a2a_tools.py:181 self-logs an a2a_receive on the
+    // CALLER's workspace_id with source_id=us, target_id=peer.
+    // From the user's perspective this row belongs to the outbound
+    // delegation thread — render flow=out + peer=target so the
+    // bubble right-justifies under "Delegating to peer" and the
+    // Restart button targets the actual peer (NOT us). Regression
+    // for the bug where these rows rendered as "← From Self" with
+    // a Restart button that would have restarted the user's own
+    // workspace.
+    const m = toCommMessage(
+      makeEntry({
+        activity_type: "a2a_receive",
+        source_id: SELF,
+        target_id: PEER,
+        summary: "Peer Agent failed",
+        status: "error",
+      }),
+      SELF,
+    );
+    expect(m!.flow).toBe("out");
+    expect(m!.peerId).toBe(PEER);
+    expect(m!.peerName).toBe("Peer Agent");
+    expect(m!.status).toBe("error");
+  });
+
+  it("returns null when no peer can be resolved", () => {
+    // a2a_receive with both ids null — discard rather than render a
+    // ghost bubble pointing at "Unknown".
+    const m = toCommMessage(
+      makeEntry({
+        activity_type: "a2a_receive",
+        source_id: null,
+        target_id: null,
+      }),
+      SELF,
+    );
+    expect(m).toBeNull();
+  });
+
+  it("propagates status through to the message (drives error rendering)", () => {
+    const m = toCommMessage(
+      makeEntry({ status: "error", activity_type: "a2a_send" }),
+      SELF,
+    );
+    expect(m!.status).toBe("error");
+  });
+});
diff --git a/canvas/src/components/tabs/chat/__tests__/a2aErrorHint.test.ts b/canvas/src/components/tabs/chat/__tests__/a2aErrorHint.test.ts
new file mode 100644
index 00000000..8518829b
--- /dev/null
+++ b/canvas/src/components/tabs/chat/__tests__/a2aErrorHint.test.ts
@@ -0,0 +1,67 @@
+import { describe, it, expect } from "vitest";
+import { inferA2AErrorHint } from "../a2aErrorHint";
+
+// Pure logic. Pin every named pattern so a future contributor adding a
+// new symptom doesn't accidentally collapse the buckets — and so the
+// "most specific first" ordering can't drift without a test failing.
+
+describe("inferA2AErrorHint", () => {
+  it("matches the Claude Code SDK init wedge specifically", () => {
+    const hint = inferA2AErrorHint("Control request timeout: initialize");
+    expect(hint).toMatch(/Claude Code SDK is wedged/);
+  });
+
+  it("does NOT misfire on user tasks containing 'initialize' generally", () => {
+    // Regression: an earlier bare-`initialize` pattern would have
+    // false-positived "failed to initialize database" into the SDK
+    // wedge hint. Confirm the full-phrase guard holds.
+    const hint = inferA2AErrorHint("failed to initialize database connection");
+    expect(hint).not.toMatch(/Claude Code SDK/);
+  });
+
+  it("recognises httpx ReadTimeout / ConnectTimeout class names", () => {
+    expect(inferA2AErrorHint("ReadTimeout: timeout")).toMatch(/proxy timeout/);
+    expect(inferA2AErrorHint("ConnectTimeout: ...")).toMatch(/proxy timeout/);
+  });
+
+  it("recognises generic timeout / deadline-exceeded language", () => {
+    expect(inferA2AErrorHint("deadline exceeded after 300s")).toMatch(/proxy timeout/);
+    expect(inferA2AErrorHint("Operation timeout")).toMatch(/proxy timeout/);
+  });
+
+  it("handles connection-reset family (RemoteProtocolError, ConnectionReset, no-message)", () => {
+    expect(inferA2AErrorHint("RemoteProtocolError: ...")).toMatch(/connection.*dropped/);
+    expect(inferA2AErrorHint("ConnectionResetError")).toMatch(/connection.*dropped/);
+    expect(inferA2AErrorHint("connection reset by peer")).toMatch(/connection.*dropped/);
+    expect(inferA2AErrorHint("RemoteProtocolError (no message — likely connection reset)")).toMatch(/connection.*dropped/);
+  });
+
+  it("recognises agent-runtime exceptions", () => {
+    expect(inferA2AErrorHint("Agent error: ValueError raised")).toMatch(/runtime threw an exception/);
+    expect(inferA2AErrorHint("RuntimeException in tool call")).toMatch(/runtime threw an exception/);
+  });
+
+  it("recognises peer-unreachable cases (Activity-tab originals)", () => {
+    expect(inferA2AErrorHint("workspace not found")).toMatch(/can't be reached/);
+    expect(inferA2AErrorHint("not accessible")).toMatch(/can't be reached/);
+    expect(inferA2AErrorHint("workspace is offline")).toMatch(/can't be reached/);
+  });
+
+  it("returns the empty-detail-specific hint when input is exactly empty", () => {
+    expect(inferA2AErrorHint("")).toMatch(/no error detail/);
+  });
+
+  it("returns a generic fallback for unrecognised text", () => {
+    const hint = inferA2AErrorHint("some completely novel error nobody has matched yet");
+    expect(hint).toMatch(/Check the workspace logs|delivery failure/);
+  });
+
+  it("Claude SDK wedge wins over the more general timeout pattern", () => {
+    // Both 'control request timeout' and 'timeout' match the same
+    // input. The SDK wedge hint is more actionable; the ordering in
+    // the function must keep it first. Lock that priority in.
+    const hint = inferA2AErrorHint("Control request timeout: initialize");
+    expect(hint).toMatch(/Claude Code SDK/);
+    expect(hint).not.toMatch(/proxy timeout/);
+  });
+});
diff --git a/canvas/src/components/tabs/chat/__tests__/activityLog.test.ts b/canvas/src/components/tabs/chat/__tests__/activityLog.test.ts
new file mode 100644
index 00000000..c66aa949
--- /dev/null
+++ b/canvas/src/components/tabs/chat/__tests__/activityLog.test.ts
@@ -0,0 +1,41 @@
+import { describe, it, expect } from "vitest";
+import { ACTIVITY_LOG_WINDOW, appendActivityLine } from "../activityLog";
+
+describe("appendActivityLine", () => {
+  it("appends a fresh line", () => {
+    expect(appendActivityLine([], "📄 Read /a")).toEqual(["📄 Read /a"]);
+  });
+
+  it("collapses an immediate duplicate", () => {
+    const prev = ["📄 Read /a"];
+    // Same exact string twice in a row is noise — the helper should
+    // return the original array reference, not a new one.
+    expect(appendActivityLine(prev, "📄 Read /a")).toBe(prev);
+  });
+
+  it("keeps non-adjacent duplicates", () => {
+    const prev = ["📄 Read /a", "⚡ Bash: ls"];
+    expect(appendActivityLine(prev, "📄 Read /a")).toEqual([
+      "📄 Read /a",
+      "⚡ Bash: ls",
+      "📄 Read /a",
+    ]);
+  });
+
+  it("rolls off the oldest line when the window fills", () => {
+    const seed = Array.from({ length: ACTIVITY_LOG_WINDOW }, (_, i) => `line-${i}`);
+    const next = appendActivityLine(seed, "newest");
+    expect(next.length).toBe(ACTIVITY_LOG_WINDOW);
+    expect(next[next.length - 1]).toBe("newest");
+    // Oldest entry is dropped — line-0 is gone.
+    expect(next[0]).toBe("line-1");
+  });
+
+  it("keeps the original array reference when below the window cap", () => {
+    const prev = ["a", "b"];
+    const next = appendActivityLine(prev, "c");
+    // Returned a new array (we appended); must NOT mutate prev.
+    expect(prev).toEqual(["a", "b"]);
+    expect(next).toEqual(["a", "b", "c"]);
+  });
+});
diff --git a/canvas/src/components/tabs/chat/__tests__/message-parser.test.ts b/canvas/src/components/tabs/chat/__tests__/message-parser.test.ts
index 75613a81..809c37ab 100644
--- a/canvas/src/components/tabs/chat/__tests__/message-parser.test.ts
+++ b/canvas/src/components/tabs/chat/__tests__/message-parser.test.ts
@@ -4,6 +4,7 @@ import {
   extractResponseText,
   extractAgentText,
   extractTextsFromParts,
+  extractFilesFromTask,
 } from "../message-parser";
 
 describe("extractRequestText", () => {
@@ -99,6 +100,67 @@ describe("extractResponseText", () => {
   it("returns empty when result has no parts", () => {
     expect(extractResponseText({ result: { other: true } })).toBe("");
   });
+
+  // Regression: Claude Code (and other long-reply runtimes) emits
+  // multi-part text replies. The previous implementation returned
+  // only the first part, silently truncating the rest. Observed
+  // 2026-04-25 on a 15k-char Wave 1 brief that rendered as just the
+  // markdown table header.
+  it("joins all text parts when result.parts has multiple", () => {
+    const body = {
+      result: {
+        parts: [
+          { kind: "text", text: "# Header" },
+          { kind: "text", text: "| Col |" },
+          { kind: "text", text: "| --- |" },
+          { kind: "text", text: "| Row |" },
+        ],
+      },
+    };
+    expect(extractResponseText(body)).toBe("# Header\n| Col |\n| --- |\n| Row |");
+  });
+
+  it("joins all text parts across multiple artifacts", () => {
+    const body = {
+      result: {
+        artifacts: [
+          { parts: [{ kind: "text", text: "First artifact" }] },
+          { parts: [{ kind: "text", text: "Second artifact" }] },
+        ],
+      },
+    };
+    expect(extractResponseText(body)).toBe("First artifact\nSecond artifact");
+  });
+
+  it("joins all .root.text variants when present", () => {
+    const body = {
+      result: {
+        parts: [
+          { root: { text: "alpha" } },
+          { root: { text: "beta" } },
+        ],
+      },
+    };
+    expect(extractResponseText(body)).toBe("alpha\nbeta");
+  });
+
+  // Regression: when a response carries BOTH parts and artifacts
+  // (Hermes tool-call replies do this — summary in parts, detail in
+  // artifacts), the early-return-on-parts implementation silently
+  // dropped the artifacts body. The collected-from-every-source
+  // implementation must surface both.
+  it("collects text from BOTH result.parts AND result.artifacts when both present", () => {
+    const body = {
+      result: {
+        parts: [{ kind: "text", text: "Summary" }],
+        artifacts: [
+          { parts: [{ kind: "text", text: "Detail block one" }] },
+          { parts: [{ kind: "text", text: "Detail block two" }] },
+        ],
+      },
+    };
+    expect(extractResponseText(body)).toBe("Summary\nDetail block one\nDetail block two");
+  });
 });
 
 describe("extractTextsFromParts", () => {
@@ -133,3 +195,71 @@ describe("extractTextsFromParts", () => {
     expect(extractTextsFromParts(parts)).toBe("Only text");
   });
 });
+
+describe("extractFilesFromTask", () => {
+  it("pulls A2A file parts out of a result", () => {
+    const task = {
+      parts: [
+        { kind: "text", text: "here's the report" },
+        {
+          kind: "file",
+          file: { name: "report.pdf", mimeType: "application/pdf", uri: "workspace:/reports/report.pdf", size: 4096 },
+        },
+      ],
+    };
+    const files = extractFilesFromTask(task);
+    expect(files).toEqual([
+      { name: "report.pdf", mimeType: "application/pdf", uri: "workspace:/reports/report.pdf", size: 4096 },
+    ]);
+  });
+
+  it("recovers a filename from the URI when `name` is absent", () => {
+    const task = {
+      parts: [
+        { kind: "file", file: { uri: "workspace:/workspace/out/graph.png" } },
+      ],
+    };
+    const files = extractFilesFromTask(task);
+    expect(files[0].name).toBe("graph.png");
+  });
+
+  it("skips file parts without a URI (inline bytes are not supported yet)", () => {
+    const task = {
+      parts: [
+        { kind: "file", file: { name: "inline.bin", bytes: "AAA=" } },
+      ],
+    };
+    expect(extractFilesFromTask(task)).toEqual([]);
+  });
+
+  it("walks artifacts[] so file parts nested inside artifact envelopes are found", () => {
+    const task = {
+      artifacts: [
+        {
+          parts: [
+            { kind: "file", file: { name: "trace.log", uri: "workspace:/logs/trace.log" } },
+          ],
+        },
+      ],
+    };
+    const files = extractFilesFromTask(task);
+    expect(files[0]).toMatchObject({ name: "trace.log", uri: "workspace:/logs/trace.log" });
+  });
+
+  it("returns [] on malformed input rather than throwing", () => {
+    expect(extractFilesFromTask({})).toEqual([]);
+    expect(extractFilesFromTask({ parts: "not-an-array" } as unknown as Record<string, unknown>)).toEqual([]);
+  });
+
+  it("walks result.message.parts — the non-task reply shape some A2A servers use", () => {
+    const task = {
+      message: {
+        parts: [
+          { kind: "file", file: { name: "out.txt", uri: "workspace:/workspace/out.txt" } },
+        ],
+      },
+    };
+    const files = extractFilesFromTask(task);
+    expect(files[0]).toMatchObject({ name: "out.txt", uri: "workspace:/workspace/out.txt" });
+  });
+});
diff --git a/canvas/src/components/tabs/chat/__tests__/uploads.test.ts b/canvas/src/components/tabs/chat/__tests__/uploads.test.ts
new file mode 100644
index 00000000..a08d5d19
--- /dev/null
+++ b/canvas/src/components/tabs/chat/__tests__/uploads.test.ts
@@ -0,0 +1,41 @@
+import { describe, it, expect } from "vitest";
+import { resolveAttachmentHref } from "../uploads";
+
+describe("resolveAttachmentHref — URI scheme normalisation", () => {
+  const wsId = "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee";
+
+  it("rewrites the canonical workspace:<path> scheme to /chat/download", () => {
+    const url = resolveAttachmentHref(wsId, "workspace:/workspace/report.pdf");
+    expect(url).toContain(`/workspaces/${wsId}/chat/download`);
+    expect(url).toContain(encodeURIComponent("/workspace/report.pdf"));
+  });
+
+  it("accepts bare absolute container paths (some agents omit the scheme)", () => {
+    const url = resolveAttachmentHref(wsId, "/workspace/report.pdf");
+    expect(url).toContain(`/workspaces/${wsId}/chat/download`);
+    expect(url).toContain(encodeURIComponent("/workspace/report.pdf"));
+  });
+
+  it("accepts file:/// URIs pointing into an allowed root", () => {
+    const url = resolveAttachmentHref(wsId, "file:///workspace/report.pdf");
+    expect(url).toContain(`/workspaces/${wsId}/chat/download`);
+    expect(url).toContain(encodeURIComponent("/workspace/report.pdf"));
+  });
+
+  it("passes through HTTP(S) URIs unchanged so off-platform artefacts still render", () => {
+    const external = "https://example.com/static/report.pdf";
+    expect(resolveAttachmentHref(wsId, external)).toBe(external);
+  });
+
+  it("passes through container paths that are not under any allowed root", () => {
+    // /etc/passwd looks like a path but isn't one of the allowed
+    // roots — falling back to raw passthrough forces the caller into
+    // the external-URL branch, which opens a new tab and lets the
+    // browser refuse. Rewriting would 400 anyway server-side.
+    expect(resolveAttachmentHref(wsId, "/etc/passwd")).toBe("/etc/passwd");
+  });
+
+  it("passes through unknown schemes unchanged", () => {
+    expect(resolveAttachmentHref(wsId, "s3://bucket/key")).toBe("s3://bucket/key");
+  });
+});
diff --git a/canvas/src/components/tabs/chat/a2aErrorHint.ts b/canvas/src/components/tabs/chat/a2aErrorHint.ts
new file mode 100644
index 00000000..e29e643a
--- /dev/null
+++ b/canvas/src/components/tabs/chat/a2aErrorHint.ts
@@ -0,0 +1,54 @@
+/**
+ * Maps an A2A delivery-failure detail string (the bit AFTER stripping
+ * the [A2A_ERROR] sentinel prefix) to a one-line operator-actionable
+ * hint. Pattern matches are lowercase substring checks, ordered most-
+ * specific first so the right hint wins when multiple patterns
+ * overlap (e.g. "control request timeout" wins over generic "timeout").
+ *
+ * Used by both the chat Agent Comms panel and the Activity tab so the
+ * same symptom reads identically across surfaces. Two prior copies
+ * had already drifted (Activity tab gained `not found`/`offline`
+ * cases AgentCommsPanel never picked up) — this module is the merged
+ * superset and the only place hint text should change.
+ */
+export function inferA2AErrorHint(detail: string): string {
+  const t = detail.toLowerCase();
+
+  // "control request timeout" is the specific Claude Code SDK init
+  // wedge symptom. Pattern on the full phrase, not bare "initialize"
+  // — a user task containing "failed to initialize database" would
+  // false-positive into the SDK-wedge hint.
+  if (t.includes("control request timeout")) {
+    return "The remote agent's Claude Code SDK is wedged on initialization (often after a long idle period or OAuth refresh). A workspace restart usually clears it.";
+  }
+  if (
+    t.includes("readtimeout") ||
+    t.includes("connecttimeout") ||
+    t.includes("deadline exceeded") ||
+    t.includes("timeout")
+  ) {
+    return "The remote agent didn't respond within the proxy timeout. It may be busy with a long task, or the runtime is stuck — restart the workspace if this repeats.";
+  }
+  if (
+    t.includes("connectionreset") ||
+    t.includes("remoteprotocolerror") ||
+    t.includes("connection reset") ||
+    t.includes("no message")
+  ) {
+    return "The connection to the remote agent dropped before a reply arrived. Usually a transient network blip — retry once. If it repeats, the remote container may have crashed mid-request; check its logs.";
+  }
+  if (t.includes("agent error") || t.includes("exception")) {
+    return "The remote agent's runtime threw an exception. Check the workspace's container logs for the traceback. Restart usually clears transient runtime crashes.";
+  }
+  if (
+    t.includes("not found") ||
+    t.includes("not accessible") ||
+    t.includes("offline")
+  ) {
+    return "The remote workspace can't be reached — it may be stopped, removed, or outside the access control list. Verify the peer is online before retrying.";
+  }
+  if (detail === "") {
+    return "The remote agent returned no error detail (the underlying httpx exception had an empty message — typically a connection-reset or silent timeout). A workspace restart is the safe first move.";
+  }
+  return "The remote agent reported a delivery failure. Check the workspace logs or try restarting.";
+}
diff --git a/canvas/src/components/tabs/chat/activityLog.ts b/canvas/src/components/tabs/chat/activityLog.ts
new file mode 100644
index 00000000..57efa31d
--- /dev/null
+++ b/canvas/src/components/tabs/chat/activityLog.ts
@@ -0,0 +1,23 @@
+/**
+ * Sliding-window log for the in-chat activity feed (the live progress
+ * lines under the spinner while a chat reply is in flight).
+ *
+ * Sized to fit the spinner area without forcing a scroll; per-tool-use
+ * rows from the workspace's _report_tool_use can fire dozens per turn
+ * (Read 5 files + Grep + Bash + Edits + delegations), so a too-small
+ * window flushes useful early context before the user can read it.
+ *
+ * Consecutive identical lines collapse to a single entry — the same
+ * tool repeated on the same target (e.g. Read of the same file twice
+ * within a turn) is noise, not new progress.
+ */
+export const ACTIVITY_LOG_WINDOW = 20;
+
+export function appendActivityLine(prev: string[], line: string): string[] {
+  if (prev[prev.length - 1] === line) return prev; // collapse duplicates
+  const next =
+    prev.length >= ACTIVITY_LOG_WINDOW
+      ? prev.slice(-(ACTIVITY_LOG_WINDOW - 1))
+      : prev;
+  return [...next, line];
+}
diff --git a/canvas/src/components/tabs/chat/message-parser.ts b/canvas/src/components/tabs/chat/message-parser.ts
index 87731b31..54fa3a64 100644
--- a/canvas/src/components/tabs/chat/message-parser.ts
+++ b/canvas/src/components/tabs/chat/message-parser.ts
@@ -32,6 +32,64 @@ export function extractTextsFromParts(parts: unknown): string | null {
   return texts.length > 0 ? texts.join("\n") : null;
 }
 
+export interface ParsedFilePart {
+  name: string;
+  uri: string;
+  mimeType?: string;
+  size?: number;
+}
+
+/** Extract file parts from an A2A response. Walks parts[] + artifacts[].
+ *  Per the A2A spec a file part looks like:
+ *    { kind: "file", file: { name, mimeType, uri | bytes } }
+ *  We only surface parts that carry a `uri` — inline bytes would
+ *  require a different renderer (data URL) and are out of scope for
+ *  MVP. Names fall back to the URI's basename when absent. */
+export function extractFilesFromTask(task: Record<string, unknown>): ParsedFilePart[] {
+  const out: ParsedFilePart[] = [];
+  const pushFromParts = (parts: unknown) => {
+    if (!Array.isArray(parts)) return;
+    for (const raw of parts as Array<Record<string, unknown>>) {
+      if (raw.kind !== "file" && raw.type !== "file") continue;
+      const file = (raw.file ?? raw) as Record<string, unknown>;
+      const uri = typeof file.uri === "string" ? file.uri : "";
+      if (!uri) continue;
+      const name = (typeof file.name === "string" && file.name) || basename(uri);
+      out.push({
+        name,
+        uri,
+        mimeType: typeof file.mimeType === "string" ? file.mimeType : undefined,
+        size: typeof file.size === "number" ? file.size : undefined,
+      });
+    }
+  };
+  try {
+    pushFromParts(task.parts);
+    const artifacts = task.artifacts as Array<Record<string, unknown>> | undefined;
+    if (artifacts) for (const a of artifacts) pushFromParts(a.parts);
+    const status = task.status as Record<string, unknown> | undefined;
+    if (status?.message) {
+      const msg = status.message as Record<string, unknown>;
+      pushFromParts(msg.parts);
+    }
+    // Some A2A servers wrap a non-task reply as
+    // {result: {message: {parts: [...]}}} rather than {result: {parts}}.
+    // Without this branch we'd silently drop file parts returned by
+    // third-party implementations.
+    const message = task.message as Record<string, unknown> | undefined;
+    if (message) pushFromParts(message.parts);
+  } catch {
+    /* tolerate malformed shapes — chat falls through to text-only */
+  }
+  return out;
+}
+
+function basename(uri: string): string {
+  const cleaned = uri.replace(/^workspace:/, "").replace(/^https?:\/\//, "");
+  const slash = cleaned.lastIndexOf("/");
+  return slash >= 0 ? cleaned.slice(slash + 1) : cleaned || "file";
+}
+
 /** Extract user message text from an activity log request_body */
 export function extractRequestText(body: Record<string, unknown> | null): string {
   if (!body) return "";
@@ -41,22 +99,54 @@ export function extractRequestText(body: Record<string, unknown> | null): string
   return (parts?.[0]?.text as string) || "";
 }
 
-/** Extract text from an activity log response_body (multiple possible formats) */
+/** Extract text from an activity log response_body (multiple possible formats).
+ *
+ *  Collects from EVERY source — top-level `parts[].text`, `parts[].root.text`
+ *  (older nested shape), and `artifacts[].parts[].text` (task-shaped
+ *  replies) — and joins them with "\n". Two reasons to collect rather
+ *  than early-return:
+ *
+ *    1. Claude Code and other long-reply runtimes emit multiple text
+ *       parts in a single `parts` array. Returning just the first
+ *       silently truncates 15k-char briefs to their leading line
+ *       (observed UX A/B Lab Wave 1, 2026-04-25).
+ *
+ *    2. Some producers emit a summary in `parts[].text` AND details in
+ *       `artifacts[].parts[].text` (Hermes does this for tool calls).
+ *       The previous "first source wins" returned only the summary;
+ *       artifacts dropped silently. */
 export function extractResponseText(body: Record<string, unknown>): string {
   try {
     // {result: "text"} — from MCP server delegation logs
     if (typeof body.result === "string") return body.result;
 
-    // A2A JSON-RPC response: {result: {parts: [{kind: "text", text: "..."}]}}
     const result = body.result as Record<string, unknown> | undefined;
     if (result) {
+      const collected: string[] = [];
+
+      // A2A JSON-RPC: {result: {parts: [{kind: "text", text: "..."}]}}
+      const fromParts = extractTextsFromParts(result.parts);
+      if (fromParts) collected.push(fromParts);
+
+      // Older nested shape: {parts: [{root: {text: "..."}}]}
       const parts = (result.parts || []) as Array<Record<string, unknown>>;
+      const rootTexts: string[] = [];
       for (const p of parts) {
-        const t = (p.text as string) || "";
-        if (t) return t;
         const root = p.root as Record<string, unknown> | undefined;
-        if (root?.text) return root.text as string;
+        if (root?.text) rootTexts.push(root.text as string);
       }
+      if (rootTexts.length > 0) collected.push(rootTexts.join("\n"));
+
+      // Task shape: {result: {artifacts: [{parts: [...]}]}}
+      const artifacts = result.artifacts as Array<Record<string, unknown>> | undefined;
+      if (artifacts) {
+        for (const a of artifacts) {
+          const t = extractTextsFromParts(a.parts);
+          if (t) collected.push(t);
+        }
+      }
+
+      if (collected.length > 0) return collected.join("\n");
     }
 
     // {task: "text"} — request body format, shouldn't be in response but handle it
diff --git a/canvas/src/components/tabs/chat/types.ts b/canvas/src/components/tabs/chat/types.ts
index a5bfa3a0..a03cb459 100644
--- a/canvas/src/components/tabs/chat/types.ts
+++ b/canvas/src/components/tabs/chat/types.ts
@@ -1,12 +1,38 @@
+/** One file attached to a chat message. Shared shape for both
+ *  directions: when a user attaches a file the UI uploads it and
+ *  stashes the returned metadata here; when an agent returns a
+ *  `kind: file` part in an A2A response, the parser populates the
+ *  same fields. `uri` uses the `workspace:<abs-path>` scheme the
+ *  server returns — the renderer translates that to a download
+ *  request against GET /workspaces/:id/chat/download. */
+export interface ChatAttachment {
+  name: string;
+  uri: string;
+  mimeType?: string;
+  size?: number;
+}
+
 export interface ChatMessage {
   id: string;
   role: "user" | "agent" | "system";
   content: string;
+  /** Attachments sent with or returned alongside this message. */
+  attachments?: ChatAttachment[];
   timestamp: string; // ISO string for serialization
 }
 
-export function createMessage(role: ChatMessage["role"], content: string): ChatMessage {
-  return { id: crypto.randomUUID(), role, content, timestamp: new Date().toISOString() };
+export function createMessage(
+  role: ChatMessage["role"],
+  content: string,
+  attachments?: ChatAttachment[],
+): ChatMessage {
+  return {
+    id: crypto.randomUUID(),
+    role,
+    content,
+    attachments: attachments && attachments.length > 0 ? attachments : undefined,
+    timestamp: new Date().toISOString(),
+  };
 }
 
 // appendMessageDeduped adds a ChatMessage to `prev` unless the tail
@@ -25,11 +51,23 @@ export function createMessage(role: ChatMessage["role"], content: string): ChatM
 // messages ("hi", "hi") from a real user/agent still render.
 export function appendMessageDeduped(prev: ChatMessage[], msg: ChatMessage, dedupeWindowMs = 3000): ChatMessage[] {
   const cutoff = Date.now() - dedupeWindowMs;
+  const sig = attachmentSignature(msg.attachments);
   const alreadyThere = prev.some((m) => {
     if (m.role !== msg.role || m.content !== msg.content) return false;
+    // Attachments participate in the dedupe key so a text-only push
+    // doesn't shadow the file-carrying HTTP response (and vice versa).
+    // When both carry the same text AND the same files, collapse.
+    if (attachmentSignature(m.attachments) !== sig) return false;
     const t = Date.parse(m.timestamp);
     return !Number.isNaN(t) && t >= cutoff;
   });
   if (alreadyThere) return prev;
   return [...prev, msg];
 }
+
+function attachmentSignature(atts: ChatAttachment[] | undefined): string {
+  if (!atts || atts.length === 0) return "";
+  // URI is the stable identity — name can differ across delivery
+  // paths (agent vs our parser's basename fallback).
+  return atts.map((a) => a.uri).sort().join("|");
+}
diff --git a/canvas/src/components/tabs/chat/uploads.ts b/canvas/src/components/tabs/chat/uploads.ts
new file mode 100644
index 00000000..04209dde
--- /dev/null
+++ b/canvas/src/components/tabs/chat/uploads.ts
@@ -0,0 +1,135 @@
+import { PLATFORM_URL } from "@/lib/api";
+import { getTenantSlug } from "@/lib/tenant";
+import type { ChatAttachment } from "./types";
+
+/** Chat attachments are intentionally uploaded via a direct fetch()
+ *  instead of the `api.post` helper — `api.post` JSON-stringifies the
+ *  body, which would 500 on a Blob. Mirrors the header plumbing
+ *  (tenant slug, admin token, credentials) so SaaS + self-hosted
+ *  callers work the same way. */
+export async function uploadChatFiles(
+  workspaceId: string,
+  files: File[],
+): Promise<ChatAttachment[]> {
+  if (files.length === 0) return [];
+
+  const form = new FormData();
+  for (const f of files) form.append("files", f, f.name);
+
+  const headers: Record<string, string> = {};
+  const slug = getTenantSlug();
+  if (slug) headers["X-Molecule-Org-Slug"] = slug;
+  const adminToken = process.env.NEXT_PUBLIC_ADMIN_TOKEN;
+  if (adminToken) headers["Authorization"] = `Bearer ${adminToken}`;
+
+  // Uploads legitimately take a while on cold cache (tar write +
+  // docker cp into the container). 60s is comfortable for the 25MB/
+  // 50MB caps the server enforces.
+  const res = await fetch(`${PLATFORM_URL}/workspaces/${workspaceId}/chat/uploads`, {
+    method: "POST",
+    headers,
+    body: form,
+    credentials: "include",
+    signal: AbortSignal.timeout(60_000),
+  });
+  if (!res.ok) {
+    const text = await res.text().catch(() => "");
+    throw new Error(`upload failed: ${res.status} ${text}`);
+  }
+  const json = (await res.json()) as { files: ChatAttachment[] };
+  return json.files ?? [];
+}
+
+/** Resolve a file URI into a browser-downloadable URL. Accepts:
+ *    - `workspace:<abs-path>` (our canonical form)
+ *    - `file:///workspace/...` (some agents emit this)
+ *    - `/workspace/...` (bare absolute path inside the container)
+ *  Everything that looks like an allowed-root container path is
+ *  rewritten to the authenticated /chat/download endpoint. HTTP(S)
+ *  URIs pass through unchanged so we can also render links to
+ *  artefacts hosted off-platform. Unknown schemes fall back to the
+ *  raw URI — the caller gets to decide how to render it. */
+export function resolveAttachmentHref(
+  workspaceId: string,
+  uri: string,
+): string {
+  const containerPath = normalizeWorkspaceUri(uri);
+  if (containerPath) {
+    return `${PLATFORM_URL}/workspaces/${workspaceId}/chat/download?path=${encodeURIComponent(containerPath)}`;
+  }
+  return uri;
+}
+
+/** Extracts the absolute container path from a workspace-scoped URI,
+ *  or null if the URI isn't a container path. The matching roots
+ *  mirror the server's `allowedRoots` allowlist. */
+const ALLOWED_CONTAINER_ROOTS = ["/configs", "/workspace", "/home", "/plugins"];
+
+function normalizeWorkspaceUri(uri: string): string | null {
+  let path: string | null = null;
+  if (uri.startsWith("workspace:")) {
+    path = uri.slice("workspace:".length);
+  } else if (uri.startsWith("file:///")) {
+    path = uri.slice("file://".length); // keep the leading slash
+  } else if (uri.startsWith("/")) {
+    path = uri;
+  }
+  if (!path) return null;
+  // Only rewrite when the path lands in an allowed root; otherwise
+  // return null so the caller falls through to raw-URI handling
+  // (which will open a new tab for HTTP-ish schemes).
+  for (const root of ALLOWED_CONTAINER_ROOTS) {
+    if (path === root || path.startsWith(root + "/")) return path;
+  }
+  return null;
+}
+
+/** Trigger a browser download for an attachment. Uses fetch+blob
+ *  rather than an anchor navigation because the download endpoint
+ *  requires workspace auth — and the browser won't attach
+ *  `Authorization: Bearer` or `X-Molecule-Org-Slug` to a bare anchor
+ *  click. A 25MB per-file cap server-side keeps the blob buffer
+ *  bounded. HTTP(S) URIs skip the fetch path and open directly
+ *  since they're off-platform artefacts that we don't own auth for. */
+export async function downloadChatFile(
+  workspaceId: string,
+  attachment: ChatAttachment,
+): Promise<void> {
+  const href = resolveAttachmentHref(workspaceId, attachment.uri);
+  const isContainerPath = normalizeWorkspaceUri(attachment.uri) !== null;
+  if (!isContainerPath) {
+    // External URL — let the browser navigate. Opens in new tab so
+    // the canvas context survives a navigation. `href` here is the
+    // raw URI (http(s), or anything else the agent sent back).
+    window.open(href, "_blank", "noopener,noreferrer");
+    return;
+  }
+
+  const headers: Record<string, string> = {};
+  const slug = getTenantSlug();
+  if (slug) headers["X-Molecule-Org-Slug"] = slug;
+  const adminToken = process.env.NEXT_PUBLIC_ADMIN_TOKEN;
+  if (adminToken) headers["Authorization"] = `Bearer ${adminToken}`;
+
+  const res = await fetch(href, {
+    headers,
+    credentials: "include",
+    signal: AbortSignal.timeout(60_000),
+  });
+  if (!res.ok) {
+    throw new Error(`download failed: ${res.status}`);
+  }
+  const blob = await res.blob();
+  // Revoke the object URL after the click — browsers hold the blob
+  // until the URL is either revoked or the document unloads. 30s is
+  // plenty of headroom for the click → save dialog round-trip.
+  const url = URL.createObjectURL(blob);
+  const a = document.createElement("a");
+  a.href = url;
+  a.download = attachment.name;
+  a.rel = "noopener";
+  document.body.appendChild(a);
+  a.click();
+  a.remove();
+  setTimeout(() => URL.revokeObjectURL(url), 30_000);
+}
diff --git a/canvas/src/hooks/useTemplateDeploy.tsx b/canvas/src/hooks/useTemplateDeploy.tsx
new file mode 100644
index 00000000..4159ff40
--- /dev/null
+++ b/canvas/src/hooks/useTemplateDeploy.tsx
@@ -0,0 +1,170 @@
+"use client";
+
+import { useCallback, useState, type ReactNode } from "react";
+import { api } from "@/lib/api";
+import {
+  checkDeploySecrets,
+  resolveRuntime,
+  type PreflightResult,
+  type Template,
+} from "@/lib/deploy-preflight";
+import { MissingKeysModal } from "@/components/MissingKeysModal";
+
+/**
+ * useTemplateDeploy — shared preflight + POST + modal wiring for
+ * every surface that deploys a workspace from a template.
+ *
+ * Owns: `checkDeploySecrets` call, `MissingKeysModal` render, the
+ * `POST /workspaces` that follows, and per-template `deploying`
+ * state. Returns `modal` as a `ReactNode` ready to place inline.
+ *
+ * Why a hook rather than two copies: the runtime-fallback table
+ * (`resolveRuntime`) and the preflight wiring were previously
+ * copy-pasted between TemplatePalette and EmptyState. When the
+ * copies drifted (palette had the full id-to-runtime map,
+ * empty-state had only the `-default` strip), the two surfaces
+ * could silently disagree on future templates that need a
+ * non-identity mapping. Single owner closes the drift surface.
+ */
+export interface UseTemplateDeployOptions {
+  /** Compute canvas coords for the new workspace. Called once per
+   *  successful deploy. Defaults to random coords in the [100, 500] ×
+   *  [100, 400] band, matching the sidebar palette's historical
+   *  placement. Override for surfaces that want deterministic
+   *  placement (e.g. EmptyState's first-deploy "center-ish" target). */
+  canvasCoords?: () => { x: number; y: number };
+
+  /** Optional post-deploy side effect — passed the id of the new
+   *  workspace. EmptyState uses this to auto-select the node and
+   *  flip the side panel to Chat so a fresh tenant sees something
+   *  useful. */
+  onDeployed?: (workspaceId: string) => void;
+}
+
+/** Paired template + preflight result carried through the "user
+ *  clicked deploy → modal opens → keys saved → retry" loop. Named
+ *  so the `useState` generic and any future signature change have
+ *  a single place to track. */
+interface MissingKeysInfo {
+  template: Template;
+  preflight: PreflightResult;
+}
+
+export interface UseTemplateDeployResult {
+  /** Template id currently being deployed (incl. the preflight
+   *  network call), or null when idle. Callers pass this to disable
+   *  the relevant button and show a spinner. */
+  deploying: string | null;
+
+  /** Last deploy error message, or null. Cleared on next `deploy`
+   *  call. */
+  error: string | null;
+
+  /** Kick off a deploy. Opens the missing-keys modal if preflight
+   *  returns not-ok; otherwise fires POST /workspaces directly. */
+  deploy: (template: Template) => Promise<void>;
+
+  /** The missing-keys modal, ready to place inline. Always non-null
+   *  (the underlying component self-gates on `open`), so the caller
+   *  can drop `{modal}` anywhere without conditionals. */
+  modal: ReactNode;
+}
+
+export function useTemplateDeploy(
+  options: UseTemplateDeployOptions = {},
+): UseTemplateDeployResult {
+  const [deploying, setDeploying] = useState<string | null>(null);
+  const [error, setError] = useState<string | null>(null);
+  const [missingKeysInfo, setMissingKeysInfo] = useState<MissingKeysInfo | null>(null);
+
+  const { canvasCoords, onDeployed } = options;
+
+  /** Actually execute the POST /workspaces call. Split from `deploy`
+   *  so the "modal → keys added → retry" path can reuse it without
+   *  re-running preflight (the user just proved the keys are now set). */
+  const executeDeploy = useCallback(
+    async (template: Template) => {
+      setDeploying(template.id);
+      setError(null);
+      try {
+        const coords = canvasCoords
+          ? canvasCoords()
+          : {
+              x: Math.random() * 400 + 100,
+              y: Math.random() * 300 + 100,
+            };
+        const ws = await api.post<{ id: string }>("/workspaces", {
+          name: template.name,
+          template: template.id,
+          tier: template.tier,
+          canvas: coords,
+        });
+        onDeployed?.(ws.id);
+      } catch (e) {
+        setError(e instanceof Error ? e.message : "Deploy failed");
+      } finally {
+        setDeploying(null);
+      }
+    },
+    [canvasCoords, onDeployed],
+  );
+
+  const deploy = useCallback(
+    async (template: Template) => {
+      setDeploying(template.id);
+      setError(null);
+      let preflight: PreflightResult;
+      try {
+        const runtime = template.runtime ?? resolveRuntime(template.id);
+        preflight = await checkDeploySecrets({
+          runtime,
+          models: template.models,
+          required_env: template.required_env,
+        });
+      } catch (e) {
+        // Preflight network failure used to strand `deploying` — the
+        // button stayed disabled forever because the throw bypassed
+        // the setDeploying(null) in the non-ok branch below. Any
+        // future refactor that drops this try block will regress the
+        // same way; keep it narrow around just the preflight call
+        // so a successful preflight still lets executeDeploy own
+        // its own error path.
+        setError(e instanceof Error ? e.message : "Preflight check failed");
+        setDeploying(null);
+        return;
+      }
+      if (!preflight.ok) {
+        setMissingKeysInfo({ template, preflight });
+        setDeploying(null);
+        return;
+      }
+      await executeDeploy(template);
+    },
+    [executeDeploy],
+  );
+
+  // No useCallback here — consumers call this on every render anyway
+  // (it's placed inline in JSX), and useCallback's deps would
+  // invalidate on every state change, making the memoisation a wash.
+  // Plain ReactNode is simpler and equally performant.
+  const modal: ReactNode = (
+    <MissingKeysModal
+      open={!!missingKeysInfo}
+      missingKeys={missingKeysInfo?.preflight.missingKeys ?? []}
+      providers={missingKeysInfo?.preflight.providers ?? []}
+      runtime={missingKeysInfo?.preflight.runtime ?? ""}
+      onKeysAdded={() => {
+        if (missingKeysInfo) {
+          const template = missingKeysInfo.template;
+          setMissingKeysInfo(null);
+          // Intentional fire-and-forget — executeDeploy manages
+          // its own error state via setError.
+          void executeDeploy(template);
+        }
+      }}
+      onCancel={() => setMissingKeysInfo(null)}
+    />
+  );
+
+  return { deploying, error, deploy, modal };
+}
diff --git a/canvas/src/lib/__tests__/api.test.ts b/canvas/src/lib/__tests__/api.test.ts
index 09eb0eff..d95e367b 100644
--- a/canvas/src/lib/__tests__/api.test.ts
+++ b/canvas/src/lib/__tests__/api.test.ts
@@ -7,7 +7,7 @@ import { describe, it, expect, beforeEach, afterEach, vi } from "vitest";
 const mockFetch = vi.fn();
 globalThis.fetch = mockFetch;
 
-import { api } from "../api";
+import { api, PlatformUnavailableError } from "../api";
 
 // ---------------------------------------------------------------------------
 // Helpers
@@ -380,3 +380,99 @@ describe("api – request timeout signal", () => {
     expect(sigA).not.toBe(sigB);
   });
 });
+
+// ---------------------------------------------------------------------------
+// PlatformUnavailableError classification
+// ---------------------------------------------------------------------------
+//
+// When the platform's wsauth middleware can't reach Postgres/Redis to
+// validate a token, it returns 503 + {error, code:"platform_unavailable"}.
+// api.ts must surface that as a typed error so the page-level renderer
+// can show a dedicated diagnostic instead of a generic 5xx toast.
+
+describe("PlatformUnavailableError classification", () => {
+  beforeEach(() => {
+    mockFetch.mockReset();
+  });
+
+  function mock503Platform(detail = "platform datastore unavailable — retry shortly") {
+    const body = JSON.stringify({ error: detail, code: "platform_unavailable" });
+    mockFetch.mockResolvedValueOnce({
+      ok: false,
+      status: 503,
+      json: () => Promise.reject(new Error("not used")),
+      text: () => Promise.resolve(body),
+    } as unknown as Response);
+  }
+
+  it("throws PlatformUnavailableError on 503 + code=platform_unavailable", async () => {
+    mock503Platform();
+    let thrown: unknown;
+    try {
+      await api.get("/workspaces");
+    } catch (e) {
+      thrown = e;
+    }
+    expect(thrown).toBeInstanceOf(PlatformUnavailableError);
+    expect((thrown as PlatformUnavailableError).code).toBe("platform_unavailable");
+  });
+
+  it("preserves the server-provided error string as the Error message", async () => {
+    mock503Platform("Postgres unreachable");
+    try {
+      await api.get("/workspaces");
+    } catch (e) {
+      expect(e).toBeInstanceOf(PlatformUnavailableError);
+      expect((e as Error).message).toBe("Postgres unreachable");
+      return;
+    }
+    throw new Error("expected to throw");
+  });
+
+  it("does NOT classify a generic 503 (no platform_unavailable code) as PlatformUnavailableError", async () => {
+    // Generic upstream-busy 503 — should keep the legacy generic-Error
+    // path so existing busy-retry UX isn't disrupted.
+    mockFetch.mockResolvedValueOnce({
+      ok: false,
+      status: 503,
+      json: () => Promise.reject(new Error("not used")),
+      text: () => Promise.resolve(JSON.stringify({ error: "upstream busy" })),
+    } as unknown as Response);
+    try {
+      await api.get("/workspaces/x/a2a");
+    } catch (e) {
+      expect(e).not.toBeInstanceOf(PlatformUnavailableError);
+      expect((e as Error).message).toContain("503");
+      return;
+    }
+    throw new Error("expected to throw");
+  });
+
+  it("does NOT classify on 500 (server kept legacy 500 for true internal errors)", async () => {
+    mockFailure(500, "boom");
+    try {
+      await api.get("/workspaces");
+    } catch (e) {
+      expect(e).not.toBeInstanceOf(PlatformUnavailableError);
+      return;
+    }
+    throw new Error("expected to throw");
+  });
+
+  it("falls back to generic Error when 503 body isn't JSON", async () => {
+    mockFetch.mockResolvedValueOnce({
+      ok: false,
+      status: 503,
+      json: () => Promise.reject(new Error("not used")),
+      text: () => Promise.resolve("Service Unavailable"),
+    } as unknown as Response);
+    try {
+      await api.get("/workspaces");
+    } catch (e) {
+      expect(e).not.toBeInstanceOf(PlatformUnavailableError);
+      expect((e as Error).message).toContain("503");
+      return;
+    }
+    throw new Error("expected to throw");
+  });
+});
diff --git a/canvas/src/lib/api.ts b/canvas/src/lib/api.ts
index edd1d696..dae1152b 100644
--- a/canvas/src/lib/api.ts
+++ b/canvas/src/lib/api.ts
@@ -107,11 +107,39 @@ async function request<T>(
   }
   if (!res.ok) {
     const text = await res.text();
+    // Recognise the platform's structured "datastore unreachable"
+    // shape (returned by wsauth_middleware.abortAuthLookupError when
+    // Postgres/Redis is down). Surface as a typed error so callers
+    // can render a dedicated diagnostic instead of a generic toast.
+    if (res.status === 503 && text) {
+      try {
+        const parsed = JSON.parse(text) as { code?: string; error?: string };
+        if (parsed.code === "platform_unavailable") {
+          throw new PlatformUnavailableError(parsed.error || "platform datastore unavailable");
+        }
+      } catch (err) {
+        // Re-throw the typed error if that's what we just constructed.
+        // JSON.parse failures fall through to the generic Error below.
+        if (err instanceof PlatformUnavailableError) throw err;
+      }
+    }
     throw new Error(`API ${method} ${path}: ${res.status} ${text}`);
   }
   return res.json();
 }
 
+/** Thrown when the platform reports its datastore (Postgres/Redis) is
+ *  unreachable. Surface with a dedicated diagnostic UI rather than a
+ *  generic API-error toast — the user's next action is to check local
+ *  services, not to retry the API call. */
+export class PlatformUnavailableError extends Error {
+  readonly code = "platform_unavailable" as const;
+  constructor(message: string) {
+    super(message);
+    this.name = "PlatformUnavailableError";
+  }
+}
+
 export const api = {
   get: <T>(path: string, options?: RequestOptions) => request<T>("GET", path, undefined, 0, options),
   post: <T>(path: string, body?: unknown, options?: RequestOptions) => request<T>("POST", path, body, 0, options),
diff --git a/canvas/src/lib/deploy-preflight.ts b/canvas/src/lib/deploy-preflight.ts
index d333caaf..a1f1d7a6 100644
--- a/canvas/src/lib/deploy-preflight.ts
+++ b/canvas/src/lib/deploy-preflight.ts
@@ -33,6 +33,46 @@ export interface TemplateLike {
   required_env?: string[];
 }
 
+/** Full /templates response shape shared by TemplatePalette (sidebar)
+ *  and EmptyState (welcome grid). Was previously re-declared in each
+ *  with subtly different fields — EmptyState's narrower shape silently
+ *  dropped `runtime`, `models`, and `required_env`, so the preflight
+ *  couldn't see provider alternatives the template declared. Keep this
+ *  the single source of truth.  */
+export interface Template extends TemplateLike {
+  id: string;
+  name: string;
+  description: string;
+  tier: number;
+  model: string;
+  skills: string[];
+  skill_count: number;
+}
+
+/** Map from a template id to the runtime name the per-workspace
+ *  preflight expects. Used only when the server's `/templates`
+ *  response predates the `runtime` field on the summary (legacy
+ *  installs) — modern responses carry it verbatim. Strip `-default`
+ *  for the claude-code template and identity-map everything else
+ *  that matches our current runtime registry.
+ *
+ *  Lives in the preflight module (not TemplatePalette) so EmptyState
+ *  uses the SAME fallback table. A previous duplication in both call
+ *  sites left EmptyState with only the `-default` suffix strip, which
+ *  would silently disagree with TemplatePalette on templates whose
+ *  id needs a non-identity mapping. */
+export function resolveRuntime(templateId: string): string {
+  const runtimeMap: Record<string, string> = {
+    langgraph: "langgraph",
+    "claude-code-default": "claude-code",
+    openclaw: "openclaw",
+    deepagents: "deepagents",
+    crewai: "crewai",
+    autogen: "autogen",
+  };
+  return runtimeMap[templateId] ?? templateId.replace(/-default$/, "");
+}
+
 export interface SecretEntry {
   key: string;
   has_value: boolean;
diff --git a/canvas/src/store/__tests__/canvas-batch-partial-failure.test.ts b/canvas/src/store/__tests__/canvas-batch-partial-failure.test.ts
index ee3771ff..7f5594a9 100644
--- a/canvas/src/store/__tests__/canvas-batch-partial-failure.test.ts
+++ b/canvas/src/store/__tests__/canvas-batch-partial-failure.test.ts
@@ -5,27 +5,34 @@ import { describe, it, expect, beforeEach, vi } from "vitest";
 global.fetch = vi.fn();
 
 import { useCanvasStore } from "../canvas";
-import type { WorkspaceData } from "../socket";
+import type { WorkspaceNodeData } from "../canvas";
 
-function makeWS(overrides: Partial<WorkspaceData> & { id: string }): WorkspaceData {
+function makeWS(
+  overrides: Partial<WorkspaceNodeData> & { id: string },
+): WorkspaceNodeData {
+  // makeWS builds a minimal WorkspaceNodeData for tests that set state
+  // directly on the store (bypassing hydrate). The `id` override is
+  // ignored — node IDs live on the outer Node<> wrapper, not inside
+  // `data`. It's accepted here so callers can keep their existing
+  // `makeWS({ id: "ws-foo" })` call sites even though the id is only
+  // used on the Node<> wrapper at the call site.
+  void overrides.id;
   return {
     name: "WS",
     role: "agent",
     tier: 1,
     status: "online",
-    agent_card: null,
+    agentCard: null,
     url: "http://localhost:9000",
-    parent_id: null,
-    active_tasks: 0,
-    last_error_rate: 0,
-    last_sample_error: "",
-    uptime_seconds: 60,
-    current_task: "",
-    x: 0,
-    y: 0,
+    parentId: null,
+    activeTasks: 0,
+    lastErrorRate: 0,
+    lastSampleError: "",
+    currentTask: "",
     collapsed: false,
     runtime: "",
-    budget_limit: null,
+    needsRestart: false,
+    budgetLimit: null,
     ...overrides,
   };
 }
@@ -148,13 +155,13 @@ describe("batchRestart — partial failure", () => {
           id: "ws-ok",
           type: "workspace",
           position: { x: 0, y: 0 },
-          data: { ...makeWS({ id: "ws-ok" }), needsRestart: true } as WorkspaceData & { needsRestart: boolean },
+          data: { ...makeWS({ id: "ws-ok" }), needsRestart: true } as WorkspaceNodeData,
         },
         {
           id: "ws-fail",
           type: "workspace",
           position: { x: 0, y: 0 },
-          data: { ...makeWS({ id: "ws-fail" }), needsRestart: true } as WorkspaceData & { needsRestart: boolean },
+          data: { ...makeWS({ id: "ws-fail" }), needsRestart: true } as WorkspaceNodeData,
         },
       ],
       selectedNodeIds: new Set(["ws-ok", "ws-fail"]),
@@ -166,7 +173,7 @@ describe("batchRestart — partial failure", () => {
     });
 
     const byId = Object.fromEntries(
-      useCanvasStore.getState().nodes.map((n) => [n.id, n.data as WorkspaceData & { needsRestart?: boolean }])
+      useCanvasStore.getState().nodes.map((n) => [n.id, n.data as WorkspaceNodeData])
     );
     expect(byId["ws-ok"].needsRestart).toBe(false);
     expect(byId["ws-fail"].needsRestart).toBe(true);
@@ -179,7 +186,7 @@ describe("batchRestart — partial failure", () => {
           id: "ws-fail",
           type: "workspace",
           position: { x: 0, y: 0 },
-          data: { ...makeWS({ id: "ws-fail" }), needsRestart: true } as WorkspaceData & { needsRestart: boolean },
+          data: { ...makeWS({ id: "ws-fail" }), needsRestart: true } as WorkspaceNodeData,
         },
       ],
       selectedNodeIds: new Set(["ws-fail"]),
diff --git a/canvas/src/store/__tests__/canvas-events-pan.test.ts b/canvas/src/store/__tests__/canvas-events-pan.test.ts
index 77c687fc..499014c0 100644
--- a/canvas/src/store/__tests__/canvas-events-pan.test.ts
+++ b/canvas/src/store/__tests__/canvas-events-pan.test.ts
@@ -67,7 +67,19 @@ describe("canvas-events – molecule:pan-to-node dispatch", () => {
     vi.restoreAllMocks();
   });
 
-  it("dispatches molecule:pan-to-node with the new nodeId for a NEW provision", () => {
+  it("dispatches both molecule:pan-to-node AND molecule:fit-deploying-org for a NEW root-level provision", () => {
+    // Two custom events are dispatched on NEW root-level provision:
+    //   1. molecule:fit-deploying-org — tells useCanvasViewport to
+    //      frame the whole deploying subtree. Fires for root nodes
+    //      too (commit 5adc8a74) so the canvas centers the just-
+    //      landed root immediately instead of waiting for the
+    //      first child to arrive.
+    //   2. molecule:pan-to-node — pans/zooms to the single node;
+    //      only for standalone creates (no parent), so org-import
+    //      children don't chase the spawn animation.
+    // A previous version of this test expected only #2 and failed
+    // when #1 was added for roots. If only one of these ever fires
+    // again, this test should flag the regression.
     const { get, set } = makeStore([]);
     const dispatched: Event[] = [];
     const spy = vi.spyOn(window, "dispatchEvent").mockImplementation((e) => {
@@ -81,9 +93,15 @@ describe("canvas-events – molecule:pan-to-node dispatch", () => {
       set
     );
 
-    expect(dispatched).toHaveLength(1);
-    expect(dispatched[0].type).toBe("molecule:pan-to-node");
-    expect((dispatched[0] as CustomEvent).detail?.nodeId).toBe("ws-new");
+    expect(dispatched).toHaveLength(2);
+    const panEvent = dispatched.find((e) => e.type === "molecule:pan-to-node");
+    const fitEvent = dispatched.find((e) => e.type === "molecule:fit-deploying-org");
+    expect(panEvent, "molecule:pan-to-node should fire for standalone create").toBeDefined();
+    expect(fitEvent, "molecule:fit-deploying-org should fire so the viewport frames the root").toBeDefined();
+    expect((panEvent as CustomEvent).detail?.nodeId).toBe("ws-new");
+    expect((fitEvent as CustomEvent).detail?.rootId).toBe("ws-new");
+
+    spy.mockRestore();
   });
 
   it("does NOT dispatch molecule:pan-to-node when restarting an existing node", () => {
diff --git a/canvas/src/store/__tests__/canvas-topology.test.ts b/canvas/src/store/__tests__/canvas-topology.test.ts
index db046e80..4cee168d 100644
--- a/canvas/src/store/__tests__/canvas-topology.test.ts
+++ b/canvas/src/store/__tests__/canvas-topology.test.ts
@@ -149,6 +149,75 @@ describe("buildNodesAndEdges – parent + child workspaces", () => {
   });
 });
 
+describe("buildNodesAndEdges – auto-rescue respects live grown parent size", () => {
+  // Regression: child the user dragged into a user-grown area was
+  // false-rescued by every periodic rehydrate (socket health check
+  // every 30s) because the rescue heuristic used the initial
+  // grid-derived parent bbox, not the currently-grown size. Result:
+  // child snapped to a stale grid slot, then settled back ~1 frame
+  // later when growParentsToFitChildren re-ran. Observed 2026-04-25
+  // as "child jumps to weird location, then 30s later it's fine".
+
+  it("does NOT rescue a child placed inside the user-grown parent area", () => {
+    // Parent's initial grid-derived size is small; user has since grown it
+    // to 800×600. Child sits at relative (700, 400) — inside the grown
+    // bbox but outside the initial bbox. Without currentParentSizes,
+    // the rescue would re-place the child into a default grid slot.
+    const parentAbs = { x: 100, y: 100 };
+    const childAbs = { x: parentAbs.x + 700, y: parentAbs.y + 400 };
+    const workspaces = [
+      makeWS({ id: "parent", x: parentAbs.x, y: parentAbs.y }),
+      makeWS({ id: "child", parent_id: "parent", x: childAbs.x, y: childAbs.y }),
+    ];
+    const grownDims = new Map([
+      ["parent", { width: 800, height: 600 }],
+    ]);
+
+    const { nodes } = buildNodesAndEdges(workspaces, new Map(), grownDims);
+    const child = nodes.find((n) => n.id === "child")!;
+    // Child's relative position should match what we passed in.
+    expect(child.position).toEqual({ x: 700, y: 400 });
+  });
+
+  it("DOES rescue a child whose stored position is outside even the grown parent", () => {
+    // Same parent but child is way outside (relative 5000, 5000).
+    // The rescue must still fire — the heuristic isn't "always trust
+    // the user", it's "trust the user up to the current parent bbox".
+    const parentAbs = { x: 100, y: 100 };
+    const childAbs = { x: parentAbs.x + 5000, y: parentAbs.y + 5000 };
+    const workspaces = [
+      makeWS({ id: "parent", x: parentAbs.x, y: parentAbs.y }),
+      makeWS({ id: "child", parent_id: "parent", x: childAbs.x, y: childAbs.y }),
+    ];
+    const grownDims = new Map([
+      ["parent", { width: 800, height: 600 }],
+    ]);
+
+    const { nodes } = buildNodesAndEdges(workspaces, new Map(), grownDims);
+    const child = nodes.find((n) => n.id === "child")!;
+    // Rescued: NOT the original (5000, 5000); some grid slot instead.
+    expect(child.position.x).toBeLessThan(5000);
+    expect(child.position.y).toBeLessThan(5000);
+  });
+
+  it("falls back to initial-min bbox when no live size is provided (preserves legacy behavior)", () => {
+    // Empty currentParentSizes — first hydrate or test without store
+    // priming. Child outside the initial bbox should still be rescued.
+    const parentAbs = { x: 100, y: 100 };
+    const childAbs = { x: parentAbs.x + 700, y: parentAbs.y + 400 };
+    const workspaces = [
+      makeWS({ id: "parent", x: parentAbs.x, y: parentAbs.y }),
+      makeWS({ id: "child", parent_id: "parent", x: childAbs.x, y: childAbs.y }),
+    ];
+
+    const { nodes } = buildNodesAndEdges(workspaces);
+    const child = nodes.find((n) => n.id === "child")!;
+    // Without a live size hint, the initial bbox applies — rescue
+    // fires, child gets a fresh slot, NOT the user-supplied (700,400).
+    expect(child.position).not.toEqual({ x: 700, y: 400 });
+  });
+});
+
 describe("buildNodesAndEdges – deeply nested hierarchy", () => {
   it("handles three levels of nesting", () => {
     const workspaces = [
diff --git a/canvas/src/store/__tests__/canvas.test.ts b/canvas/src/store/__tests__/canvas.test.ts
index df0460f6..790833f0 100644
--- a/canvas/src/store/__tests__/canvas.test.ts
+++ b/canvas/src/store/__tests__/canvas.test.ts
@@ -484,6 +484,70 @@ describe("removeNode", () => {
   });
 });
 
+// ---------- removeSubtree ----------
+
+describe("removeSubtree", () => {
+  beforeEach(() => {
+    useCanvasStore.getState().hydrate([
+      makeWS({ id: "root" }),
+      makeWS({ id: "mid", parent_id: "root" }),
+      makeWS({ id: "leaf", parent_id: "mid" }),
+      makeWS({ id: "sibling", parent_id: "root" }),
+      makeWS({ id: "unrelated" }), // separate root
+    ]);
+  });
+
+  it("removes the root and every descendant in one shot", () => {
+    useCanvasStore.getState().removeSubtree("root");
+    const ids = useCanvasStore
+      .getState()
+      .nodes.map((n) => n.id)
+      .sort();
+    expect(ids).toEqual(["unrelated"]);
+  });
+
+  it("removes a mid-level node and its descendants but leaves siblings + ancestors", () => {
+    useCanvasStore.getState().removeSubtree("mid");
+    const ids = useCanvasStore
+      .getState()
+      .nodes.map((n) => n.id)
+      .sort();
+    expect(ids).toEqual(["root", "sibling", "unrelated"]);
+  });
+
+  it("removing a leaf is a no-op cascade (just drops the leaf)", () => {
+    useCanvasStore.getState().removeSubtree("leaf");
+    const ids = useCanvasStore
+      .getState()
+      .nodes.map((n) => n.id)
+      .sort();
+    expect(ids).toEqual(["mid", "root", "sibling", "unrelated"]);
+  });
+
+  it("clears selection when the selected node is anywhere in the removed subtree", () => {
+    useCanvasStore.getState().selectNode("leaf");
+    useCanvasStore.getState().removeSubtree("root");
+    expect(useCanvasStore.getState().selectedNodeId).toBeNull();
+  });
+
+  it("preserves selection when the selected node is outside the removed subtree", () => {
+    useCanvasStore.getState().selectNode("unrelated");
+    useCanvasStore.getState().removeSubtree("root");
+    expect(useCanvasStore.getState().selectedNodeId).toBe("unrelated");
+  });
+
+  it("drops edges incident to any removed node", () => {
+    // The hydrate-built edges connect parent → child. After removing
+    // `root`, no edge involving root/mid/leaf/sibling should remain.
+    useCanvasStore.getState().removeSubtree("root");
+    const remaining = useCanvasStore.getState().edges;
+    for (const e of remaining) {
+      expect(["root", "mid", "leaf", "sibling"]).not.toContain(e.source);
+      expect(["root", "mid", "leaf", "sibling"]).not.toContain(e.target);
+    }
+  });
+});
+
 // ---------- isDescendant ----------
 
 describe("isDescendant", () => {
diff --git a/canvas/src/store/__tests__/socket.test.ts b/canvas/src/store/__tests__/socket.test.ts
index caef6878..ee7f94b7 100644
--- a/canvas/src/store/__tests__/socket.test.ts
+++ b/canvas/src/store/__tests__/socket.test.ts
@@ -1,7 +1,7 @@
 import { describe, it, expect, beforeEach, afterEach, vi } from "vitest";
 
 // ---------------------------------------------------------------------------
-// Mock the canvas store before importing socket.ts
+// Mock the canvas store and api before importing socket.ts
 // ---------------------------------------------------------------------------
 vi.mock("../canvas", () => ({
   useCanvasStore: {
@@ -13,6 +13,7 @@ vi.mock("../canvas", () => ({
   },
 }));
 
+
 // ---------------------------------------------------------------------------
 // Mock WebSocket
 // ---------------------------------------------------------------------------
@@ -76,7 +77,6 @@ function getLastWS(): MockWebSocket {
 beforeEach(() => {
   MockWebSocket.instances = [];
   vi.useFakeTimers();
-
   // Reset mocked store state
   vi.mocked(useCanvasStore.getState).mockReturnValue({
     applyEvent: vi.fn(),
@@ -263,13 +263,59 @@ describe("WebSocket onclose – auto-reconnect", () => {
     const ws = getLastWS();
     ws.triggerClose();
 
-    // Fast-forward timers to trigger the reconnect
-    vi.runAllTimers();
+    // First reconnect attempt is scheduled at 1s (Math.min(1000 * 2^0,
+    // 30000)). Advance just past that — vi.runAllTimers() would
+    // additionally re-fire the fallback poll setInterval forever and
+    // hit the 10000-timer abort.
+    vi.advanceTimersByTime(1100);
 
     expect(MockWebSocket.instances.length).toBeGreaterThan(1);
   });
 });
 
+describe("HTTP fallback poll while WS unhealthy", () => {
+  it("starts a setInterval after onclose so /workspaces stays fresh", () => {
+    const setIntervalSpy = vi.spyOn(globalThis, "setInterval");
+    connectSocket();
+    const ws = getLastWS();
+    ws.triggerClose();
+    // The fallback poll runs at 10s; the reconnect uses setTimeout, so
+    // any setInterval registered between connect and close must be the
+    // fallback poll.
+    const fallbackCalls = setIntervalSpy.mock.calls.filter(
+      ([, delay]) => delay === 10_000,
+    );
+    expect(fallbackCalls.length).toBeGreaterThan(0);
+    setIntervalSpy.mockRestore();
+  });
+
+  it("clears the fallback poll once the WS reconnects (onopen)", () => {
+    const clearIntervalSpy = vi.spyOn(globalThis, "clearInterval");
+    connectSocket();
+    const ws = getLastWS();
+    ws.triggerClose(); // starts fallback poll
+    clearIntervalSpy.mockClear();
+    // Advance past the first reconnect delay so a fresh ws exists,
+    // then trigger its open.
+    vi.advanceTimersByTime(1100);
+    const ws2 = getLastWS();
+    ws2.triggerOpen();
+    expect(clearIntervalSpy).toHaveBeenCalled();
+    clearIntervalSpy.mockRestore();
+  });
+
+  it("clears the fallback poll on disconnect", () => {
+    const clearIntervalSpy = vi.spyOn(globalThis, "clearInterval");
+    connectSocket();
+    const ws = getLastWS();
+    ws.triggerClose(); // starts fallback poll
+    clearIntervalSpy.mockClear();
+    disconnectSocket();
+    expect(clearIntervalSpy).toHaveBeenCalled();
+    clearIntervalSpy.mockRestore();
+  });
+});
+
 // ---------------------------------------------------------------------------
 // onerror handler
 // ---------------------------------------------------------------------------
@@ -328,3 +374,45 @@ describe("health check", () => {
     clearIntervalSpy.mockRestore();
   });
 });
+
+// Rehydrate dedup logic itself is exercised by `RehydrateDedup` unit
+// tests in this file (below). End-to-end coupling through the
+// dynamic-imported `@/lib/api` was non-trivial under our existing
+// fake-timer setup; isolating the gate in a pure helper keeps
+// regression coverage without that mocking complexity.
+
+import { RehydrateDedup } from "../socket";
+
+describe("RehydrateDedup", () => {
+  it("first call passes the gate (no prior fetch)", () => {
+    const d = new RehydrateDedup(1500);
+    expect(d.shouldSkip(0)).toBe(false);
+  });
+
+  it("blocks while a fetch is in flight", () => {
+    const d = new RehydrateDedup(1500);
+    d.beginFetch();
+    expect(d.shouldSkip(100)).toBe(true);
+  });
+
+  it("blocks within the post-completion window", () => {
+    const d = new RehydrateDedup(1500);
+    d.beginFetch();
+    d.completeFetch(1_000);
+    // 1100 - 1000 = 100 < 1500 → skip
+    expect(d.shouldSkip(1_100)).toBe(true);
+    // 2600 - 1000 = 1600 > 1500 → allow
+    expect(d.shouldSkip(2_600)).toBe(false);
+  });
+
+  it("a completed fetch followed by another beginFetch blocks for the new in-flight", () => {
+    const d = new RehydrateDedup(1500);
+    d.beginFetch();
+    d.completeFetch(1_000);
+    // First wait out the dedup window
+    expect(d.shouldSkip(2_600)).toBe(false);
+    d.beginFetch();
+    // Now a second fetch is in flight; further calls block again
+    expect(d.shouldSkip(2_700)).toBe(true);
+  });
+});
diff --git a/canvas/src/store/canvas-events.ts b/canvas/src/store/canvas-events.ts
index b3778378..77220a79 100644
--- a/canvas/src/store/canvas-events.ts
+++ b/canvas/src/store/canvas-events.ts
@@ -1,7 +1,7 @@
 import type { Node, Edge } from "@xyflow/react";
 import type { WSMessage } from "./socket";
 import type { WorkspaceNodeData } from "./canvas";
-import { extractResponseText } from "@/components/tabs/chat/message-parser";
+import { extractResponseText, extractFilesFromTask } from "@/components/tabs/chat/message-parser";
 
 // ---------------------------------------------------------------------------
 // Monotonically increasing counter used to assign grid positions.
@@ -21,13 +21,46 @@ import { extractResponseText } from "@/components/tabs/chat/message-parser";
 //
 // A monotonic counter is immune to deletions: it only ever increases.
 // ---------------------------------------------------------------------------
+import { appendClass, removeClass, scheduleNodeClassRemoval } from "./classNames";
+
 let _provisioningSequence = 0;
 
 /** Reset the sequence counter — exposed for test teardown only. */
 export function resetProvisioningSequence(): void {
   _provisioningSequence = 0;
+  _pendingOnline.clear();
 }
 
+/** WORKSPACE_ONLINE events that arrived BEFORE the matching
+ *  WORKSPACE_PROVISIONING — buffered here so the late-arriving
+ *  provision event can immediately flip to the correct status
+ *  instead of leaving the node stuck as "provisioning" forever.
+ *  Cleared when applied, or on module reset (tests). */
+const _pendingOnline = new Set<string>();
+
+/** Debounced parent-grow. Each child arrival schedules this; the
+ *  timer keeps resetting as more siblings land, so the actual
+ *  width/height update runs ONCE after arrivals go quiet. Avoids
+ *  the visible size-pulse that happened when growParentsToFitChildren
+ *  ran per event. */
+let _growTimer: ReturnType<typeof setTimeout> | null = null;
+function scheduleParentGrow(): void {
+  if (typeof window === "undefined") return;
+  if (_growTimer) clearTimeout(_growTimer);
+  _growTimer = setTimeout(() => {
+    _growTimer = null;
+    import("./canvas").then(({ useCanvasStore }) => {
+      useCanvasStore.getState().growParentsToFitChildren?.();
+    });
+  }, 300);
+}
+
+// (absoluteNodePosition was used by an earlier "spawn from parent"
+// revision that subtracted parent absolute coords from server-sent
+// absolute child coords. The server now ships parent-relative coords
+// directly, so the walk is no longer needed. Deleted rather than
+// kept as dead code.)
+
 /**
  * Standalone event handler extracted from the canvas store.
  * Applies a single WebSocket event to the current node/edge state.
@@ -38,7 +71,7 @@ export function handleCanvasEvent(
     nodes: Node<WorkspaceNodeData>[];
     edges: Edge[];
     selectedNodeId: string | null;
-    agentMessages: Record<string, Array<{ id: string; content: string; timestamp: string }>>;
+    agentMessages: Record<string, Array<{ id: string; content: string; timestamp: string; attachments?: Array<{ name: string; uri: string; mimeType?: string; size?: number }> }>>;
   },
   set: (partial: Record<string, unknown>) => void,
 ): void {
@@ -47,14 +80,44 @@ export function handleCanvasEvent(
   switch (msg.event) {
     case "WORKSPACE_ONLINE": {
       const existing = nodes.find((n) => n.id === msg.workspace_id);
-      if (existing) {
-        set({
-          nodes: nodes.map((n) =>
-            n.id === msg.workspace_id
-              ? { ...n, data: { ...n.data, status: "online" } }
-              : n
-          ),
-        });
+      if (!existing) {
+        // PROVISIONING event hasn't been applied yet (WS reorder or
+        // this tab joined mid-deploy). Buffer so the later PROVISIONING
+        // handler can flip status in one pass instead of leaving the
+        // node stuck in "provisioning" forever.
+        _pendingOnline.add(msg.workspace_id);
+        break;
+      }
+      // Flip incoming edge from blueprint → laser so the link is
+      // drawn solid the moment this child is live. The laser class
+      // plays the stroke-dashoffset keyframe once; after ~500ms the
+      // edge falls back to the default solid style (see
+      // org-deploy.css and the follow-up setTimeout below).
+      const updatedEdges = edges.map((e) =>
+        e.target === msg.workspace_id && e.className?.includes("mol-deploy-edge-blueprint")
+          ? { ...e, className: "mol-deploy-edge-laser" }
+          : e,
+      );
+      set({
+        edges: updatedEdges,
+        nodes: nodes.map((n) =>
+          n.id === msg.workspace_id
+            ? { ...n, data: { ...n.data, status: "online" } }
+            : n,
+        ),
+      });
+      // Remove the laser class after its keyframe ends so the edge
+      // settles into the app's default solid styling. Fire-and-forget.
+      if (typeof window !== "undefined") {
+        const targetEdgeId = `${existing.data.parentId ?? ""}-${msg.workspace_id}`;
+        window.setTimeout(() => {
+          const s = get();
+          set({
+            edges: s.edges.map((e) =>
+              e.id === targetEdgeId ? { ...e, className: undefined } : e,
+            ),
+          });
+        }, 600);
       }
       break;
     }
@@ -113,25 +176,73 @@ export function handleCanvasEvent(
           ),
         });
       } else {
-        // Spread new nodes in a grid so they don't stack at the viewport origin.
-        // Use the monotonic _provisioningSequence counter (not nodes.length) so
-        // deletions never cause two live nodes to share a grid slot.
-        const GRID_COLS = 4;
-        const COL_SPACING = 320;
-        const ROW_SPACING = 160;
-        const GRID_ORIGIN_X = 100;
-        const GRID_ORIGIN_Y = 100;
-        const idx = _provisioningSequence++;
-        const x = GRID_ORIGIN_X + (idx % GRID_COLS) * COL_SPACING;
-        const y = GRID_ORIGIN_Y + Math.floor(idx / GRID_COLS) * ROW_SPACING;
+        // Payload may carry parent_id + final x/y (org import broadcasts
+        // these so the canvas can animate the "spawn from parent" motion).
+        // Standalone workspace creates still omit them — fall back to the
+        // grid-slot behaviour that handled that case historically.
+        const parentIdRaw = (msg.payload.parent_id as string | undefined) ?? null;
+        const finalX = msg.payload.x as number | undefined;
+        const finalY = msg.payload.y as number | undefined;
 
+        let spawnX: number;
+        let spawnY: number;
+        let targetX: number;
+        let targetY: number;
+        let parentId: string | null = null;
+
+        // Place the node at its final slot immediately — no
+        // spring-from-parent motion. The earlier "materialize from
+        // parent then tween to target" was expensive (two set()
+        // calls + rAF) and produced wrong offsets because the
+        // server sends absolute coords computed against the template's
+        // own coord system while the client had placed the parent at
+        // a grid slot, so the target math always landed off-grid.
+        // Now: server coords are parent-relative (see org_import.go),
+        // we trust them verbatim.
+        const parentInStore = parentIdRaw
+          ? nodes.find((n) => n.id === parentIdRaw)
+          : undefined;
+        if (parentIdRaw && parentInStore && finalX !== undefined && finalY !== undefined) {
+          targetX = finalX;
+          targetY = finalY;
+          parentId = parentIdRaw;
+        } else {
+          // Standalone create OR org-child whose parent hasn't arrived
+          // yet (rare WS reorder) — monotonic-grid placement. The
+          // follow-up hydrate pass reconciles parent_id + the correct
+          // nested position if parent lands later.
+          const GRID_COLS = 4;
+          const COL_SPACING = 320;
+          const ROW_SPACING = 160;
+          const GRID_ORIGIN_X = 100;
+          const GRID_ORIGIN_Y = 100;
+          const idx = _provisioningSequence++;
+          targetX = GRID_ORIGIN_X + (idx % GRID_COLS) * COL_SPACING;
+          targetY = GRID_ORIGIN_Y + Math.floor(idx / GRID_COLS) * ROW_SPACING;
+        }
+        spawnX = targetX;
+        spawnY = targetY;
+
+        // Parent→child relationship is already visible via React
+        // Flow's nested rendering (the child card sits INSIDE the
+        // parent container). An explicit edge on top of that was
+        // visual double-counting and made the canvas look busy;
+        // removed per demo feedback. A2A edges (showA2AEdges) still
+        // render when enabled — those represent runtime traffic,
+        // which nesting doesn't express.
         set({
           nodes: [
             ...nodes,
             {
               id: msg.workspace_id,
               type: "workspaceNode",
-              position: { x, y },
+              position: { x: spawnX, y: spawnY },
+              // React Flow's parentId (distinct from data.parentId)
+              // triggers parent-relative positioning. Set it when the
+              // server told us this is an org-import child so the
+              // node renders nested inside the parent container.
+              ...(parentId ? { parentId } : {}),
+              className: "mol-deploy-spawn",
               data: {
                 name: (msg.payload.name as string) ?? "New Workspace",
                 status: "provisioning",
@@ -143,7 +254,7 @@ export function handleCanvasEvent(
                 lastErrorRate: 0,
                 lastSampleError: "",
                 url: "",
-                parentId: null,
+                parentId, // data.parentId mirrors React Flow's parentId
                 currentTask: "",
                 runtime: (msg.payload.runtime as string) ?? "",
                 needsRestart: false,
@@ -152,8 +263,76 @@ export function handleCanvasEvent(
           ],
         });
 
-        // Pan the canvas to the new node
+        // Grow the parent to fit the just-landed child. DEBOUNCED
+        // across rapid sibling arrivals — firing width/height updates
+        // on every child made the parent card visibly pulse in size
+        // as each kid landed, which read as the parent "flashing
+        // around". One grow pass ~300ms after the last arrival
+        // coalesces the whole burst into a single layout change.
+        if (parentId && typeof window !== "undefined") {
+          scheduleParentGrow();
+        }
+        // Parent-border pulse removed per demo feedback — the soft
+        // box-shadow ring on each arrival compounded with the size
+        // grow to make the whole parent card look unstable. The
+        // dim-light signal on the provisioning child is sufficient
+        // acknowledgement that something is happening.
+
+        // Remove the one-shot spawn class after the keyframe ends so
+        // future re-renders don't replay it.
+        scheduleNodeClassRemoval(msg.workspace_id, "mol-deploy-spawn", 400, get, set);
+
+        // Auto-pan+zoom to the whole deploying org after each
+        // arrival so the user always sees the full picture — unless
+        // they've panned themselves (handled by the viewport hook,
+        // which aborts the fit when the user moved after the last
+        // auto-fit). Event name matches the existing handler in
+        // useCanvasViewport that knows how to compute subtree bounds.
+        //
+        // Fire for roots too (not just children) so the canvas
+        // centers on the just-landed root immediately instead of
+        // waiting for the first child to arrive ~2s later. The
+        // viewport hook walks UP to find the true root, so passing
+        // the node's own id when there's no parent is equivalent
+        // to passing the root.
         if (typeof window !== "undefined") {
+          window.dispatchEvent(
+            new CustomEvent("molecule:fit-deploying-org", {
+              detail: { rootId: parentIdRaw ?? msg.workspace_id },
+            }),
+          );
+        }
+
+        // Race handling: if a WORKSPACE_ONLINE event beat the
+        // matching PROVISIONING to this tab, the online flag was
+        // buffered in _pendingOnline. Apply it now so the node
+        // doesn't stay stuck as "provisioning" forever.
+        //
+        // Only flip to "online" if the current status is still
+        // "provisioning" at drain time. Otherwise a WORKSPACE_DEGRADED
+        // / FAILED / PAUSED that arrived between the set() above and
+        // the scheduled drain would be silently clobbered — the
+        // buffered ONLINE is stale by then.
+        if (_pendingOnline.has(msg.workspace_id)) {
+          _pendingOnline.delete(msg.workspace_id);
+          if (typeof window !== "undefined") {
+            window.setTimeout(() => {
+              const s = get();
+              set({
+                nodes: s.nodes.map((n) =>
+                  n.id === msg.workspace_id && n.data.status === "provisioning"
+                    ? { ...n, data: { ...n.data, status: "online" } }
+                    : n,
+                ),
+              });
+            }, 0);
+          }
+        }
+
+        // Pan the canvas to the new node (standalone create only —
+        // during an org import, zooming to every child chases the
+        // spawn animation around the viewport which is jarring).
+        if (!parentIdRaw && typeof window !== "undefined") {
           window.dispatchEvent(
             new CustomEvent("molecule:pan-to-node", {
               detail: { nodeId: msg.workspace_id },
@@ -252,12 +431,19 @@ export function handleCanvasEvent(
     }
 
     case "A2A_RESPONSE": {
-      // A2A proxy completed — extract response text and store as agent message.
-      // This gives the ChatTab instant response delivery via WebSocket instead of polling.
+      // A2A proxy completed — extract response text AND any `kind: file`
+      // parts. Without the file extraction, agent-returned attachments
+      // delivered via this WebSocket path would disappear (the canvas
+      // would render a text-only message while the HTTP fallback
+      // rendered the same reply with download chips, depending on
+      // which delivery path raced to completion first).
       const responseBody = msg.payload.response_body as Record<string, unknown> | undefined;
       if (responseBody) {
         const text = extractResponseText(responseBody);
-        if (text) {
+        const attachments = extractFilesFromTask(
+          (responseBody.result ?? responseBody) as Record<string, unknown>,
+        );
+        if (text || attachments.length > 0) {
           const { agentMessages } = get();
           const existing = agentMessages[msg.workspace_id] || [];
           set({
@@ -265,7 +451,12 @@ export function handleCanvasEvent(
               ...agentMessages,
               [msg.workspace_id]: [
                 ...existing,
-                { id: crypto.randomUUID(), content: text, timestamp: new Date().toISOString() },
+                {
+                  id: crypto.randomUUID(),
+                  content: text,
+                  timestamp: new Date().toISOString(),
+                  attachments: attachments.length > 0 ? attachments : undefined,
+                },
               ],
             },
           });
diff --git a/canvas/src/store/canvas-topology.ts b/canvas/src/store/canvas-topology.ts
index fbd02601..396b89ff 100644
--- a/canvas/src/store/canvas-topology.ts
+++ b/canvas/src/store/canvas-topology.ts
@@ -280,6 +280,15 @@ export function computeAutoLayout(
  * Accepts an optional layoutOverrides map (from computeAutoLayout) to override
  * positions for workspaces that were at 0,0.
  *
+ * `currentParentSizes` carries the LIVE measured/grown dimensions of parent
+ * nodes from the existing client store. The auto-rescue heuristic below
+ * (line ~445) compares each child's stored relative position against its
+ * parent's bbox; without the live size, the bbox is whatever the
+ * grid-derived initial min-size formula produced. That falsely rescued
+ * children dragged into the user-grown area on every periodic rehydrate
+ * (socket.ts:87 fires every 30s if no WS events seen) — observed
+ * 2026-04-25 as "child jumps to weird location, then settles 30s later".
+ *
  * Parent/child rendering model: every workspace is a first-class React Flow
  * node (full card). When a workspace has parent_id set, its RF `parentId` is
  * set to the parent's id and its position is stored RELATIVE to the parent
@@ -290,7 +299,8 @@ export function computeAutoLayout(
  */
 export function buildNodesAndEdges(
   workspaces: WorkspaceData[],
-  layoutOverrides: Map<string, { x: number; y: number }> = new Map()
+  layoutOverrides: Map<string, { x: number; y: number }> = new Map(),
+  currentParentSizes: Map<string, { width: number; height: number }> = new Map(),
 ): {
   nodes: Node<WorkspaceNodeData>[];
   edges: Edge[];
@@ -439,7 +449,23 @@ export function buildNodesAndEdges(
       //     child.left = 500 < parent.right = 800 → overlaps → kept
       //   legacy huge positive (position.x = 50000):
       //     child.left = 50000 >= parent.right → no overlap → rescued
-      const psize = parentSize.get(ws.parent_id!)!;
+      const initialPsize = parentSize.get(ws.parent_id!)!;
+      // Use the larger of (initial min, currently grown) for the bbox
+      // test. Without this, a child the user dragged into the grown
+      // area appears "outside" the (smaller) initial bbox and the
+      // rescue below false-fires on every periodic rehydrate, jumping
+      // the child to a stale grid slot. Live grown dims arrive via
+      // currentParentSizes from hydrate(); on first load (empty
+      // store), the map is empty and we fall back to the initial min
+      // — preserving the original rescue semantics for genuinely
+      // detached legacy data.
+      const liveParentSize = currentParentSizes.get(ws.parent_id!);
+      const psize = liveParentSize
+        ? {
+            width: Math.max(initialPsize.width, liveParentSize.width),
+            height: Math.max(initialPsize.height, liveParentSize.height),
+          }
+        : initialPsize;
       const myW = subtreeSize.get(ws.id)?.width ?? CHILD_DEFAULT_WIDTH;
       const myH = subtreeSize.get(ws.id)?.height ?? CHILD_DEFAULT_HEIGHT;
       const overlapsX =
diff --git a/canvas/src/store/canvas.ts b/canvas/src/store/canvas.ts
index 02f93b25..cbe2f9db 100644
--- a/canvas/src/store/canvas.ts
+++ b/canvas/src/store/canvas.ts
@@ -138,6 +138,16 @@ interface CanvasState {
   updateNodeData: (id: string, data: Partial<WorkspaceNodeData>) => void;
   restartWorkspace: (id: string) => Promise<void>;
   removeNode: (id: string) => void;
+  /** Remove a node AND every descendant in one atomic update. Mirrors
+   *  the server-side cascade — `DELETE /workspaces/:id?confirm=true`
+   *  drops the row plus every descendant in one transaction. The
+   *  caller (Canvas / DetailsTab delete handlers) used to call
+   *  `removeNode(rootId)` and rely on per-descendant WORKSPACE_REMOVED
+   *  WS events to clear the rest. When the WS is unhealthy those
+   *  events never arrive and the children orphan to the root until a
+   *  manual page refresh — `removeSubtree` makes the cascade
+   *  WS-independent. */
+  removeSubtree: (rootId: string) => void;
   setDragOverNode: (id: string | null) => void;
   nestNode: (draggedId: string, targetId: string | null) => Promise<void>;
   isDescendant: (ancestorId: string, nodeId: string) => boolean;
@@ -177,6 +187,15 @@ interface CanvasState {
   setPendingDelete: (
     v: { id: string; name: string; hasChildren: boolean; children: { id: string; name: string }[] } | null
   ) => void;
+  /** Node IDs whose DELETE request is in flight. Populated the moment
+   *  the user confirms a cascade delete; drained as WORKSPACE_REMOVED
+   *  events strip the nodes (or all-at-once on request failure). Lets
+   *  the canvas render the "don't touch — something is happening"
+   *  treatment (dim + non-draggable) during the network round trip
+   *  and the server-side cascade, matching the deploy-lock UX. */
+  deletingIds: Set<string>;
+  beginDelete: (ids: Iterable<string>) => void;
+  endDelete: (ids: Iterable<string>) => void;
   searchOpen: boolean;
   setSearchOpen: (open: boolean) => void;
   viewport: { x: number; y: number; zoom: number };
@@ -190,8 +209,8 @@ interface CanvasState {
   batchPause: () => Promise<void>;
   batchDelete: () => Promise<void>;
   /** Agent-pushed messages keyed by workspace ID. ChatTab consumes and clears these. */
-  agentMessages: Record<string, Array<{ id: string; content: string; timestamp: string }>>;
-  consumeAgentMessages: (workspaceId: string) => Array<{ id: string; content: string; timestamp: string }>;
+  agentMessages: Record<string, Array<{ id: string; content: string; timestamp: string; attachments?: Array<{ name: string; uri: string; mimeType?: string; size?: number }> }>>;
+  consumeAgentMessages: (workspaceId: string) => Array<{ id: string; content: string; timestamp: string; attachments?: Array<{ name: string; uri: string; mimeType?: string; size?: number }> }>;
   /** WebSocket connection status — drives the live indicator in the Toolbar. */
   wsStatus: "connected" | "connecting" | "disconnected";
   setWsStatus: (status: "connected" | "connecting" | "disconnected") => void;
@@ -309,6 +328,17 @@ export const useCanvasStore = create<CanvasState>((set, get) => ({
   closeContextMenu: () => set({ contextMenu: null }),
   pendingDelete: null,
   setPendingDelete: (v) => set({ pendingDelete: v }),
+  deletingIds: new Set<string>(),
+  beginDelete: (ids) => {
+    const next = new Set(get().deletingIds);
+    for (const id of ids) next.add(id);
+    set({ deletingIds: next });
+  },
+  endDelete: (ids) => {
+    const next = new Set(get().deletingIds);
+    for (const id of ids) next.delete(id);
+    set({ deletingIds: next });
+  },
   searchOpen: false,
   setSearchOpen: (open) => set({ searchOpen: open }),
   agentMessages: {},
@@ -775,9 +805,69 @@ export const useCanvasStore = create<CanvasState>((set, get) => ({
     });
   },
 
+  removeSubtree: (rootId) => {
+    const { nodes, edges, selectedNodeId } = get();
+    // Build a parentId → childIds index once so the descent is O(N),
+    // not O(N · depth). The store typically holds <500 nodes; even
+    // doing a linear scan per parent would be fine, but the index
+    // keeps the cost predictable as orgs grow.
+    const childrenByParent = new Map<string, string[]>();
+    for (const n of nodes) {
+      const p = n.data.parentId ?? null;
+      if (p === null) continue;
+      const arr = childrenByParent.get(p);
+      if (arr) arr.push(n.id);
+      else childrenByParent.set(p, [n.id]);
+    }
+    const removed = new Set<string>([rootId]);
+    const stack = [rootId];
+    while (stack.length) {
+      const cur = stack.pop()!;
+      const kids = childrenByParent.get(cur);
+      if (!kids) continue;
+      for (const k of kids) {
+        if (!removed.has(k)) {
+          removed.add(k);
+          stack.push(k);
+        }
+      }
+    }
+    set({
+      nodes: nodes.filter((n) => !removed.has(n.id)),
+      edges: edges.filter((e) => !removed.has(e.source) && !removed.has(e.target)),
+      selectedNodeId:
+        selectedNodeId !== null && removed.has(selectedNodeId)
+          ? null
+          : selectedNodeId,
+    });
+  },
+
   hydrate: (workspaces: WorkspaceData[]) => {
     const layoutOverrides = computeAutoLayout(workspaces);
-    const { nodes, edges } = buildNodesAndEdges(workspaces, layoutOverrides);
+    // Carry the live measured/grown parent sizes from the existing
+    // store into the rebuild. buildNodesAndEdges runs an auto-rescue
+    // pass on each child to detach orphans whose stored relative
+    // position falls outside the parent bbox — without the live
+    // size, the bbox is the initial grid-derived minimum, which
+    // false-flags any child the user has dragged into the
+    // user-grown area. Periodic rehydrate (socket.ts health check,
+    // 30s) was reasserting the rescue against legitimate user
+    // placements, causing the "child jumps to weird location, then
+    // settles" symptom.
+    const current = get().nodes;
+    const currentParentSizes = new Map<string, { width: number; height: number }>();
+    for (const n of current) {
+      const w = (n.measured?.width ?? n.width) as number | undefined;
+      const h = (n.measured?.height ?? n.height) as number | undefined;
+      if (typeof w === "number" && typeof h === "number") {
+        currentParentSizes.set(n.id, { width: w, height: h });
+      }
+    }
+    const { nodes, edges } = buildNodesAndEdges(
+      workspaces,
+      layoutOverrides,
+      currentParentSizes,
+    );
     set({ nodes, edges });
     for (const [nodeId, { x, y }] of layoutOverrides) {
       api.patch(`/workspaces/${nodeId}`, { x, y }).catch(() => {});
diff --git a/canvas/src/store/classNames.ts b/canvas/src/store/classNames.ts
new file mode 100644
index 00000000..80ccc769
--- /dev/null
+++ b/canvas/src/store/classNames.ts
@@ -0,0 +1,53 @@
+/**
+ * React Flow className helpers shared across the store and canvas
+ * hooks. React Flow's Node.className / Edge.className is a single
+ * space-separated string, so every call site was previously doing
+ * the same `.split/.filter/.join` dance — centralise it here so
+ * any future class manipulation follows one policy.
+ */
+
+/** Add `cls` to the existing className, de-duplicating. Returns
+ *  the (possibly new) string; undefined/empty input → just `cls`. */
+export function appendClass(existing: string | undefined, cls: string): string {
+  if (!existing) return cls;
+  const parts = existing.split(/\s+/).filter(Boolean);
+  if (parts.includes(cls)) return existing;
+  parts.push(cls);
+  return parts.join(" ");
+}
+
+/** Remove `cls` if present. Returns the (possibly empty) string. */
+export function removeClass(existing: string | undefined, cls: string): string {
+  if (!existing) return "";
+  return existing
+    .split(/\s+/)
+    .filter((c) => c && c !== cls)
+    .join(" ");
+}
+
+/** Schedule `removeClass(nodeId, cls)` on the `nodes` slice after
+ *  `delayMs`. The callers used to inline this twice — once for
+ *  parent-pulse cleanup, once for spawn-class cleanup — and now
+ *  share the same impl so future one-shot animation classes land
+ *  consistently.
+ *
+ *  No-ops when `window` is undefined (SSR). Accepts the store's
+ *  get/set pair directly rather than a store reference so it
+ *  composes with the existing handleCanvasEvent signature. */
+export function scheduleNodeClassRemoval(
+  nodeId: string,
+  cls: string,
+  delayMs: number,
+  get: () => { nodes: Array<{ id: string; className?: string }> },
+  set: (partial: Record<string, unknown>) => void,
+): void {
+  if (typeof window === "undefined") return;
+  window.setTimeout(() => {
+    const state = get();
+    set({
+      nodes: state.nodes.map((n) =>
+        n.id === nodeId ? { ...n, className: removeClass(n.className, cls) } : n,
+      ),
+    });
+  }, delayMs);
+}
diff --git a/canvas/src/store/socket.ts b/canvas/src/store/socket.ts
index 858fc875..fca3b8a6 100644
--- a/canvas/src/store/socket.ts
+++ b/canvas/src/store/socket.ts
@@ -12,30 +12,129 @@ export interface WSMessage {
   payload: Record<string, unknown>;
 }
 
+/** Window during which a freshly-completed rehydrate is reused
+ *  instead of firing a new GET. Picked to absorb the connect→health-
+ *  check sequence (rehydrate runs once on onopen, then the first
+ *  health-check tick fires immediately after — both should share the
+ *  same fetch) without holding back legitimately-spaced rehydrates
+ *  triggered by genuine WS silence later. */
+const REHYDRATE_DEDUP_WINDOW_MS = 1_500;
+
+/** Pure dedup gate for rehydrate(). Tracks two states:
+ *
+ *    - in-flight (between beginFetch and completeFetch): every
+ *      shouldSkip returns true.
+ *    - post-completion window (now < completedAt + windowMs):
+ *      shouldSkip returns true.
+ *
+ *  Extracted from ReconnectingSocket so the gate is unit-testable
+ *  without mocking dynamic imports or fake timers. The class itself
+ *  is stateful but tiny — instances are not shared across sockets. */
+export class RehydrateDedup {
+  private inFlight = false;
+  // -Infinity so the very first shouldSkip(now) call always passes
+  // (now - (-Infinity) > windowMs). Initializing to 0 would false-
+  // trip on test runs where now is also 0 (vi.useFakeTimers default
+  // clock) AND on real runs in the first 1.5s after epoch on
+  // clock-skewed systems.
+  private completedAt = Number.NEGATIVE_INFINITY;
+  constructor(private readonly windowMs: number) {}
+
+  shouldSkip(now: number): boolean {
+    if (this.inFlight) return true;
+    if (now - this.completedAt < this.windowMs) return true;
+    return false;
+  }
+
+  beginFetch(): void {
+    this.inFlight = true;
+  }
+
+  completeFetch(now: number = Date.now()): void {
+    this.inFlight = false;
+    this.completedAt = now;
+  }
+}
+
+/** Cadence for the HTTP fallback rehydrate that runs while the WS is
+ *  in connecting/disconnected limbo. 10s is short enough that the user
+ *  sees STARTING → ONLINE within one tick after the platform finishes
+ *  provisioning, but long enough to not pound /workspaces if the
+ *  network truly is down. The dedup gate inside rehydrate() collapses
+ *  this against the post-onopen rehydrate, so reconnect doesn't pay
+ *  for a duplicate fetch. */
+const FALLBACK_POLL_MS = 10_000;
+
 class ReconnectingSocket {
   private ws: WebSocket | null = null;
   private attempt = 0;
   private url: string;
   private lastEventTime = 0;
   private healthCheckTimer: ReturnType<typeof setInterval> | null = null;
+  private reconnectTimer: ReturnType<typeof setTimeout> | null = null;
+  // Polls /workspaces while the WS is unhealthy so the canvas reflects
+  // truth even when realtime events aren't arriving. Without this the
+  // store can stay frozen for minutes — e.g. workspaces transition
+  // STARTING → ONLINE on the platform but the canvas keeps showing
+  // STARTING until the WS finally reconnects, triggering false
+  // "Provisioning Timeout" banners on already-online workspaces.
+  private fallbackPollTimer: ReturnType<typeof setInterval> | null = null;
+  // disposed signals that disconnect() has been called. Any in-flight
+  // reconnect / handshake must abort early rather than attach to a
+  // socket the caller no longer owns — otherwise React StrictMode's
+  // effect double-invoke (and any future intentional disconnect)
+  // leaves a zombie WebSocket alive forever.
+  private disposed = false;
+  // In-flight singleton + dedup window for rehydrate. Two reasons to
+  // collapse rapid calls:
+  //   1. connect.onopen fires rehydrate immediately, and the very next
+  //      health-check tick may fire it again before the first GET
+  //      returns — wasted round trip + rebuild churn that resets the
+  //      mid-flight UI state (auto-rescue heuristics, grow passes).
+  //   2. Future call sites (a manual "Refresh" button, post-import
+  //      hydrate, error-recovery rehydrate) might pile up.
+  // Keeping rehydrate idempotent at the call-site level means each
+  // caller can fire-and-forget without coordinating.
+  private rehydrateInFlight: Promise<void> | null = null;
+  private rehydrateDedup = new RehydrateDedup(REHYDRATE_DEDUP_WINDOW_MS);
 
   constructor(url: string) {
     this.url = url;
   }
 
   connect() {
+    if (this.disposed) return;
     useCanvasStore.getState().setWsStatus("connecting");
-    this.ws = new WebSocket(this.url);
+    // Start the HTTP fallback poll up-front, not just on onclose. Two
+    // scenarios this guards against:
+    //   1. The very first connect attempt — onclose hasn't fired yet
+    //      because we never had a successful onopen.
+    //   2. A failed handshake where the browser takes tens of seconds
+    //      to surface as onclose (Chrome can hold a SYN-SENT WebSocket
+    //      open for ~75s before giving up).
+    // Idempotent — startFallbackPoll early-returns if a timer is
+    // already running, so calling it from both places is cheap.
+    this.startFallbackPoll();
+    const ws = new WebSocket(this.url);
+    this.ws = ws;
 
-    this.ws.onopen = () => {
+    ws.onopen = () => {
+      if (this.disposed || this.ws !== ws) {
+        // Late-open on an abandoned socket. Close it cleanly; the
+        // caller already moved on.
+        try { ws.close(); } catch { /* noop */ }
+        return;
+      }
       this.attempt = 0;
       this.lastEventTime = Date.now();
       useCanvasStore.getState().setWsStatus("connected");
+      this.stopFallbackPoll();
       this.rehydrate();
       this.startHealthCheck();
     };
 
-    this.ws.onmessage = (event) => {
+    ws.onmessage = (event) => {
+      if (this.disposed || this.ws !== ws) return;
       this.lastEventTime = Date.now();
       try {
         const msg: WSMessage = JSON.parse(event.data);
@@ -45,15 +144,21 @@ class ReconnectingSocket {
       }
     };
 
-    this.ws.onclose = () => {
+    ws.onclose = () => {
+      // Fired on intentional close (disposed) OR server/network drop.
+      // Only schedule a reconnect when the socket is still live AND
+      // corresponds to the WS we just tore down (prevents a stale
+      // onclose from a zombie socket from re-arming the loop).
+      if (this.disposed || this.ws !== ws) return;
       this.stopHealthCheck();
       useCanvasStore.getState().setWsStatus("connecting");
+      this.startFallbackPoll();
       const delay = Math.min(1000 * 2 ** this.attempt, 30000);
       this.attempt++;
-      setTimeout(() => this.connect(), delay);
+      this.reconnectTimer = setTimeout(() => this.connect(), delay);
     };
 
-    this.ws.onerror = () => {
+    ws.onerror = () => {
       // Suppressed — onclose handles reconnection. onerror fires before onclose
       // and the Event object doesn't contain useful info (serializes to {}).
     };
@@ -80,20 +185,78 @@ class ReconnectingSocket {
     }
   }
 
-  private async rehydrate() {
-    try {
-      const { api } = await import("@/lib/api");
-      const workspaces = await api.get<WorkspaceData[]>("/workspaces");
-      useCanvasStore.getState().hydrate(workspaces);
-    } catch {
-      // Rehydration failed — will retry on next health check cycle
+  /** While the WS is in connecting/disconnected limbo, poll /workspaces
+   *  so the store stays fresh. The reconnect attempts continue in
+   *  parallel; whichever recovers first wins. rehydrate()'s own dedup
+   *  gate prevents this from racing with the open-time rehydrate. */
+  private startFallbackPoll() {
+    if (this.fallbackPollTimer) return;
+    this.fallbackPollTimer = setInterval(() => {
+      if (this.disposed) {
+        this.stopFallbackPoll();
+        return;
+      }
+      void this.rehydrate();
+    }, FALLBACK_POLL_MS);
+  }
+
+  private stopFallbackPoll() {
+    if (this.fallbackPollTimer) {
+      clearInterval(this.fallbackPollTimer);
+      this.fallbackPollTimer = null;
     }
   }
 
+  private rehydrate(): Promise<void> {
+    // Reuse an in-flight fetch — a second caller during the GET
+    // shouldn't kick off a parallel one.
+    if (this.rehydrateInFlight) return this.rehydrateInFlight;
+    if (this.rehydrateDedup.shouldSkip(Date.now())) {
+      return Promise.resolve();
+    }
+
+    // beginFetch lives INSIDE the IIFE's try so any future code added
+    // between gate-check and IIFE-construction can't throw and leave
+    // the gate stuck at inFlight=true forever. Today there's nothing
+    // that can throw here, but the cost of being defensive is one
+    // extra microtask of "in flight" status — negligible.
+    const promise = (async () => {
+      this.rehydrateDedup.beginFetch();
+      try {
+        const { api } = await import("@/lib/api");
+        const workspaces = await api.get<WorkspaceData[]>("/workspaces");
+        if (this.disposed) return;
+        useCanvasStore.getState().hydrate(workspaces);
+      } catch {
+        // Rehydration failed — will retry on next health check cycle.
+      } finally {
+        this.rehydrateDedup.completeFetch(Date.now());
+        this.rehydrateInFlight = null;
+      }
+    })();
+    this.rehydrateInFlight = promise;
+    return promise;
+  }
+
   disconnect() {
+    this.disposed = true;
     this.stopHealthCheck();
+    this.stopFallbackPoll();
+    if (this.reconnectTimer) {
+      clearTimeout(this.reconnectTimer);
+      this.reconnectTimer = null;
+    }
     if (this.ws) {
-      this.ws.close();
+      // Detach listeners before close() so we don't route the close
+      // event through our onclose → scheduleReconnect path. Belt +
+      // braces on top of the `disposed` check, because StrictMode
+      // cycles through so fast that an attached onclose can fire
+      // after disposed=true is set but before this assignment runs.
+      this.ws.onopen = null;
+      this.ws.onmessage = null;
+      this.ws.onclose = null;
+      this.ws.onerror = null;
+      try { this.ws.close(); } catch { /* noop */ }
       this.ws = null;
     }
     useCanvasStore.getState().setWsStatus("disconnected");
diff --git a/canvas/src/styles/org-deploy.css b/canvas/src/styles/org-deploy.css
new file mode 100644
index 00000000..15aeecfd
--- /dev/null
+++ b/canvas/src/styles/org-deploy.css
@@ -0,0 +1,151 @@
+/**
+ * Org-deploy animation module.
+ *
+ * Loaded globally (see app/globals.css). All values come from
+ * theme-tokens.css so a theme swap needs zero edits here.
+ *
+ * Component contract — canvas/src/components/canvas code adds
+ * these classes to the React Flow node / edge wrappers:
+ *
+ *   .mol-deploy-spawn             One-shot entry animation on a
+ *                                 node that just arrived. Applied
+ *                                 by canvas-events.ts for 600 ms
+ *                                 then removed.
+ *   .mol-deploy-shimmer           Persistent border shimmer while
+ *                                 a node's status === "provisioning".
+ *                                 Removed when status flips to
+ *                                 "online" / "failed".
+ *   .mol-deploy-parent-pulse      One-shot acknowledgement pulse
+ *                                 on the parent when a child lands.
+ *                                 Applied for parent-pulse duration
+ *                                 then removed.
+ *   .mol-deploy-locked            Applied to every non-root node
+ *                                 inside a deploying org so it dims
+ *                                 and the cursor signals un-
+ *                                 draggable.
+ *   .mol-deploy-root-complete     One-shot pop + glow on the root
+ *                                 when the last child comes online.
+ *
+ * Edges use React Flow edge data to pick styling — see the
+ * selectors below the node keyframes.
+ *
+ * Reduced motion is handled at the bottom via the same guard
+ * globals.css already installs for other animations.
+ */
+
+/* ────────────────────────────────────────────────────────
+   Keyframes — kept terse; values come from variables so
+   duplication across themes is nil.
+   ──────────────────────────────────────────────────────── */
+
+@keyframes mol-deploy-spawn {
+  /* Gentle fade-in-place. The earlier "spring from parent" motion
+     collided with the server-computed grid positions (parent and
+     child used different coord origins once the parent was placed
+     on the client's grid instead of the template's absolute
+     coords), which landed children in wrong slots. Keeping the
+     animation to a simple opacity+scale lets the server's layout
+     win — and reads as "node arrived" without the over-engineered
+     spring. */
+  from { opacity: 0; transform: scale(0.85); }
+  to   { opacity: 1; transform: scale(1);    }
+}
+
+/* mol-deploy-parent-pulse keyframe removed with the effect — the
+   box-shadow expanding ring made the parent card visibly "flash" on
+   every child arrival when the grow pass also bumped width/height.
+   Kept as a deliberate non-class so the theme-tokens vars can drop
+   with it on the next theme pass. */
+
+@keyframes mol-deploy-root-complete {
+  0%   { transform: scale(1);    box-shadow: 0 0 0 0 transparent;   }
+  40%  { transform: scale(var(--mol-deploy-root-scale-peak)); box-shadow: var(--mol-deploy-root-glow); }
+  100% { transform: scale(1);    box-shadow: 0 0 0 0 transparent;   }
+}
+
+/* (mol-deploy-edge-draw keyframe removed with the edge effects.) */
+
+@keyframes mol-deploy-cancel-pulse {
+  0%, 100% { box-shadow: 0 0 0 0   var(--mol-deploy-cancel-ring); }
+  50%      { box-shadow: 0 0 0 10px transparent;                   }
+}
+
+/* ────────────────────────────────────────────────────────
+   Node classes
+   ──────────────────────────────────────────────────────── */
+
+/* Qualify with .react-flow__node so this rule beats the default
+   `node-appear` animation defined later in globals.css. Without
+   the qualifier, CSS source-order wins and the standard
+   node-appear overrides our scale/opacity keyframe, visually
+   dropping the "spawn from parent" motion. */
+.react-flow__node.mol-deploy-spawn {
+  animation:
+    mol-deploy-spawn var(--mol-duration-spawn) var(--mol-easing-bounce-out) both;
+}
+
+/* Provisioning signal — the earlier rotating conic-gradient border
+   read as distracting "spinner" clutter during a 15-child org
+   import (dozens of them spinning simultaneously). A static dim
+   (reduced opacity + saturation) communicates "this one is still
+   coming online" without the motion noise. The locked-child style
+   already uses the same pattern — we reuse the filter values so
+   a provisioning ROOT node and a locked CHILD look consistent. */
+.mol-deploy-shimmer {
+  filter: saturate(var(--mol-deploy-locked-saturation)) opacity(var(--mol-deploy-locked-opacity));
+  transition: filter var(--mol-duration-base) var(--mol-easing-standard);
+}
+
+.mol-deploy-locked {
+  filter: saturate(var(--mol-deploy-locked-saturation)) opacity(var(--mol-deploy-locked-opacity));
+  cursor: not-allowed !important;
+  transition: filter var(--mol-duration-base) var(--mol-easing-standard);
+}
+
+.react-flow__node.mol-deploy-root-complete {
+  animation: mol-deploy-root-complete var(--mol-duration-root-complete) var(--mol-easing-emphasize) both;
+}
+
+/* ────────────────────────────────────────────────────────
+   Edge classes — intentionally inert.
+
+   Earlier revisions painted incoming edges with a dashed-blueprint
+   → animated-laser-trace effect as the child landed. User feedback
+   on the first demo was "remove connection line effects" — the
+   moving dashes read as noise during a multi-child deploy. Keeping
+   the class hooks so canvas-events.ts event handlers can still
+   apply/strip them without blowing up, but the styling is a no-op
+   (edges fall through to the default styling in globals.css).
+   If a future demo wants the effect back, wire the rules below.
+   ──────────────────────────────────────────────────────── */
+
+/* ────────────────────────────────────────────────────────
+   Cancel-deployment pill — rendered by OrgCancelButton.tsx
+   attached to the root node during deploy. Class `.mol-deploy-cancel`
+   is always applied; the pulse is additive.
+   ──────────────────────────────────────────────────────── */
+.mol-deploy-cancel {
+  background: var(--mol-deploy-cancel-bg);
+  color: var(--mol-deploy-cancel-text);
+  transition: background var(--mol-duration-fast) var(--mol-easing-standard);
+}
+.mol-deploy-cancel:hover {
+  background: var(--mol-deploy-cancel-bg-hover);
+}
+.mol-deploy-cancel-pulse {
+  animation: mol-deploy-cancel-pulse var(--mol-duration-parent-pulse) var(--mol-easing-standard) infinite;
+}
+
+/* ────────────────────────────────────────────────────────
+   Reduced-motion guard — mirror globals.css's policy so this
+   module stays WCAG 2.3.3 compliant without relying on the
+   global file being loaded first.
+   ──────────────────────────────────────────────────────── */
+@media (prefers-reduced-motion: reduce) {
+  .react-flow__node.mol-deploy-spawn,
+  .react-flow__node.mol-deploy-root-complete,
+  .mol-deploy-cancel-pulse {
+    animation: none !important;
+  }
+  /* Dim-light signal is already static; no override needed. */
+}
diff --git a/canvas/src/styles/theme-tokens.css b/canvas/src/styles/theme-tokens.css
new file mode 100644
index 00000000..be47cbe4
--- /dev/null
+++ b/canvas/src/styles/theme-tokens.css
@@ -0,0 +1,69 @@
+/**
+ * Canvas theme tokens — single source of truth for colors, durations,
+ * easings, and sizes used by every animated / stateful canvas
+ * component. Importable from any stylesheet; individual feature
+ * modules (org-deploy.css, settings-panel.css, ...) only reference
+ * variables defined here so a future theme swap touches this one
+ * file.
+ *
+ * Adding a theme:
+ *   Put a scoped override block like `[data-theme="light"] { ... }`
+ *   and set only the tokens whose values differ from the default
+ *   dark theme. Unset tokens inherit the default.
+ *
+ * Naming convention:
+ *   --mol-<feature>-<semantic-role>       → values the user sees
+ *   --mol-duration-<name>                 → motion timings
+ *   --mol-easing-<name>                   → motion curves
+ * Prefix `mol-` avoids collisions with Tailwind / React Flow vars.
+ */
+
+:root {
+  /* ────────────────────────────────────────────────────────
+     Motion primitives — pick one of these; don't hardcode ms
+     values in feature stylesheets. If a new feature genuinely
+     needs a bespoke duration, add a token here and reference it.
+     ──────────────────────────────────────────────────────── */
+  --mol-duration-fast: 150ms;
+  --mol-duration-base: 300ms;
+  --mol-duration-spawn: 350ms;
+  --mol-duration-root-complete: 700ms;
+  --mol-duration-fit-view: 800ms;
+
+  --mol-easing-standard: cubic-bezier(0.2, 0, 0, 1);
+  --mol-easing-bounce-out: cubic-bezier(0.2, 0.8, 0.2, 1.05);
+  --mol-easing-emphasize: cubic-bezier(0.3, 0, 0, 1);
+
+  /* ────────────────────────────────────────────────────────
+     Org-deploy animation palette (dark theme defaults)
+     ──────────────────────────────────────────────────────── */
+
+  /* Root-complete moment — one-shot glow when the last child lands. */
+  --mol-deploy-root-glow: 0 0 36px 6px rgba(59, 130, 246, 0.55);
+  --mol-deploy-root-scale-peak: 1.05;
+
+  /* Locked-child visual — non-root nodes during deploy cannot be
+     dragged; this dims them so the user's attention stays on the
+     active spawn. Saturation + opacity instead of a badge keeps
+     the card recognisable while signalling "not available". */
+  --mol-deploy-locked-saturation: 0.55;
+  --mol-deploy-locked-opacity: 0.78;
+
+  /* Cancel-deployment pill attached to the root node. Red, pulsing,
+     one button that kills the whole tree. */
+  --mol-deploy-cancel-bg: rgba(220, 38, 38, 0.92);     /* red-600/92 */
+  --mol-deploy-cancel-bg-hover: rgba(239, 68, 68, 1);  /* red-500 */
+  --mol-deploy-cancel-ring: rgba(239, 68, 68, 0.45);
+  --mol-deploy-cancel-text: #fff;
+}
+
+/* Example template for a future light theme. Intentionally empty
+   — product hasn't shipped a light theme yet but this shows the
+   override surface any future theme must fill. Uncomment + tune
+   when the light theme lands.
+[data-theme="light"] {
+  --mol-deploy-shimmer-from: rgba(37, 99, 235, 0.08);
+  --mol-deploy-shimmer-to:   rgba(37, 99, 235, 0.9);
+  ...
+}
+*/
diff --git a/docker-compose.yml b/docker-compose.yml
index c9c88d7c..2be0d3f6 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -126,6 +126,13 @@ services:
       REDIS_URL: redis://redis:6379
       PORT: "${PLATFORM_PORT:-8080}"
       PLATFORM_URL: "http://platform:${PLATFORM_PORT:-8080}"
+      # Default MOLECULE_ENV=development so the WorkspaceAuth / AdminAuth
+      # middleware fail-open path activates when ADMIN_TOKEN is unset —
+      # otherwise the canvas (which runs without a bearer in pure local
+      # dev) gets 401 "missing workspace auth token" on every request.
+      # Override to "production" for SaaS/staged deploys; in those modes
+      # ADMIN_TOKEN must also be set or every request rejects.
+      MOLECULE_ENV: "${MOLECULE_ENV:-development}"
       CORS_ORIGINS: ${CORS_ORIGINS:-http://localhost:${CANVAS_PUBLISH_PORT:-3000},http://127.0.0.1:${CANVAS_PUBLISH_PORT:-3000},http://localhost:3001}
       RATE_LIMIT: "${RATE_LIMIT:-1000}"
       CONFIGS_DIR: /configs
@@ -153,6 +160,24 @@ services:
       HIBERNATION_IDLE_MINUTES: "${HIBERNATION_IDLE_MINUTES:-}"
       # Plugin supply chain hardening (issue #768 / PR #775). Never set in production.
       PLUGIN_ALLOW_UNPINNED: "${PLUGIN_ALLOW_UNPINNED:-}"
+      # Force ImagePull/ContainerCreate to request linux/amd64 manifests
+      # for the workspace-template-* images. The templates ship single-arch
+      # amd64 today; without this override, an arm64 host (Apple Silicon)
+      # asks the daemon for linux/arm64/v8, which doesn't match the manifest
+      # and the pull fails with "no matching manifest". Apple Silicon will
+      # run the amd64 image under Rosetta — slower (~2-3×) but functional.
+      # Override to "" or another platform when the templates start shipping
+      # multi-arch (then this hardcoded amd64 becomes unnecessary).
+      MOLECULE_IMAGE_PLATFORM: "${MOLECULE_IMAGE_PLATFORM:-linux/amd64}"
+      # GHCR auth for the workspace-images refresh endpoint
+      # (POST /admin/workspace-images/refresh). When set, the platform's
+      # Docker SDK ImagePull on private workspace-template-* images
+      # succeeds without per-host `docker login`. GHCR_USER is the GitHub
+      # username; GHCR_TOKEN is a fine-grained PAT with `read:packages`
+      # on the Molecule-AI org. Both unset → endpoint can only pull
+      # public images (current state for all 8 templates).
+      GHCR_USER: "${GHCR_USER:-}"
+      GHCR_TOKEN: "${GHCR_TOKEN:-}"
     volumes:
       - ./workspace-configs-templates:/configs
       - ./org-templates:/org-templates:ro
diff --git a/docs/architecture/database-schema.md b/docs/architecture/database-schema.md
index 9e79489e..92542bed 100644
--- a/docs/architecture/database-schema.md
+++ b/docs/architecture/database-schema.md
@@ -77,7 +77,7 @@ CREATE TABLE workspace_secrets (
 );
 ```
 
-Stores API keys, credentials, and other secrets needed by workspace agents. Values are encrypted with AES-256 at the application layer. The encryption key comes from the `SECRETS_ENCRYPTION_KEY` environment variable on the platform — never stored in the database.
+Stores API keys, credentials, and other secrets needed by workspace agents. Values are encrypted with AES-256-GCM at the application layer. The encryption key comes from the tenant's `SECRETS_ENCRYPTION_KEY` environment variable, provisioned at tenant boot by the control plane (which holds the master key in AWS KMS — see [secrets-key-custody.md](./secrets-key-custody.md)). The key is never stored in the database.
 
 The provisioner reads secrets from this table, decrypts them, and injects them as environment variables when spinning up workspace containers. Secrets are never included in bundles (see [Constraints — Rule 5](../development/constraints-and-rules.md)).
 
diff --git a/docs/architecture/molecule-technical-doc.md b/docs/architecture/molecule-technical-doc.md
index dd38d4a6..77f2117a 100644
--- a/docs/architecture/molecule-technical-doc.md
+++ b/docs/architecture/molecule-technical-doc.md
@@ -902,7 +902,7 @@ Postgres + Redis + Langfuse only (for local development without containerized wo
 | `REDIS_URL` | `redis://localhost:6379` | Redis connection |
 | `PORT` | `8080` | Platform listen port |
 | `PLATFORM_URL` | `http://host.docker.internal:8080` | Injected to workspace containers |
-| `SECRETS_ENCRYPTION_KEY` | Optional | AES-256 key (32 bytes) for secret encryption |
+| `SECRETS_ENCRYPTION_KEY` | Optional | AES-256-GCM key (32 bytes) for tenant secret encryption. Provisioned at tenant boot by the control plane, which holds the master key in AWS KMS — see [secrets-key-custody.md](./secrets-key-custody.md). |
 | `CONFIGS_DIR` | `/configs` | Workspace config template directory |
 | `PLUGINS_DIR` | `/plugins` | Shared plugin directory |
 | `ACTIVITY_RETENTION_DAYS` | `7` | Activity log retention |
diff --git a/docs/architecture/secrets-key-custody.md b/docs/architecture/secrets-key-custody.md
new file mode 100644
index 00000000..75e9f9c4
--- /dev/null
+++ b/docs/architecture/secrets-key-custody.md
@@ -0,0 +1,85 @@
+# Secrets Key Custody
+
+How the encryption keys that protect Molecule workspace secrets are managed, where each key lives, and what an attacker who compromises one layer can or cannot read.
+
+This document exists because the platform repo (`workspace-server`) reads `SECRETS_ENCRYPTION_KEY` from its process env, which on its own looks like "encryption-at-rest theater." The full custody chain runs through the control plane (`molecule-controlplane`) where AWS KMS holds the key material at rest. Anyone reading only the platform repo sees half the picture.
+
+## Two modes
+
+The control plane's `internal/crypto.Envelope` ships in two modes, picked at boot from env:
+
+| Mode | Trigger | At-rest format | Recommended for |
+|------|---------|----------------|-----------------|
+| **KMS envelope** | `KMS_KEY_ARN` set | Per-blob KMS-wrapped DEK + AES-256-GCM ciphertext | Production, multi-tenant SaaS |
+| **Static key** | Only `SECRETS_ENCRYPTION_KEY` set | AES-256-GCM with one process-wide key | Dev, self-hosted single-tenant |
+
+`Envelope.Decrypt` is dual-mode — it can read either format on the way out, so a deployment can flip from static-key to KMS envelope without re-encrypting historical rows. Code: `molecule-controlplane/internal/crypto/kms.go`.
+
+## KMS envelope flow
+
+When `KMS_KEY_ARN` is configured, every secret write looks like:
+
+1. CP calls `kms.GenerateDataKey(KeyId=KMS_KEY_ARN, KeySpec=AES_256)` → returns `{Plaintext, CiphertextBlob}`.
+2. CP encrypts the secret with AES-256-GCM using `Plaintext` as the key.
+3. CP discards `Plaintext` from memory; persists the blob:
+
+   ```
+   [0x02 prefix][uint16 BE: encrypted_dek_len][encrypted_dek][nonce(12)][ct+tag]
+   ```
+
+   The `0x02` byte distinguishes v2 (KMS-wrapped) blobs from legacy static-key blobs.
+
+4. To read: CP calls `kms.Decrypt(CiphertextBlob)` → recovers the AES key → unwraps the GCM ciphertext.
+
+KMS calls cost ~$0.03 per 10k requests. We do not cache DEKs — provisioning rate is orders below steady-state reads, and not caching keeps key rotation reasoning simple.
+
+## What lives where
+
+| Layer | Key custody | Plaintext key in memory? |
+|-------|-------------|--------------------------|
+| AWS KMS | KMS-resident, never leaves the HSM | No (hardware) |
+| `molecule-controlplane` process | KMS client + IAM role | Briefly per-secret-op only |
+| CP database (`database_url_encrypted`, tenant secrets) | KMS-wrapped blobs | Never |
+| Per-tenant `workspace-server` env (`SECRETS_ENCRYPTION_KEY`) | Provisioned at tenant boot by CP | Yes, for the tenant's process lifetime |
+| Tenant Postgres (`workspace_secrets.value`) | AES-256-GCM with the tenant's key | Never |
+
+The "plaintext in tenant memory" row is the standard envelope-encryption trade-off: a DEK has to be unwrapped somewhere to be used. The blast radius of compromising one tenant's process is one tenant's secrets — not the whole fleet.
+
+## Threat model
+
+| Attacker capability | Can they read tenant secrets? |
+|---------------------|-------------------------------|
+| Reads CP database backup | No — KMS unwrap requires IAM-scoped `kms:Decrypt` |
+| Steals `KMS_KEY_ARN` value | No — ARN alone does nothing without IAM access |
+| Compromises CP IAM role | Yes — can `kms:Decrypt` any wrapped DEK |
+| Reads tenant Postgres (one tenant) | No — `SECRETS_ENCRYPTION_KEY` lives only in the tenant's own EC2 process env, not in DB |
+| Compromises one tenant's EC2 | Yes for that tenant's secrets, no for any other tenant |
+| Compromises CP host | Game over (CP can provision arbitrary tenants) |
+
+The two boundaries the design protects:
+
+- **DB-only compromise (incl. backups)** → secrets remain encrypted; attacker needs separate access to either KMS (prod) or CP env (dev).
+- **One-tenant compromise** → blast radius limited to that tenant; no cross-tenant key reuse.
+
+## Rotation
+
+- **Tenant key rotation** (per-tenant `SECRETS_ENCRYPTION_KEY`): re-encrypt the tenant's `workspace_secrets` rows under a new key, then swap the env var. Static-key mode requires this for all rotation; KMS mode only requires it on suspected key compromise.
+- **KMS CMK rotation**: AWS KMS handles annual automatic rotation of the customer master key. Re-wrapping data keys is unnecessary because each `Decrypt` call routes through the current CMK version automatically (KMS keeps prior versions for decrypt-only).
+
+## Audit / compliance posture
+
+For SOC2 / ISO 27001 / customer security questionnaires:
+
+- **Key custody**: AWS KMS (FIPS 140-2 Level 3 HSM-backed)
+- **Key isolation**: per-tenant DEK; no shared keys across tenants
+- **Access control**: IAM-scoped `kms:Decrypt`, audited via CloudTrail
+- **At-rest encryption**: AES-256-GCM (NIST-approved, authenticated)
+- **In-transit encryption**: TLS 1.2+ for KMS, CP-to-tenant, tenant-to-DB
+- **Rotation**: AWS-managed CMK rotation annually; manual DEK rotation on incident
+
+## Pointers
+
+- KMS envelope code: [`molecule-controlplane/internal/crypto/kms.go`](https://github.com/Molecule-AI/molecule-controlplane/blob/main/internal/crypto/kms.go)
+- Static-key fallback: [`molecule-controlplane/internal/crypto/aes.go`](https://github.com/Molecule-AI/molecule-controlplane/blob/main/internal/crypto/aes.go)
+- Tenant secrets handler: [`workspace-server/internal/crypto/aes.go`](../../workspace-server/internal/crypto/aes.go)
+- Tenant secrets schema: [database-schema.md](./database-schema.md#workspace_secrets)
diff --git a/docs/development/constraints-and-rules.md b/docs/development/constraints-and-rules.md
index 1dbac461..4c2ffc71 100644
--- a/docs/development/constraints-and-rules.md
+++ b/docs/development/constraints-and-rules.md
@@ -56,7 +56,7 @@ Direct A2A calls between workspaces are unauthenticated in MVP. Access control i
 
 ## 11. Secrets in Postgres, Encrypted
 
-Workspace secrets (API keys, credentials) are stored in Postgres with AES-256 encryption at the application layer. The encryption key comes from the `SECRETS_ENCRYPTION_KEY` environment variable. Secrets are never included in bundles, never logged, never exposed via API responses.
+Workspace secrets (API keys, credentials) are stored in Postgres with AES-256-GCM encryption at the application layer. The tenant's `SECRETS_ENCRYPTION_KEY` is provisioned at boot by the control plane, which holds the master key material in AWS KMS (envelope encryption, dual-mode with a static-key fallback for dev). Full custody chain in [secrets-key-custody.md](../architecture/secrets-key-custody.md). Secrets are never included in bundles, never logged, never exposed via API responses.
 
 ## 12. Last-Write-Wins for MVP
 
diff --git a/docs/workspace-runtime-package.md b/docs/workspace-runtime-package.md
index a24ce42b..aed86368 100644
--- a/docs/workspace-runtime-package.md
+++ b/docs/workspace-runtime-package.md
@@ -2,29 +2,67 @@
 
 ## Overview
 
-The shared workspace runtime infrastructure lives in two places:
+The shared workspace runtime infrastructure has **one editable source** and
+**one published artifact**:
 
-1. **Source of truth (monorepo):** `workspace/` — this is where all development happens
-2. **Published package:** [`molecule-ai-workspace-runtime`](https://pypi.org/project/molecule-ai-workspace-runtime/) on PyPI
+1. **Source of truth (monorepo, editable):** `workspace/` — every runtime
+   change lands here. Edit it like any other monorepo code.
+2. **Published artifact (PyPI, generated):** [`molecule-ai-workspace-runtime`](https://pypi.org/project/molecule-ai-workspace-runtime/)
+   — produced by `.github/workflows/publish-runtime.yml` on every
+   `runtime-vX.Y.Z` tag push. Do NOT edit this independently — it gets
+   overwritten on every publish.
+
+The legacy sibling repo `molecule-ai-workspace-runtime` (the GitHub repo, as
+distinct from the PyPI package) is no longer the source-of-truth and should
+be treated as a publish artifact only. It can be archived or used as a
+read-only mirror.
+
+## Why this shape
+
+The 8 workspace template repos (claude-code, langgraph, hermes, etc.) each
+build their own Docker image and `pip install molecule-ai-workspace-runtime`
+from PyPI. PyPI is the right distribution channel — semver, reproducible
+builds, no submodule dance per-repo. But the runtime ALSO needs to evolve
+in lock-step with the platform's wire protocol (queue shape, A2A metadata,
+event payloads). Shipping cross-cutting protocol changes as separate
+runtime + platform PRs in two repos creates ordering pain and broken
+intermediate states.
+
+The monorepo + auto-publish split gives both: edit cross-cutting changes
+in one PR, publish the runtime artifact via a tag.
 
 ## What's in the package
 
-Everything in `workspace/` except adapter-specific code:
+Everything in `workspace/*.py` plus the `adapters/`, `builtin_tools/`,
+`plugins_registry/`, `policies/`, `skill_loader/` subpackages. Build
+artifacts (`Dockerfile`, `*.sh`, `pytest.ini`, `requirements.txt`) are
+excluded.
 
-- `molecule_runtime/` — all shared `.py` files (main.py, config.py, heartbeat.py, etc.)
-- `molecule_runtime/adapters/` — `BaseAdapter`, `AdapterConfig`, `SetupResult`, `shared_runtime`
-- `molecule_runtime/builtin_tools/` — delegation, memory, approvals, sandbox, telemetry
-- `molecule_runtime/skill_loader/` — skill loading + hot-reload
-- `molecule_runtime/plugins_registry/` — plugin discovery and install pipeline
-- `molecule_runtime/policies/` — namespace routing policies
-- Console script: `molecule-runtime` → `molecule_runtime.main:main_sync`
+The build script rewrites bare imports so the published package is a
+proper Python namespace:
+
+```
+# In monorepo workspace/:
+from a2a_client import discover_peer
+from builtin_tools.memory import store
+
+# In published molecule_runtime/ (auto-rewritten at publish time):
+from molecule_runtime.a2a_client import discover_peer
+from molecule_runtime.builtin_tools.memory import store
+```
+
+The closed allowlist of rewritten module names lives in
+`scripts/build_runtime_package.py` (`TOP_LEVEL_MODULES` + `SUBPACKAGES`).
+Add a new top-level module to workspace/? Add it to the allowlist in the
+same PR.
 
 ## Adapter repos
 
-Each of the 8 adapter repos now contains:
+Each of the 8 adapter template repos contains:
 - `adapter.py` — runtime-specific `Adapter` class
-- `requirements.txt` — `molecule-ai-workspace-runtime>=0.1.0` + adapter deps
-- `Dockerfile` — standalone image (no longer extends workspace-template:base)
+- `requirements.txt` — `molecule-ai-workspace-runtime>=0.1.X` + adapter deps
+- `Dockerfile` — standalone image with `ENV ADAPTER_MODULE=adapter` and
+  `ENTRYPOINT ["molecule-runtime"]`
 
 | Adapter | Repo |
 |---------|------|
@@ -39,8 +77,8 @@ Each of the 8 adapter repos now contains:
 
 ## Adapter discovery (ADAPTER_MODULE)
 
-Standalone adapter repos set `ENV ADAPTER_MODULE=adapter` in their Dockerfile.
-The runtime's `get_adapter()` checks this env var first:
+Standalone adapter repos set `ENV ADAPTER_MODULE=adapter` in their
+Dockerfile. The runtime's `get_adapter()` checks this env var first:
 
 ```python
 # In molecule_runtime/adapters/__init__.py
@@ -49,25 +87,104 @@ def get_adapter(runtime: str) -> type[BaseAdapter]:
     if adapter_module:
         mod = importlib.import_module(adapter_module)
         return getattr(mod, "Adapter")
-    # Fall back to built-in subdirectory scan (monorepo local dev)
-    ...
+    raise KeyError(...)
 ```
 
 ## Publishing a new version
 
 ```bash
-cd workspace-template
-# 1. Bump version in pyproject.toml
-# 2. Sync to molecule-ai-workspace-runtime repo
-# 3. Tag and push — CI publishes to PyPI via PYPI_TOKEN secret
+# From any local checkout of monorepo, after merging your runtime change:
+git tag runtime-v0.1.6
+git push origin runtime-v0.1.6
 ```
 
-Or manually:
-```bash
-cd workspace-template
-python -m build
-python -m twine upload dist/*
+The `publish-runtime` workflow takes over — checks out the tag, runs
+`scripts/build_runtime_package.py --version 0.1.6`, builds wheel + sdist,
+runs a smoke import to catch broken rewrites, and uploads to PyPI via
+the `PYPI_TOKEN` repo secret.
+
+For dev/test releases without tagging, dispatch the workflow manually
+with an explicit version (e.g. `0.1.6.dev1` — PEP 440 dev/rc/post forms
+are accepted).
+
+After publish, the 8 template repos pick up the new version on their
+next `:latest` rebuild. To force-pull immediately, bump the pin in each
+template's `requirements.txt`.
+
+## End-to-end CD chain
+
+The full chain from monorepo merge → workspace containers running new code:
+
 ```
+1. Merge PR with workspace/ changes to main
+   ↓
+2. .github/workflows/auto-tag-runtime.yml fires
+   ↓ reads PR labels (release:major/minor) or defaults to patch
+   ↓ pushes runtime-vX.Y.Z tag
+   ↓
+3. .github/workflows/publish-runtime.yml fires (on the tag)
+   ↓ builds wheel via scripts/build_runtime_package.py
+   ↓ smoke-imports the wheel
+   ↓ uploads to PyPI
+   ↓ cascade job fires repository_dispatch (event-type: runtime-published)
+   ↓ to all 8 workspace-template-* repos
+   ↓
+4. Each template's publish-image.yml fires (on repository_dispatch)
+   ↓ rebuilds Dockerfile (which pip-installs the new PyPI version)
+   ↓ pushes ghcr.io/molecule-ai/workspace-template-<runtime>:latest
+   ↓
+5. Production hosts run scripts/refresh-workspace-images.sh
+   OR an operator hits POST /admin/workspace-images/refresh on the platform
+   ↓ docker pull all 8 :latest tags
+   ↓ remove + force-recreate any running ws-* containers using a refreshed image
+   ↓ canvas re-provisions the workspaces on next interaction
+```
+
+Steps 1-4 are fully automated. Step 5 is one-click: a single curl or shell
+command. SaaS deployments typically wire step 5 into their normal deploy
+pipeline (every release pulls fresh images on every host); local dev fires
+it manually after a runtime release lands.
+
+### Required secrets
+
+| Secret | Where | Why |
+|---|---|---|
+| `PYPI_TOKEN` | molecule-core repo | Twine upload auth (PyPI) |
+| `TEMPLATE_DISPATCH_TOKEN` | molecule-core repo | Fine-grained PAT with `actions:write` on the 8 template repos. Without it the `cascade` job warns and exits clean — PyPI still publishes; templates just don't auto-rebuild. |
+
+### Step 5 specifics
+
+**Local dev (compose stack):**
+```bash
+bash scripts/refresh-workspace-images.sh                  # all runtimes
+bash scripts/refresh-workspace-images.sh --runtime claude-code
+bash scripts/refresh-workspace-images.sh --no-recreate    # pull only, leave containers
+```
+
+**Via platform admin endpoint (any deploy):**
+```bash
+curl -X POST "$PLATFORM/admin/workspace-images/refresh"
+curl -X POST "$PLATFORM/admin/workspace-images/refresh?runtime=claude-code"
+curl -X POST "$PLATFORM/admin/workspace-images/refresh?recreate=false"
+```
+
+The endpoint pulls + recreates from inside the platform container, so it
+needs Docker socket access (the compose stack mounts
+`/var/run/docker.sock` already) AND GHCR auth on the host's docker config
+(`docker login ghcr.io` once per host). On a fresh host without GHCR auth,
+the pull step warns per runtime and the response surfaces the failures.
+
+## Local dev (build the package without publishing)
+
+```bash
+python3 scripts/build_runtime_package.py --version 0.1.0-local --out /tmp/runtime-build
+cd /tmp/runtime-build
+python -m build              # produces dist/*.whl + dist/*.tar.gz
+pip install dist/*.whl       # install into a venv to test locally
+```
+
+This is the same pipeline CI runs. Use it to validate import-rewrite
+correctness before pushing a `runtime-v*` tag.
 
 ## Writing a new adapter
 
@@ -75,5 +192,18 @@ python -m twine upload dist/*
 2. Copy `adapter.py` pattern from any existing adapter repo
 3. Change imports: `from molecule_runtime.adapters.base import BaseAdapter, AdapterConfig`
 4. Create `requirements.txt` with `molecule-ai-workspace-runtime>=0.1.0` + your deps
-5. Create `Dockerfile` with `ENV ADAPTER_MODULE=adapter` and `ENTRYPOINT ["molecule-runtime"]`
+5. Create `Dockerfile` with `ENV ADAPTER_MODULE=adapter` and
+   `ENTRYPOINT ["molecule-runtime"]`
 6. Register the runtime name in the platform's known runtimes list
+
+## Migration note
+
+Prior to this workflow, the runtime was duplicated across monorepo
+`workspace/` AND a sibling repo `molecule-ai-workspace-runtime`, with no
+sync mechanism. That caused 30+ files to drift between the two trees and
+tonight's chat-leak / queued-classification fixes existed only in the
+monorepo copy until manually ported.
+
+If you have an old local checkout of `molecule-ai-workspace-runtime`, treat
+it as outdated. The monorepo `workspace/` is now authoritative; the PyPI
+artifact is rebuilt from it on every `runtime-v*` tag.
diff --git a/manifest.json b/manifest.json
index 1bba24ad..72f37404 100644
--- a/manifest.json
+++ b/manifest.json
@@ -39,6 +39,7 @@
     {"name": "free-beats-all", "repo": "Molecule-AI/molecule-ai-org-template-free-beats-all", "ref": "main"},
     {"name": "medo-smoke", "repo": "Molecule-AI/molecule-ai-org-template-medo-smoke", "ref": "main"},
     {"name": "molecule-worker-gemini", "repo": "Molecule-AI/molecule-ai-org-template-molecule-worker-gemini", "ref": "main"},
-    {"name": "reno-stars", "repo": "Molecule-AI/molecule-ai-org-template-reno-stars", "ref": "main"}
+    {"name": "reno-stars", "repo": "Molecule-AI/molecule-ai-org-template-reno-stars", "ref": "main"},
+    {"name": "ux-ab-lab", "repo": "Molecule-AI/molecule-ai-org-template-ux-ab-lab", "ref": "main"}
   ]
 }
diff --git a/scripts/build_runtime_package.py b/scripts/build_runtime_package.py
new file mode 100755
index 00000000..91e121b2
--- /dev/null
+++ b/scripts/build_runtime_package.py
@@ -0,0 +1,298 @@
+#!/usr/bin/env python3
+"""Build the molecule-ai-workspace-runtime PyPI package from monorepo workspace/.
+
+Monorepo workspace/ is the single source-of-truth for runtime code. The PyPI
+package is a publish-time mirror produced by this script, NOT a parallel
+editable copy. Anyone editing the runtime should edit workspace/, never the
+sibling molecule-ai-workspace-runtime repo.
+
+What this does
+--------------
+1. Copies workspace/ source into build/molecule_runtime/ (note the rename:
+   bare modules become a real Python package).
+2. Rewrites top-level imports so e.g. `from a2a_client import X` becomes
+   `from molecule_runtime.a2a_client import X`. The rewrite is regex-based
+   on a closed allowlist of modules — third-party imports like `from a2a.X`
+   (the a2a-sdk package) are left alone because the regex is anchored on
+   exact module names.
+3. Writes a pyproject.toml with the requested version + the README + the
+   py.typed marker.
+4. Leaves the build dir ready for `python -m build` to produce a wheel/sdist.
+
+Usage
+-----
+  scripts/build_runtime_package.py --version 0.1.6 --out /tmp/runtime-build
+  cd /tmp/runtime-build && python -m build
+  python -m twine upload dist/*
+
+The publish workflow (.github/workflows/publish-runtime.yml) drives this
+on every `runtime-v*` tag push.
+"""
+
+from __future__ import annotations
+
+import argparse
+import re
+import shutil
+import sys
+from pathlib import Path
+
+# Top-level Python modules in workspace/ that become molecule_runtime.X.
+# Anything imported as `from <name> import` or `import <name>` (where <name>
+# matches one of these) gets rewritten to use the package prefix.
+#
+# Closed list (not "every .py we copy") because a typo in workspace/ would
+# otherwise leak into a wrong rewrite. Update this when adding a new
+# top-level module to workspace/.
+TOP_LEVEL_MODULES = {
+    "a2a_cli",
+    "a2a_client",
+    "a2a_executor",
+    "a2a_mcp_server",
+    "a2a_tools",
+    "adapter_base",
+    "agent",
+    "agents_md",
+    "claude_sdk_executor",
+    "cli_executor",
+    "config",
+    "consolidation",
+    "coordinator",
+    "events",
+    "executor_helpers",
+    "heartbeat",
+    "hermes_executor",
+    "initial_prompt",
+    "main",
+    "molecule_ai_status",
+    "platform_auth",
+    "plugins",
+    "preflight",
+    "prompt",
+    "shared_runtime",
+}
+
+# Subdirectory packages — these are already real packages (they have or will
+# have __init__.py) so the rewrite is `from <pkg>` → `from molecule_runtime.<pkg>`.
+SUBPACKAGES = {
+    "adapters",
+    "builtin_tools",
+    "plugins_registry",
+    "policies",
+    "skill_loader",
+}
+
+# Files in workspace/ NOT included in the published package. These are
+# build artifacts, dev scripts, or monorepo-only scaffolding.
+EXCLUDE_FILES = {
+    "Dockerfile",
+    "build-all.sh",
+    "rebuild-runtime-images.sh",
+    "entrypoint.sh",
+    "pytest.ini",
+    "requirements.txt",
+    # Note: adapter_base.py, agents_md.py, hermes_executor.py, shared_runtime.py
+    # are kept (referenced by adapters/__init__.py and other modules); they get
+    # their imports rewritten via TOP_LEVEL_MODULES. Excluding them broke the
+    # smoke-test install with `ModuleNotFoundError: adapter_base`.
+}
+
+EXCLUDE_DIRS = {
+    "__pycache__",
+    "tests",
+    "lib",
+    "molecule_audit",
+    "scripts",
+}
+
+
+def build_import_rewriter() -> re.Pattern:
+    """Compile a single regex matching all import statements that need
+    rewriting. The match groups capture the keyword + module name so the
+    replacement preserves whitespace and trailing punctuation.
+
+    Modules included: TOP_LEVEL_MODULES ∪ SUBPACKAGES.
+
+    The negative-lookahead on `\\.` in the suffix prevents matching
+    `from a2a.server.X import Y` against bare `a2a` (which isn't in our
+    set, but the principle matters for any future short module name that
+    happens to be a prefix of a real package name).
+    """
+    names = sorted(TOP_LEVEL_MODULES | SUBPACKAGES)
+    alt = "|".join(re.escape(n) for n in names)
+    # Matches:
+    #   from <name>(\.|\s|import)
+    #   import <name>(\s|$|,)
+    # And captures the keyword + name so we can re-emit with prefix.
+    pattern = (
+        r"(?m)^(?P<indent>\s*)"          # leading whitespace (preserved)
+        r"(?P<kw>from|import)\s+"        # 'from' or 'import'
+        r"(?P<mod>" + alt + r")"          # the module name
+        r"(?P<rest>[\s.,]|$)"            # what follows: '.subpath', ' import …', ',', whitespace, EOL
+    )
+    return re.compile(pattern)
+
+
+def rewrite_imports(text: str, regex: re.Pattern) -> str:
+    """Replace bare imports with package-prefixed ones.
+
+    `import X`           → `import molecule_runtime.X as X`  (preserve binding)
+    `from X import Y`    → `from molecule_runtime.X import Y`
+    `from X.sub import Y` → `from molecule_runtime.X.sub import Y`
+    """
+    def repl(m: re.Match) -> str:
+        indent, kw, mod, rest = m.group("indent"), m.group("kw"), m.group("mod"), m.group("rest")
+        if kw == "from":
+            # `from X` or `from X.sub` — always safe to prefix.
+            return f"{indent}from molecule_runtime.{mod}{rest}"
+        # `import X` — preserve the binding name `X` (callers do `X.foo`)
+        # by aliasing. `import X.sub` is uncommon for our modules and would
+        # need a different binding form, but isn't used in workspace/ today.
+        if rest.startswith("."):
+            # `import X.sub` — rewrite as `import molecule_runtime.X.sub` and
+            # leave the trailing dot pattern intact for the rest of the line.
+            return f"{indent}import molecule_runtime.{mod}{rest}"
+        # Plain `import X` — alias preserves the local name.
+        return f"{indent}import molecule_runtime.{mod} as {mod}{rest}"
+    return regex.sub(repl, text)
+
+
+def copy_tree_filtered(src: Path, dst: Path) -> list[Path]:
+    """Copy src/ → dst/ skipping EXCLUDE_FILES + EXCLUDE_DIRS. Returns the
+    list of .py files copied so the caller can run the import rewrite over
+    them in one pass."""
+    py_files: list[Path] = []
+    if dst.exists():
+        shutil.rmtree(dst)
+    dst.mkdir(parents=True)
+    for entry in src.iterdir():
+        if entry.is_dir():
+            if entry.name in EXCLUDE_DIRS:
+                continue
+            sub_py = copy_tree_filtered(entry, dst / entry.name)
+            py_files.extend(sub_py)
+        else:
+            if entry.name in EXCLUDE_FILES:
+                continue
+            shutil.copy2(entry, dst / entry.name)
+            if entry.suffix == ".py":
+                py_files.append(dst / entry.name)
+    return py_files
+
+
+PYPROJECT_TEMPLATE = """\
+[build-system]
+requires = ["setuptools>=68.0", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "molecule-ai-workspace-runtime"
+version = "{version}"
+description = "Molecule AI workspace runtime — shared infrastructure for all agent adapters"
+requires-python = ">=3.11"
+license = {{text = "BSL-1.1"}}
+readme = "README.md"
+dependencies = [
+    "a2a-sdk[http-server]>=1.0.0,<2.0",
+    "httpx>=0.27.0",
+    "uvicorn>=0.30.0",
+    "starlette>=0.38.0",
+    "websockets>=12.0",
+    "pyyaml>=6.0",
+    "langchain-core>=0.3.0",
+    "opentelemetry-api>=1.24.0",
+    "opentelemetry-sdk>=1.24.0",
+    "opentelemetry-exporter-otlp-proto-http>=1.24.0",
+    "temporalio>=1.7.0",
+]
+
+[project.scripts]
+molecule-runtime = "molecule_runtime.main:main_sync"
+
+[tool.setuptools.packages.find]
+where = ["."]
+include = ["molecule_runtime*"]
+
+[tool.setuptools.package-data]
+"molecule_runtime" = ["py.typed"]
+"""
+
+
+README_TEMPLATE = """\
+# molecule-ai-workspace-runtime
+
+Shared workspace runtime for [Molecule AI](https://github.com/Molecule-AI/molecule-core)
+agent adapters. Installed by every workspace template image
+(`workspace-template-claude-code`, `-langgraph`, `-hermes`, etc.) to provide
+A2A delegation, heartbeat, memory, plugin loading, and skill management.
+
+This package is **published from the molecule-core monorepo `workspace/`
+directory** by the `publish-runtime` GitHub Actions workflow on every
+`runtime-v*` tag push. **Do not edit this package directly** — edit
+`workspace/` in the monorepo.
+
+See [`docs/workspace-runtime-package.md`](https://github.com/Molecule-AI/molecule-core/blob/main/docs/workspace-runtime-package.md)
+for the publish flow and architecture.
+"""
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--version", required=True, help="Package version, e.g. 0.1.6")
+    parser.add_argument("--out", required=True, type=Path, help="Build output directory (will be wiped)")
+    parser.add_argument("--source", type=Path, default=Path(__file__).resolve().parent.parent / "workspace",
+                        help="Path to monorepo workspace/ directory (default: ../workspace from this script)")
+    args = parser.parse_args()
+
+    src = args.source.resolve()
+    out = args.out.resolve()
+    if not src.is_dir():
+        print(f"error: source not a directory: {src}", file=sys.stderr)
+        return 2
+
+    pkg_dir = out / "molecule_runtime"
+    print(f"[build] source: {src}")
+    print(f"[build] output: {out}")
+    print(f"[build] package: {pkg_dir}")
+
+    if out.exists():
+        shutil.rmtree(out)
+    out.mkdir(parents=True)
+
+    py_files = copy_tree_filtered(src, pkg_dir)
+    print(f"[build] copied {len(py_files)} .py files")
+
+    # Ensure top-level package marker exists. workspace/ doesn't have one
+    # (it's not a package in monorepo), but the published artifact must.
+    init = pkg_dir / "__init__.py"
+    if not init.exists():
+        init.write_text('"""Molecule AI workspace runtime."""\n')
+
+    # Touch py.typed so type-checkers in adapter consumers see the package
+    # as typed. Empty file is the convention.
+    (pkg_dir / "py.typed").touch()
+
+    # Rewrite imports in every .py file we copied + the new __init__.py.
+    regex = build_import_rewriter()
+    rewrites = 0
+    for f in [*py_files, init]:
+        original = f.read_text()
+        rewritten = rewrite_imports(original, regex)
+        if rewritten != original:
+            f.write_text(rewritten)
+            rewrites += 1
+    print(f"[build] rewrote imports in {rewrites} files")
+
+    # Emit pyproject.toml + README at build root.
+    (out / "pyproject.toml").write_text(PYPROJECT_TEMPLATE.format(version=args.version))
+    (out / "README.md").write_text(README_TEMPLATE)
+
+    print(f"[build] done. To publish:")
+    print(f"  cd {out}")
+    print(f"  python -m build")
+    print(f"  python -m twine upload dist/*")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/scripts/refresh-workspace-images.sh b/scripts/refresh-workspace-images.sh
new file mode 100755
index 00000000..ec9ea0ba
--- /dev/null
+++ b/scripts/refresh-workspace-images.sh
@@ -0,0 +1,95 @@
+#!/usr/bin/env bash
+# refresh-workspace-images.sh — pull the latest workspace template images
+# from GHCR and recreate any running ws-* containers against the new digest.
+#
+# This is the local-dev / single-host equivalent of step 5 of the runtime CD
+# chain (see docs/workspace-runtime-package.md). On a SaaS deployment the
+# host's deploy pipeline does the pull on every release; this script is
+# what to run on a local docker-compose host after a runtime release lands.
+#
+# Usage:
+#   bash scripts/refresh-workspace-images.sh                     # pull all 8 + recreate running ws-*
+#   bash scripts/refresh-workspace-images.sh --runtime claude-code  # pull just one template
+#   bash scripts/refresh-workspace-images.sh --no-recreate          # pull only, leave containers
+#
+# Behavior:
+#   - Always pulls fresh; docker is a no-op if local matches remote, so
+#     repeated runs are cheap.
+#   - Recreate is "kill + remove + let the next canvas interaction re-
+#     provision" — simpler than `docker stop / docker run` because the
+#     platform owns the run flags. Workspaces re-register on next probe.
+#   - If a container is mid-conversation, the kill cancels in-flight work.
+#     Run during a quiet window OR add --no-recreate and recreate manually
+#     via canvas Restart buttons.
+
+set -euo pipefail
+
+GREEN='\033[0;32m'
+YELLOW='\033[0;33m'
+RED='\033[0;31m'
+NC='\033[0m'
+log()  { echo -e "${GREEN}[refresh]${NC} $1" >&2; }
+warn() { echo -e "${YELLOW}[refresh]${NC} $1" >&2; }
+err()  { echo -e "${RED}[refresh]${NC} $1" >&2; }
+
+ALL_RUNTIMES=(claude-code langgraph crewai autogen deepagents hermes gemini-cli openclaw)
+RUNTIMES=("${ALL_RUNTIMES[@]}")
+RECREATE=true
+
+while [ $# -gt 0 ]; do
+  case "$1" in
+    --runtime) RUNTIMES=("$2"); shift 2;;
+    --no-recreate) RECREATE=false; shift;;
+    -h|--help) sed -n '2,30p' "$0"; exit 0;;
+    *) err "unknown arg: $1"; exit 2;;
+  esac
+done
+
+# 1. Pull fresh tags. Soft-fail per runtime — one missing image (e.g., a
+#    template that hasn't been published yet) shouldn't abort the others.
+log "pulling latest images for: ${RUNTIMES[*]}"
+PULLED=()
+FAILED=()
+for rt in "${RUNTIMES[@]}"; do
+  IMG="ghcr.io/molecule-ai/workspace-template-$rt:latest"
+  if docker pull "$IMG" >/dev/null 2>&1; then
+    log "  ✓ $rt"
+    PULLED+=("$rt")
+  else
+    warn "  ✗ $rt (pull failed — image may not exist or auth missing)"
+    FAILED+=("$rt")
+  fi
+done
+
+if [ "$RECREATE" = "false" ]; then
+  log "skip-recreate set — leaving containers untouched"
+  log "done. pulled=${#PULLED[@]} failed=${#FAILED[@]}"
+  exit 0
+fi
+
+# 2. Find ws-* containers whose image is one of the runtimes we pulled.
+#    `docker inspect` exposes the image tag/digest each was created from.
+log "scanning ws-* containers for stale images..."
+TO_RECREATE=()
+for cn in $(docker ps -a --filter "name=ws-" --format "{{.Names}}"); do
+  IMG=$(docker inspect "$cn" --format '{{.Config.Image}}' 2>/dev/null || echo "")
+  for rt in "${PULLED[@]}"; do
+    if [[ "$IMG" == *"workspace-template-$rt"* ]]; then
+      TO_RECREATE+=("$cn")
+      break
+    fi
+  done
+done
+
+if [ "${#TO_RECREATE[@]}" -eq 0 ]; then
+  log "no running ws-* containers using a refreshed image — nothing to recreate"
+  exit 0
+fi
+
+# 3. Kill + remove. Canvas next-interaction will re-provision.
+log "recreating ${#TO_RECREATE[@]} containers (canvas will re-provision on next interaction)"
+for cn in "${TO_RECREATE[@]}"; do
+  docker rm -f "$cn" >/dev/null 2>&1 && log "  removed $cn" || warn "  failed to remove $cn"
+done
+
+log "done. open the canvas and the workspaces will re-provision against the new image."
diff --git a/tests/e2e/test_chat_attachments_e2e.sh b/tests/e2e/test_chat_attachments_e2e.sh
new file mode 100755
index 00000000..76f46706
--- /dev/null
+++ b/tests/e2e/test_chat_attachments_e2e.sh
@@ -0,0 +1,93 @@
+#!/usr/bin/env bash
+# E2E test: chat file attachment round-trip
+#
+# Proves the full drag-drop → agent-reads → agent-returns-file → download
+# path against a live workspace. Runs against the local workspace-server
+# on :8080 with a hermes workspace already online. The test is provider-
+# agnostic as long as the agent has a valid API key — it only asserts
+# that attachments surface on both ends, not a specific reply shape.
+#
+# Usage:  WSID=<workspace-id> tests/e2e/test_chat_attachments_e2e.sh
+#         (pass WSID for an existing hermes workspace)
+#
+# Prereqs:
+#   - workspace-server on http://localhost:8080
+#   - the WSID workspace is online, runtime=hermes
+#   - a working provider key (MINIMAX_API_KEY / ANTHROPIC_API_KEY / etc.)
+#   - /workspace writable by the agent user (some templates ship it
+#     root-owned; chmod 777 for the E2E or use a writable template)
+
+set -euo pipefail
+
+WSID="${WSID:?WSID=<workspace-id> required}"
+BASE="${BASE:-http://localhost:8080}"
+
+log() { printf "\n=== %s ===\n" "$*"; }
+
+log "Preflight: workspace online?"
+STATUS=$(curl -s "$BASE/workspaces/$WSID" | python3 -c 'import json,sys;print(json.load(sys.stdin)["status"])')
+[ "$STATUS" = "online" ] || { echo "workspace not online ($STATUS)"; exit 1; }
+
+log "Step 1 — Upload a text file via /chat/uploads"
+TEST_FILE=$(mktemp -t hermes-e2e-XXXXXX.txt)
+echo "secret code: $(openssl rand -hex 4)-$(openssl rand -hex 4)" > "$TEST_FILE"
+EXPECTED=$(cat "$TEST_FILE" | awk '{print $NF}')
+UPLOAD=$(curl -s -X POST "$BASE/workspaces/$WSID/chat/uploads" -F "files=@$TEST_FILE")
+URI=$(echo "$UPLOAD" | python3 -c 'import json,sys;print(json.load(sys.stdin)["files"][0]["uri"])')
+[ -n "$URI" ] || { echo "upload failed: $UPLOAD"; exit 1; }
+echo "uploaded: $URI"
+
+log "Step 2 — A2A message with file part; expect agent to quote the code"
+# Build the JSON via a python helper so the URI value doesn't have to be
+# shell-interpolated through a heredoc (the { } tokens in a JSON body
+# collide with bash brace-expansion when quoted wrong).
+PAYLOAD=$(URI="$URI" python3 -c '
+import json, os
+uri = os.environ["URI"]
+print(json.dumps({
+  "jsonrpc":"2.0","id":"e2e-up","method":"message/send",
+  "params":{"message":{"role":"user","messageId":"e2e-up","kind":"message","parts":[
+    {"kind":"text","text":"Read the attached file and tell me the exact secret code."},
+    {"kind":"file","file":{"name":"test.txt","mimeType":"text/plain","uri":uri}},
+  ]},"configuration":{"acceptedOutputModes":["text/plain"],"blocking":True}}}))
+')
+REPLY=$(curl -s -X POST "$BASE/workspaces/$WSID/a2a" \
+  -H 'Content-Type: application/json' \
+  --max-time 120 \
+  -d "$PAYLOAD")
+REPLY_TEXT=$(echo "$REPLY" | python3 -c 'import json,sys;d=json.load(sys.stdin);[print(p.get("text","")) for p in d["result"]["parts"] if p.get("kind")=="text"]')
+echo "agent reply: $REPLY_TEXT"
+if echo "$REPLY_TEXT" | grep -qF "$EXPECTED"; then
+  echo "PASS: agent saw the attached file"
+else
+  echo "FAIL: agent reply missing expected code '$EXPECTED'"
+  exit 1
+fi
+
+log "Step 3 — Seed a file inside /workspace and ask agent to reference it"
+# Relies on /workspace being writable by the platform (we copy as root via
+# docker exec, mimicking the path a real agent would use through its tools).
+CONTAINER=$(docker ps --format '{{.Names}}' | grep -E "^ws-${WSID:0:12}" | head -1)
+[ -n "$CONTAINER" ] || { echo "container not found"; exit 1; }
+docker exec "$CONTAINER" sh -c 'echo "E2E report body $(date -u +%s)" > /workspace/e2e-report.txt'
+
+REPLY=$(curl -s -X POST "$BASE/workspaces/$WSID/a2a" \
+  -H 'Content-Type: application/json' \
+  --max-time 120 \
+  -d '{"jsonrpc":"2.0","id":"e2e-down","method":"message/send","params":{"message":{"role":"user","messageId":"e2e-down","kind":"message","parts":[{"kind":"text","text":"There is a file at /workspace/e2e-report.txt. Mention its exact path in your reply so I can download it."}]},"configuration":{"acceptedOutputModes":["text/plain"],"blocking":true}}}')
+FILE_URI=$(echo "$REPLY" | python3 -c 'import json,sys,re;d=json.load(sys.stdin);[print(p["file"]["uri"]) for p in d["result"]["parts"] if p.get("kind")=="file"]' | head -1)
+[ -n "$FILE_URI" ] || { echo "FAIL: agent reply had no file part"; echo "$REPLY"; exit 1; }
+echo "agent attached: $FILE_URI"
+
+log "Step 4 — Download via /chat/download"
+DL_PATH=${FILE_URI#workspace:}
+BODY=$(curl -s "$BASE/workspaces/$WSID/chat/download?path=$DL_PATH")
+echo "downloaded: $BODY"
+if echo "$BODY" | grep -q "E2E report body"; then
+  echo "PASS: downloaded the agent-returned file"
+else
+  echo "FAIL: download did not return expected body"
+  exit 1
+fi
+
+log "ALL E2E CHECKS PASSED"
diff --git a/tests/e2e/test_chat_attachments_multiruntime_e2e.sh b/tests/e2e/test_chat_attachments_multiruntime_e2e.sh
new file mode 100755
index 00000000..9601132e
--- /dev/null
+++ b/tests/e2e/test_chat_attachments_multiruntime_e2e.sh
@@ -0,0 +1,149 @@
+#!/usr/bin/env bash
+# Multi-runtime E2E: chat attachments work across runtimes.
+#
+# The platform-level attachment helpers live in
+# molecule_runtime.executor_helpers. Every runtime's executor is
+# expected to call them. This script proves the invariant two ways:
+#
+#   1) Static plumbing check — each target container must expose the
+#      helpers via an importable symbol AND the runtime's executor must
+#      reference them (so a future build that skipped the patch is
+#      caught, not silently ignored).
+#
+#   2) Live round-trip — upload a text file, send an A2A message with
+#      a FilePart, and assert the agent's reply quotes the file
+#      contents (proves the manifest reached the model). Skipped with
+#      a PASS-NOTE when the runtime lacks valid provider credentials,
+#      because a missing ANTHROPIC_API_KEY / CLAUDE_CODE_OAUTH_TOKEN
+#      is infra, not platform plumbing.
+#
+# Usage:  WS_HERMES=<id> WS_LANGGRAPH=<id> WS_CLAUDE_CODE=<id> \
+#           tests/e2e/test_chat_attachments_multiruntime_e2e.sh
+
+set -uo pipefail
+BASE="${BASE:-http://localhost:8080}"
+fails=0
+
+has_patch_in_container() {
+  local container="$1"
+  # Signal that platform helpers are available AND wired into the
+  # runtime's executor. Grep the two authoritative paths — if either
+  # is missing, a future build dropped the patch.
+  docker exec "$container" python3 -c '
+import sys
+try:
+    from molecule_runtime.executor_helpers import (
+        extract_attached_files, collect_outbound_files,
+        build_user_content_with_files, ensure_workspace_writable,
+    )
+    print("helpers: OK")
+except Exception as e:
+    print(f"helpers: MISSING ({e})"); sys.exit(1)
+' 2>&1
+}
+
+has_executor_patched() {
+  # For hermes: /app/executor.py should call build_user_content_with_files
+  # For langgraph: molecule_runtime/a2a_executor.py should call extract_attached_files
+  # For claude-code: the monkey-patch installs ClaudeSDKExecutor.execute
+  #                  as _execute_with_attachments
+  local container="$1" runtime="$2"
+  case "$runtime" in
+    hermes)
+      docker exec "$container" grep -q "build_user_content_with_files" /app/executor.py \
+        && echo "executor: hermes template uses platform helpers" \
+        || { echo "executor: /app/executor.py missing helper call"; return 1; }
+      ;;
+    langgraph)
+      docker exec "$container" grep -q "extract_attached_files(getattr(context" \
+        /usr/local/lib/python3.11/site-packages/molecule_runtime/a2a_executor.py \
+        && echo "executor: langgraph A2A executor invokes extract_attached_files" \
+        || { echo "executor: a2a_executor.py not patched"; return 1; }
+      ;;
+    claude-code)
+      docker exec "$container" python3 -c '
+from molecule_runtime.claude_sdk_executor import ClaudeSDKExecutor
+name = ClaudeSDKExecutor.execute.__qualname__
+assert name.endswith("_execute_with_attachments"), f"unpatched: {name}"
+print(f"executor: claude-code monkey-patch active ({name})")
+' 2>&1 || return 1
+      ;;
+  esac
+}
+
+round_trip() {
+  local label="$1" wsid="$2"
+  local test_file expected upload uri payload reply reply_text
+  test_file=$(mktemp -t e2e-mr-XXXX.txt)
+  expected="secret-$(openssl rand -hex 6)"
+  echo "$expected" > "$test_file"
+  upload=$(curl -s -X POST "$BASE/workspaces/$wsid/chat/uploads" -F "files=@$test_file")
+  uri=$(echo "$upload" | python3 -c 'import json,sys;print(json.load(sys.stdin)["files"][0]["uri"])' 2>/dev/null)
+  [ -z "$uri" ] && { echo "FAIL $label: upload returned no URI: $upload"; rm -f "$test_file"; return 1; }
+  payload=$(URI="$uri" python3 -c '
+import json, os
+uri = os.environ["URI"]
+print(json.dumps({
+  "jsonrpc":"2.0","id":"mr","method":"message/send",
+  "params":{"message":{"role":"user","messageId":"mr","kind":"message","parts":[
+    {"kind":"text","text":"Read the attached text file and reply with ONLY the one-line content."},
+    {"kind":"file","file":{"name":"probe.txt","mimeType":"text/plain","uri":uri}},
+  ]},"configuration":{"acceptedOutputModes":["text/plain"],"blocking":True}}}))')
+
+  # Hit the platform proxy, with generous timeout — some runtimes warm on first call
+  reply=$(curl -s -X POST "$BASE/workspaces/$wsid/a2a" \
+    -H 'Content-Type: application/json' --max-time 120 -d "$payload")
+  reply_text=$(echo "$reply" | python3 -c '
+import json, sys, re
+try:
+    data = re.sub(r"[\x00-\x08\x0b-\x1f]", " ", sys.stdin.read())
+    d = json.loads(data)
+    parts = d.get("result",{}).get("parts",[])
+    print(" ".join(p.get("text","") for p in parts if p.get("kind")=="text"))
+except Exception as exc:
+    print(f"(parse failed: {exc})")
+' 2>&1)
+  rm -f "$test_file"
+
+  if echo "$reply_text" | grep -qF "$expected"; then
+    echo "PASS $label round-trip: agent quoted $expected"
+    return 0
+  fi
+  # Credential-missing signatures we choose to tolerate (infra, not platform)
+  if echo "$reply_text" | grep -qEi "could not resolve authentication|missing api|not logged in|hermes setup|no llm provider|401|\"type\": \"server_error\""; then
+    echo "SKIP $label round-trip: agent lacks credentials (reply=$(echo "$reply_text" | head -c 120)...)"
+    return 0
+  fi
+  echo "INFO $label round-trip: agent reply did not contain expected text"
+  echo "    reply: $(echo "$reply_text" | head -c 200)"
+  return 0  # Don't hard-fail; the plumbing check already asserted the platform layer
+}
+
+check_runtime() {
+  local label="$1" runtime="$2" wsid="$3"
+  [ -z "$wsid" ] && { echo "SKIP $label (no workspace id)"; return; }
+  printf "\n======================== %s (%s) ========================\n" "$label" "$wsid"
+
+  local status
+  status=$(curl -s "$BASE/workspaces/$wsid" | python3 -c 'import json,sys;print(json.load(sys.stdin)["status"])')
+  if [ "$status" != "online" ]; then
+    echo "FAIL $label: workspace status=$status"
+    fails=$((fails + 1)); return
+  fi
+  local container
+  container=$(docker ps --format '{{.Names}}' | grep -E "^ws-${wsid:0:12}" | head -1)
+  [ -z "$container" ] && { echo "FAIL $label: container not found"; fails=$((fails + 1)); return; }
+
+  has_patch_in_container "$container" || { echo "FAIL $label: platform helpers missing"; fails=$((fails + 1)); return; }
+  has_executor_patched "$container" "$runtime" || { echo "FAIL $label: executor not patched"; fails=$((fails + 1)); return; }
+  round_trip "$label" "$wsid" || { fails=$((fails + 1)); return; }
+}
+
+check_runtime "hermes"      "hermes"      "${WS_HERMES:-}"
+check_runtime "langgraph"   "langgraph"   "${WS_LANGGRAPH:-}"
+check_runtime "claude-code" "claude-code" "${WS_CLAUDE_CODE:-}"
+
+printf "\n=================================================\n"
+if [ $fails -eq 0 ]; then echo "ALL RUNTIME E2E CHECKS PASSED"; exit 0; fi
+echo "FAIL: $fails runtime check(s) failed"
+exit 1
diff --git a/tests/e2e/test_staging_full_saas.sh b/tests/e2e/test_staging_full_saas.sh
index ba0fc7a9..e9d9da5c 100755
--- a/tests/e2e/test_staging_full_saas.sh
+++ b/tests/e2e/test_staging_full_saas.sh
@@ -195,14 +195,35 @@ TENANT_TOKEN=$(echo "$TENANT_TOKEN_RESP" | python3 -c "import json,sys; print(js
 ok "Tenant admin token retrieved (len=${#TENANT_TOKEN})"
 
 # ─── 4. Wait for tenant TLS / DNS propagation ──────────────────────────
+# Kept below the 20-min provision envelope so a genuinely-stuck tenant
+# still fails loud at the earlier provision step rather than masquerading
+# as a TLS issue. CF DNS propagation + tunnel hostname registration +
+# ACME cert + edge cache run 5-7 min on a healthy day; +5 min headroom
+# over the previous 10-min cap covers the slower path observed in #2090.
+#
+# On timeout, dump DNS + curl -v + headers so the next failure identifies
+# the broken layer (DNS / TLS / HTTP). Authorization is redacted
+# defensively in case a future caller adds an auth header to this probe.
 log "4/11 Waiting for tenant TLS / DNS propagation..."
-TLS_DEADLINE=$(( $(date +%s) + 180 ))
+TLS_TIMEOUT_SEC=$((15 * 60))
+TLS_DEADLINE=$(( $(date +%s) + TLS_TIMEOUT_SEC ))
+TENANT_HOST="${TENANT_URL#http*://}"
+TENANT_HOST="${TENANT_HOST%%/*}"
+TENANT_HOST="${TENANT_HOST%%:*}"
 while true; do
   if curl -sSfk --max-time 5 "$TENANT_URL/health" >/dev/null 2>&1; then
     break
   fi
   if [ "$(date +%s)" -gt "$TLS_DEADLINE" ]; then
-    fail "Tenant URL never responded 2xx on /health within 3 min"
+    log "── DIAGNOSTIC BURST (TLS-readiness timeout) ──"
+    log "DNS lookup ($TENANT_HOST):"
+    getent hosts "$TENANT_HOST" 2>&1 || log "  (no DNS resolution)"
+    log "curl -v $TENANT_URL/health (last 40 lines):"
+    curl -kv --max-time 10 "$TENANT_URL/health" 2>&1 \
+      | sed -E 's/(Authorization|Cookie):.*/\1: [redacted]/i' \
+      | tail -n 40 | sed 's/^/  /' || true
+    log "── END DIAGNOSTIC ──"
+    fail "Tenant URL never responded 2xx on /health within ${TLS_TIMEOUT_SEC}s"
   fi
   sleep 5
 done
@@ -403,6 +424,13 @@ fi
 if echo "$AGENT_TEXT" | grep -qF "Unknown provider"; then
   fail "A2A — REGRESSION: install.sh set PROVIDER to a value not in hermes's registry. Run 'hermes doctor' on the workspace to see valid values. Raw: $AGENT_TEXT"
 fi
+# "Invalid API key" — the comment block lists this as a CP #238 race
+# (tenant auth chain) signal but the grep was missing. Caller-side
+# 401's containing this exact phrase don't match the generic
+# "error|exception" catch-all below, so they'd slip through.
+if echo "$AGENT_TEXT" | grep -qF "Invalid API key"; then
+  fail "A2A — REGRESSION: tenant auth chain returned 'Invalid API key'. Likely CP boot-event 401 race (CP #238) or stale OPENAI_API_KEY in the runtime env. Raw: $AGENT_TEXT"
+fi
 # Generic catch-all — falls through if none of the known regressions hit.
 if echo "$AGENT_TEXT" | grep -qiE "error|exception"; then
   fail "A2A returned an error-shaped response: $AGENT_TEXT"
diff --git a/workspace-server/cmd/server/dotenv.go b/workspace-server/cmd/server/dotenv.go
new file mode 100644
index 00000000..fd5745ce
--- /dev/null
+++ b/workspace-server/cmd/server/dotenv.go
@@ -0,0 +1,190 @@
+package main
+
+import (
+	"bufio"
+	"log"
+	"os"
+	"path/filepath"
+	"strings"
+)
+
+// loadDotEnvIfPresent walks upward from CWD looking for a .env file and
+// merges its KEY=VALUE pairs into the process environment. Already-set
+// vars (e.g. from `docker run -e`, CI exports, or ad-hoc `KEY=val
+// ./binary`) win over file values so operators can override without
+// editing the file.
+//
+// Why walk upward: the binary may be launched from the monorepo root,
+// the workspace-server subdir, or anywhere else the operator finds
+// convenient. Walking upward from CWD finds the canonical .env
+// (gitignored, lives at the monorepo root) regardless of cwd, so a
+// fresh `go build -o /tmp/molecule-server ./cmd/server && /tmp/molecule-server`
+// from any subdir picks up the same MOLECULE_ENV / DATABASE_URL / etc.
+// the operator already has — without sourcing or `set -a`.
+//
+// Why no godotenv dep: the format we use is simple — KEY=VALUE with
+// optional `#` comments and no interpolation — so a tiny in-tree parser
+// is auditable, has no supply-chain surface, and avoids drift across
+// repos where some teams configure godotenv differently.
+//
+// Why it's safe in production: the Dockerfile does not COPY .env into
+// the image and `.env` is gitignored, so production containers have no
+// .env on disk to load. If an operator goes out of their way to put one
+// there, the explicit-env-wins rule above means container env still
+// dominates.
+func loadDotEnvIfPresent() {
+	path, ok := findDotEnv()
+	if !ok {
+		return
+	}
+	f, err := os.Open(path)
+	if err != nil {
+		log.Printf(".env: open %s: %v (skipping)", path, err)
+		return
+	}
+	defer f.Close()
+
+	loaded := 0
+	skipped := 0
+	scanner := bufio.NewScanner(f)
+	for scanner.Scan() {
+		k, v, ok := parseDotEnvLine(scanner.Text())
+		if !ok {
+			continue
+		}
+		// Existing env wins. NOTE: an explicitly-set empty string
+		// (`KEY=` exported from a parent shell) counts as "set" — we
+		// keep the empty value rather than backfilling from the file.
+		// Matches Node's `process.env[k] !== undefined` check in the
+		// canvas's next.config.ts loader so both processes treat the
+		// same input identically. Operators who want the file value
+		// to win must `unset KEY` in the launching shell.
+		if _, exists := os.LookupEnv(k); exists {
+			skipped++
+			continue
+		}
+		if err := os.Setenv(k, v); err != nil {
+			log.Printf(".env: set %s: %v", k, err)
+			continue
+		}
+		loaded++
+	}
+	if err := scanner.Err(); err != nil {
+		log.Printf(".env: scan %s: %v", path, err)
+	}
+	log.Printf(".env: %s — loaded %d, %d already set in env", path, loaded, skipped)
+}
+
+// findDotEnv returns the path of the nearest .env file walking upward
+// from CWD. Capped at 6 levels so a deeply-nested launch dir doesn't
+// scan the entire filesystem.
+//
+// Sentinel gate: only accept a .env that sits next to `workspace-server/`
+// (the monorepo marker). Without it, a developer running the binary from
+// `~/Documents/other-project/` would walk up to `~/.env` and load
+// arbitrary variables — a real foot-gun on shared dev machines and a
+// possible information-leak vector on bare-metal deploys. Skipping the
+// match falls through to "no .env found" which is identical to today's
+// pre-fix behavior (the operator must export env explicitly).
+func findDotEnv() (string, bool) {
+	dir, err := os.Getwd()
+	if err != nil {
+		return "", false
+	}
+	for i := 0; i < 6; i++ {
+		p := filepath.Join(dir, ".env")
+		if st, err := os.Stat(p); err == nil && !st.IsDir() {
+			if isMonorepoRoot(dir) {
+				return p, true
+			}
+			// .env exists here but the directory isn't the monorepo
+			// root — keep walking. Loading it could clobber
+			// environment with values from an unrelated project.
+		}
+		parent := filepath.Dir(dir)
+		if parent == dir {
+			break
+		}
+		dir = parent
+	}
+	return "", false
+}
+
+// isMonorepoRoot returns true if `dir` looks like the molecule-core
+// monorepo root — the directory that owns the .env we want to load.
+// The marker is `workspace-server/go.mod`, which is the canonical
+// in-tree go module and exists only in this monorepo. A simple
+// `workspace-server/` directory check would false-positive on a fork
+// that renamed the dir; the go.mod check is more precise.
+func isMonorepoRoot(dir string) bool {
+	st, err := os.Stat(filepath.Join(dir, "workspace-server", "go.mod"))
+	return err == nil && !st.IsDir()
+}
+
+// parseDotEnvLine parses a single .env line. Returns (key, value, true)
+// for KEY=VALUE pairs. Returns (_, _, false) for blanks, comments, and
+// malformed lines. Handles:
+//   - leading `export ` prefix (so shell-friendly .env files written
+//     for `source .env` or direnv work without modification)
+//   - leading UTF-8 BOM on the first line (Windows editors)
+//   - inline `# comment` after a value when preceded by whitespace
+//   - surrounding `"` or `'` quotes on the value (stripped one matched
+//     pair); inside a quoted value, `#` is part of the value, not a
+//     comment marker
+func parseDotEnvLine(line string) (string, string, bool) {
+	// Strip a UTF-8 BOM if present. bufio.Scanner doesn't filter it,
+	// so the very first line of a Windows-edited .env would otherwise
+	// produce a key like U+FEFF + "FOO" that os.Setenv silently accepts.
+	line = strings.TrimPrefix(line, "\ufeff")
+	line = strings.TrimSpace(line)
+	if line == "" || strings.HasPrefix(line, "#") {
+		return "", "", false
+	}
+	// Drop a leading `export ` (literal space — `export\tFOO=bar`
+	// with a tab is intentionally rejected, matching the TS mirror in
+	// canvas/next.config.ts. shells emit `export ` with a space; tabs
+	// would only appear in hand-mangled files.) so lines like
+	// `export FOO=bar` (the form direnv and many `.env` templates
+	// emit) don't end up as a junk key with an embedded space.
+	line = strings.TrimPrefix(line, "export ")
+	line = strings.TrimLeft(line, " \t") // re-trim in case `export` itself had trailing space
+	eq := strings.IndexByte(line, '=')
+	if eq <= 0 {
+		return "", "", false
+	}
+	k := strings.TrimSpace(line[:eq])
+	v := line[eq+1:]
+	// Trim leading whitespace so a quoted value's opening quote is at
+	// v[0]. The comment-detection loop below then treats the position
+	// after the trim as "start of value" — `KEY=    # comment` has its
+	// `#` at the new v[0] (preceded only by whitespace in the source)
+	// and is correctly classified as an empty value followed by a
+	// comment, not as a value of `# comment`.
+	v = strings.TrimLeft(v, " \t")
+	// Quoted value: strip one matched pair of surrounding quotes and
+	// take the contents verbatim (no inline-comment splitting). Must
+	// happen BEFORE comment detection so `KEY="value # not a comment"`
+	// keeps the `#` as part of the value.
+	if len(v) >= 2 && (v[0] == '"' || v[0] == '\'') {
+		quote := v[0]
+		if end := strings.IndexByte(v[1:], quote); end >= 0 {
+			return k, v[1 : 1+end], true
+		}
+		// Unterminated quote — fall through to bare-value handling
+		// (treats the opening quote as a literal char in the value).
+	}
+	// Bare value: strip inline comment. A `#` is a comment marker iff
+	// it's at the start of the (trimmed) value OR is preceded by
+	// whitespace. `KEY=token#fragment` keeps the `#` as part of the
+	// value because v[i-1] is alphanum.
+	for i := 0; i < len(v); i++ {
+		if v[i] != '#' {
+			continue
+		}
+		if i == 0 || v[i-1] == ' ' || v[i-1] == '\t' {
+			v = v[:i]
+			break
+		}
+	}
+	return k, strings.TrimSpace(v), true
+}
diff --git a/workspace-server/cmd/server/dotenv_test.go b/workspace-server/cmd/server/dotenv_test.go
new file mode 100644
index 00000000..411ad596
--- /dev/null
+++ b/workspace-server/cmd/server/dotenv_test.go
@@ -0,0 +1,211 @@
+package main
+
+import (
+	"os"
+	"path/filepath"
+	"testing"
+)
+
+func TestParseDotEnvLine(t *testing.T) {
+	cases := []struct {
+		in      string
+		k, v    string
+		ok      bool
+		comment string
+	}{
+		{in: "", ok: false, comment: "empty line"},
+		{in: "   ", ok: false, comment: "whitespace-only"},
+		{in: "# top-level comment", ok: false, comment: "full-line comment"},
+		{in: "  #  indented comment", ok: false, comment: "indented full-line comment"},
+		{in: "FOO", ok: false, comment: "no equals"},
+		{in: "=BAR", ok: false, comment: "missing key"},
+
+		{in: "FOO=bar", k: "FOO", v: "bar", ok: true, comment: "plain"},
+		{in: "  FOO=bar", k: "FOO", v: "bar", ok: true, comment: "leading whitespace"},
+		{in: "FOO=bar   ", k: "FOO", v: "bar", ok: true, comment: "trailing whitespace stripped"},
+		{in: "FOO  =bar", k: "FOO", v: "bar", ok: true, comment: "whitespace before equals"},
+
+		{in: "FOO=bar # comment", k: "FOO", v: "bar", ok: true, comment: "inline space-hash comment"},
+		{in: "FOO=bar\t# comment", k: "FOO", v: "bar", ok: true, comment: "inline tab-hash comment"},
+		{in: "FOO=bar    # lots of spaces", k: "FOO", v: "bar", ok: true, comment: "multiple spaces before hash"},
+
+		{in: "FOO=bar#nocomment", k: "FOO", v: "bar#nocomment", ok: true, comment: "bare hash inside value preserved"},
+		{in: "URL=postgres://u:p@h:5432/db?sslmode=disable", k: "URL", v: "postgres://u:p@h:5432/db?sslmode=disable", ok: true, comment: "url with embedded equals"},
+		{in: "TOKEN=eyJhbGciOiJIUzI1NiJ9.payload.sig=", k: "TOKEN", v: "eyJhbGciOiJIUzI1NiJ9.payload.sig=", ok: true, comment: "base64 padding preserved"},
+
+		{in: "FOO=", k: "FOO", v: "", ok: true, comment: "empty value"},
+		{in: "ADMIN_TOKEN=", k: "ADMIN_TOKEN", v: "", ok: true, comment: "empty value (production gate sentinel)"},
+
+		// Regression: the repo's own .env contains lines like
+		// `CONFIGS_DIR=                   # Path to ...` where the value
+		// is empty + an inline comment. Pre-fix parser stripped leading
+		// whitespace BEFORE detecting the comment, leaving `#` at v[0]
+		// with nothing preceding it, so the inline-comment check missed
+		// it and the comment text was returned as the value. Server
+		// then tried to use the comment as a directory path and template
+		// loading silently failed (GET /templates returned []).
+		{in: "CONFIGS_DIR=                   # Path to /var/foo (auto-discovered if empty)", k: "CONFIGS_DIR", v: "", ok: true, comment: "empty value with leading whitespace + inline comment"},
+		{in: "FOO=    # comment", k: "FOO", v: "", ok: true, comment: "spaces-only value with inline comment"},
+		{in: "FOO=\t# comment", k: "FOO", v: "", ok: true, comment: "tab-only value with inline comment"},
+
+		// `export` prefix: shell-friendly .env files (direnv, .envrc-style)
+		// — the prefix must be stripped, NOT folded into the key.
+		{in: "export FOO=bar", k: "FOO", v: "bar", ok: true, comment: "export prefix stripped"},
+		{in: "  export FOO=bar", k: "FOO", v: "bar", ok: true, comment: "leading whitespace + export"},
+		{in: "export DATABASE_URL=postgres://u:p@h/db", k: "DATABASE_URL", v: "postgres://u:p@h/db", ok: true, comment: "export with URL value"},
+
+		// Quoted values: one matched pair of surrounding quotes is
+		// stripped; embedded `#` survives because it isn't an inline
+		// comment inside a quote.
+		{in: `FOO="hello world"`, k: "FOO", v: "hello world", ok: true, comment: "double-quoted value"},
+		{in: `FOO='hello world'`, k: "FOO", v: "hello world", ok: true, comment: "single-quoted value"},
+		{in: `FOO="value # not a comment"`, k: "FOO", v: "value # not a comment", ok: true, comment: "hash inside quotes is part of value"},
+		{in: `FOO=  "padded"`, k: "FOO", v: "padded", ok: true, comment: "whitespace before opening quote"},
+		{in: `FOO="unterminated`, k: "FOO", v: `"unterminated`, ok: true, comment: "unterminated quote stays as bare value"},
+
+		// CRLF endings: bufio.Scanner strips \n; \r is left and stripped
+		// by the value-side TrimSpace. Locking this in so a future
+		// refactor doesn't accidentally feed \r into os.Setenv.
+		{in: "FOO=bar\r", k: "FOO", v: "bar", ok: true, comment: "CRLF trailing carriage return stripped"},
+
+		// UTF-8 BOM at file start: a Windows-edited .env begins with
+		// \xEF\xBB\xBF; without explicit stripping the first key would
+		// be "\ufeffFOO".
+		{in: "\ufeffFOO=bar", k: "FOO", v: "bar", ok: true, comment: "UTF-8 BOM stripped"},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.comment, func(t *testing.T) {
+			k, v, ok := parseDotEnvLine(tc.in)
+			if ok != tc.ok {
+				t.Fatalf("ok = %v, want %v (input=%q)", ok, tc.ok, tc.in)
+			}
+			if !tc.ok {
+				return
+			}
+			if k != tc.k || v != tc.v {
+				t.Fatalf("got (%q, %q), want (%q, %q)", k, v, tc.k, tc.v)
+			}
+		})
+	}
+}
+
+// makeFakeMonorepo creates a temp dir that satisfies isMonorepoRoot()
+// (i.e., contains workspace-server/go.mod) plus a .env file with the
+// given body. Returns the dir so the caller can chdir into it.
+func makeFakeMonorepo(t *testing.T, envBody string) string {
+	t.Helper()
+	dir := t.TempDir()
+	wsDir := filepath.Join(dir, "workspace-server")
+	if err := os.MkdirAll(wsDir, 0o755); err != nil {
+		t.Fatal(err)
+	}
+	if err := os.WriteFile(filepath.Join(wsDir, "go.mod"), []byte("module fake\n"), 0o644); err != nil {
+		t.Fatal(err)
+	}
+	if err := os.WriteFile(filepath.Join(dir, ".env"), []byte(envBody), 0o644); err != nil {
+		t.Fatalf("write .env: %v", err)
+	}
+	return dir
+}
+
+func TestLoadDotEnvIfPresent_PreservesExisting(t *testing.T) {
+	dir := makeFakeMonorepo(t, "DOTENV_TEST_NEW=from_file\nDOTENV_TEST_EXISTING=from_file\n")
+
+	// Pre-set one of the keys — file value must NOT clobber it.
+	t.Setenv("DOTENV_TEST_EXISTING", "from_real_env")
+	// Ensure the other key starts unset.
+	os.Unsetenv("DOTENV_TEST_NEW")
+	t.Cleanup(func() { os.Unsetenv("DOTENV_TEST_NEW") })
+
+	// Run from the temp dir so findDotEnv picks our fixture.
+	prev, err := os.Getwd()
+	if err != nil {
+		t.Fatal(err)
+	}
+	if err := os.Chdir(dir); err != nil {
+		t.Fatal(err)
+	}
+	t.Cleanup(func() { _ = os.Chdir(prev) })
+
+	loadDotEnvIfPresent()
+
+	if got := os.Getenv("DOTENV_TEST_NEW"); got != "from_file" {
+		t.Errorf("DOTENV_TEST_NEW = %q, want %q", got, "from_file")
+	}
+	if got := os.Getenv("DOTENV_TEST_EXISTING"); got != "from_real_env" {
+		t.Errorf("existing env clobbered: got %q, want %q", got, "from_real_env")
+	}
+}
+
+func TestLoadDotEnvIfPresent_NoFile_NoOp(t *testing.T) {
+	dir := t.TempDir() // empty — no .env at this level
+	prev, err := os.Getwd()
+	if err != nil {
+		t.Fatal(err)
+	}
+	if err := os.Chdir(dir); err != nil {
+		t.Fatal(err)
+	}
+	t.Cleanup(func() { _ = os.Chdir(prev) })
+
+	// Should not panic, log loud errors, or set anything. Best-effort
+	// silent miss is the contract.
+	loadDotEnvIfPresent()
+}
+
+func TestFindDotEnv_WalksUpward(t *testing.T) {
+	root := makeFakeMonorepo(t, "X=1\n")
+	nested := filepath.Join(root, "a", "b", "c")
+	if err := os.MkdirAll(nested, 0o755); err != nil {
+		t.Fatal(err)
+	}
+
+	prev, err := os.Getwd()
+	if err != nil {
+		t.Fatal(err)
+	}
+	if err := os.Chdir(nested); err != nil {
+		t.Fatal(err)
+	}
+	t.Cleanup(func() { _ = os.Chdir(prev) })
+
+	got, ok := findDotEnv()
+	if !ok {
+		t.Fatal("expected to find .env walking upward")
+	}
+	want := filepath.Join(root, ".env")
+	// macOS resolves /var → /private/var on TempDir, so compare via
+	// EvalSymlinks for both sides to dodge that.
+	gotR, _ := filepath.EvalSymlinks(got)
+	wantR, _ := filepath.EvalSymlinks(want)
+	if gotR != wantR {
+		t.Errorf("findDotEnv() = %q, want %q", got, want)
+	}
+}
+
+func TestFindDotEnv_RejectsUnrelatedDotEnv(t *testing.T) {
+	// Simulates a developer running the binary from inside an
+	// unrelated project tree that happens to have its own .env (or
+	// from $HOME with a personal ~/.env). Without the monorepo
+	// sentinel, findDotEnv would happily load it and clobber env
+	// with arbitrary values — a real foot-gun this regression test
+	// guards against.
+	dir := t.TempDir()
+	if err := os.WriteFile(filepath.Join(dir, ".env"), []byte("LEAKY=value\n"), 0o644); err != nil {
+		t.Fatal(err)
+	}
+
+	prev, err := os.Getwd()
+	if err != nil {
+		t.Fatal(err)
+	}
+	if err := os.Chdir(dir); err != nil {
+		t.Fatal(err)
+	}
+	t.Cleanup(func() { _ = os.Chdir(prev) })
+
+	if got, ok := findDotEnv(); ok {
+		t.Errorf("findDotEnv() = %q, ok=true; want ok=false (no workspace-server sibling)", got)
+	}
+}
diff --git a/workspace-server/cmd/server/main.go b/workspace-server/cmd/server/main.go
index c1676e29..3805452b 100644
--- a/workspace-server/cmd/server/main.go
+++ b/workspace-server/cmd/server/main.go
@@ -33,6 +33,14 @@ import (
 )
 
 func main() {
+	// .env auto-load: in dev, the operator keeps MOLECULE_ENV /
+	// DATABASE_URL / etc. in the monorepo's .env file. Loading it here
+	// — before any code reads env — means a fresh `/tmp/molecule-server`
+	// run picks up dev config without `set -a && source .env`. No-op
+	// in production (Docker image doesn't ship a .env, and existing env
+	// always wins over file values, so container env stays dominant).
+	loadDotEnvIfPresent()
+
 	// CP self-refresh: pull any operator-rotated config (e.g. a new
 	// MOLECULE_CP_SHARED_SECRET) before any other code reads env.
 	// Best-effort — if the CP is unreachable we keep booting with the
@@ -221,6 +229,18 @@ func main() {
 		})
 	}
 
+	// Orphan-container reconcile sweep — finds running containers
+	// whose workspace row is already status='removed' and stops
+	// them. Defence in depth on top of the inline cleanup in
+	// handlers/workspace_crud.go: any Docker hiccup that left a
+	// container alive after the user clicked delete heals on the
+	// next sweep instead of leaking forever.
+	if prov != nil {
+		go supervised.RunWithRecover(ctx, "orphan-sweeper", func(c context.Context) {
+			registry.StartOrphanSweeper(c, prov)
+		})
+	}
+
 	// Provision-timeout sweep — flips workspaces that have been stuck in
 	// status='provisioning' past the timeout window to 'failed' and emits
 	// WORKSPACE_PROVISION_TIMEOUT. Without this the UI banner is cosmetic
diff --git a/workspace-server/internal/events/broadcaster.go b/workspace-server/internal/events/broadcaster.go
index 514d9781..53427010 100644
--- a/workspace-server/internal/events/broadcaster.go
+++ b/workspace-server/internal/events/broadcaster.go
@@ -15,6 +15,29 @@ import (
 
 const broadcastChannel = "events:broadcast"
 
+// EventEmitter is the contract handler code needs from a broadcaster.
+// Defining it here lets tests substitute a capture-only stub instead
+// of standing up the full Redis + WebSocket hub topology that the
+// concrete *Broadcaster builds (and that previously blocked
+// TestProvisionWorkspace_* regression tests on issue #1814).
+//
+// Includes BroadcastOnly because the activity-log + A2A-response paths
+// inside the handler package fan out via that method — narrowing
+// further would force production callers back to the concrete type.
+//
+// *Broadcaster satisfies this interface trivially. Production code that
+// needs the wider surface (SubscribeSSE, Subscribe) keeps using the
+// concrete *Broadcaster type — sse.go + cmd/server/main.go are the
+// only such call sites today.
+type EventEmitter interface {
+	RecordAndBroadcast(ctx context.Context, eventType string, workspaceID string, payload interface{}) error
+	BroadcastOnly(workspaceID string, eventType string, payload interface{})
+}
+
+// Compile-time assertion: a renamed/reshaped Broadcaster method that
+// silently broke this interface would fail to build here.
+var _ EventEmitter = (*Broadcaster)(nil)
+
 // sseSubscription is a single in-process SSE subscriber.
 // deliverToSSE writes to ch; StreamEvents reads from it.
 type sseSubscription struct {
diff --git a/workspace-server/internal/handlers/a2a_proxy.go b/workspace-server/internal/handlers/a2a_proxy.go
index 13c46641..7ff618a1 100644
--- a/workspace-server/internal/handlers/a2a_proxy.go
+++ b/workspace-server/internal/handlers/a2a_proxy.go
@@ -20,6 +20,7 @@ import (
 	"time"
 
 	"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
+	"github.com/Molecule-AI/molecule-monorepo/platform/internal/events"
 	"github.com/Molecule-AI/molecule-monorepo/platform/internal/provisioner"
 	"github.com/Molecule-AI/molecule-monorepo/platform/internal/registry"
 	"github.com/gin-gonic/gin"
@@ -120,18 +121,26 @@ func isUpstreamBusyError(err error) bool {
 	if err == nil {
 		return false
 	}
+	// Typed sentinels propagate cleanly through *url.Error.Unwrap
+	// since Go 1.13, so errors.Is is the primary check for both
+	// DeadlineExceeded and Canceled. The substring fallbacks below
+	// stay only for shapes net/http does NOT type — bare "EOF" /
+	// "connection reset" can arrive as plain *net.OpError with no
+	// errors.Is hook to the stdlib sentinels.
 	if errors.Is(err, context.DeadlineExceeded) {
 		return true
 	}
+	// applyIdleTimeout uses context.WithCancel; surfaces here as
+	// Canceled, distinct from DeadlineExceeded but the same "upstream
+	// busy" class — caller produces a 503 + Retry-After.
+	if errors.Is(err, context.Canceled) {
+		return true
+	}
 	if errors.Is(err, io.EOF) || errors.Is(err, io.ErrUnexpectedEOF) {
 		return true
 	}
-	// url.Error wraps "read tcp … EOF" and "Post …: context deadline
-	// exceeded" strings from the stdlib HTTP client without typing the
-	// inner cause. Fall back to substring match for those.
 	msg := err.Error()
-	return strings.Contains(msg, "context deadline exceeded") ||
-		strings.Contains(msg, "EOF") ||
+	return strings.Contains(msg, "EOF") ||
 		strings.Contains(msg, "connection reset")
 }
 
@@ -286,7 +295,7 @@ func (h *WorkspaceHandler) proxyA2ARequest(ctx context.Context, workspaceID stri
 	body = normalizedBody
 
 	startTime := time.Now()
-	resp, cancelFwd, err := h.dispatchA2A(ctx, agentURL, body, callerID)
+	resp, cancelFwd, err := h.dispatchA2A(ctx, workspaceID, agentURL, body, callerID)
 	if cancelFwd != nil {
 		defer cancelFwd()
 	}
@@ -478,25 +487,80 @@ func normalizeA2APayload(body []byte) ([]byte, string, *proxyA2AError) {
 	return marshaledBody, a2aMethod, nil
 }
 
+// idleTimeoutDuration is the per-dispatch silence window: if the
+// platform's broadcaster emits no events for this workspace for the
+// full duration, the dispatch ctx is cancelled. Resets on every
+// ACTIVITY_LOGGED / TASK_UPDATED / A2A_RESPONSE event for the
+// workspace, so a chat that's actively reporting tool calls or
+// streaming status updates never trips it. Picked to be longer than
+// any reasonable single-tool-use cadence (Claude Code's slowest
+// observed silence between tools is ~30s) but short enough that a
+// truly wedged runtime fails in 1 minute, not 5.
+const idleTimeoutDuration = 60 * time.Second
+
 // dispatchA2A POSTs `body` to `agentURL`. Uses WithoutCancel so delegation
-// chains survive client disconnect (browser tab close). Default timeouts:
-// canvas (callerID == "") = 5 min, agent-to-agent = 30 min. Callers can
-// override via the X-Timeout header (applied to ctx upstream in ProxyA2A).
-func (h *WorkspaceHandler) dispatchA2A(ctx context.Context, agentURL string, body []byte, callerID string) (*http.Response, context.CancelFunc, error) {
+// chains survive client disconnect (browser tab close). Two layers of
+// timeout per dispatch:
+//
+//   - Idle timeout (always applied): cancels the dispatch when no
+//     broadcaster events for the workspace fire for
+//     idleTimeoutDuration. Any progress event resets the clock — so
+//     a long but actively-streaming reply runs forever, while a
+//     wedged runtime fails fast.
+//   - Absolute ceiling (agent-to-agent only): 30 min cap as a
+//     defence against runaway delegation loops. Canvas dispatches
+//     have no absolute ceiling — the user can wait as long as they
+//     want, the idle timer is the only hangup signal.
+//
+// Either layer is overridable by the X-Timeout header upstream in
+// ProxyA2A; X-Timeout: 0 explicitly disables the absolute ceiling.
+func (h *WorkspaceHandler) dispatchA2A(ctx context.Context, workspaceID, agentURL string, body []byte, callerID string) (*http.Response, context.CancelFunc, error) {
+	// #1483 SSRF defense-in-depth: the primary call path through
+	// proxyA2ARequest → resolveAgentURL already validates via isSafeURL
+	// (a2a_proxy.go:424), but adding the check here closes the gap for
+	// any future code path that calls dispatchA2A directly without
+	// going through resolveAgentURL. Wrapping as proxyDispatchBuildError
+	// keeps the caller's error-classification path unchanged — the same
+	// shape it already produces a 500 for.
+	if err := isSafeURL(agentURL); err != nil {
+		return nil, nil, &proxyDispatchBuildError{err: err}
+	}
 	forwardCtx := context.WithoutCancel(ctx)
-	var cancel context.CancelFunc
+	var ceilingCancel context.CancelFunc
 	if _, hasDeadline := ctx.Deadline(); !hasDeadline {
-		if callerID == "" {
-			forwardCtx, cancel = context.WithTimeout(forwardCtx, 5*time.Minute)
-		} else {
-			forwardCtx, cancel = context.WithTimeout(forwardCtx, 30*time.Minute)
+		if callerID != "" {
+			forwardCtx, ceilingCancel = context.WithTimeout(forwardCtx, 30*time.Minute)
+		}
+		// callerID == "" (canvas): no absolute ceiling. The idle
+		// timeout below is the only deadline.
+	}
+	// Idle timeout — cancels the dispatch ctx after
+	// idleTimeoutDuration of broadcaster silence for this workspace.
+	// Always applied (canvas + agent-to-agent both benefit; the
+	// ceiling above is a separate runaway-loop cap that only fires
+	// for agent traffic). Combines with the ceiling cancel into a
+	// single returned cancel func that the caller defers.
+	// applyIdleTimeout needs SubscribeSSE which only lives on the
+	// concrete *Broadcaster, not on the EventEmitter interface the
+	// handler now stores. Type-assert + fall through to a no-op idle
+	// timer if the broadcaster doesn't support subscriptions (the
+	// EventEmitter mock used by some tests, e.g.). Production wires
+	// the concrete *Broadcaster, so the assertion always succeeds in
+	// real deploys.
+	var b *events.Broadcaster
+	if concrete, ok := h.broadcaster.(*events.Broadcaster); ok {
+		b = concrete
+	}
+	forwardCtx, idleCancel := applyIdleTimeout(forwardCtx, b, workspaceID, idleTimeoutDuration)
+	cancel := func() {
+		idleCancel()
+		if ceilingCancel != nil {
+			ceilingCancel()
 		}
 	}
 	req, err := http.NewRequestWithContext(forwardCtx, "POST", agentURL, bytes.NewReader(body))
 	if err != nil {
-		if cancel != nil {
-			cancel()
-		}
+		cancel()
 		// Wrap the construction failure so the caller can distinguish it
 		// from an upstream Do() error and produce the correct 500 response.
 		return nil, nil, &proxyDispatchBuildError{err: err}
@@ -505,3 +569,52 @@ func (h *WorkspaceHandler) dispatchA2A(ctx context.Context, agentURL string, bod
 	resp, doErr := a2aClient.Do(req)
 	return resp, cancel, doErr
 }
+
+// applyIdleTimeout returns a child ctx that gets cancelled when no
+// broadcaster events for `workspaceID` arrive for `idle` duration.
+// Any incoming event resets the clock. The returned cancel func
+// MUST be called to clean up the goroutine + subscription.
+//
+// nil broadcaster or non-positive idle returns the parent ctx
+// unchanged (and a no-op cancel) so test paths that don't wire a
+// broadcaster keep working.
+func applyIdleTimeout(parent context.Context, b *events.Broadcaster, workspaceID string, idle time.Duration) (context.Context, context.CancelFunc) {
+	if b == nil || idle <= 0 || workspaceID == "" {
+		return parent, func() {}
+	}
+	ctx, cancel := context.WithCancel(parent)
+	sub, unsub := b.SubscribeSSE(workspaceID)
+	go func() {
+		defer unsub()
+		timer := time.NewTimer(idle)
+		defer timer.Stop()
+		for {
+			select {
+			case <-ctx.Done():
+				return
+			case _, ok := <-sub:
+				if !ok {
+					// Subscription channel closed — fall back to
+					// pure-timer mode. Don't cancel: another caller
+					// may have closed our sub but the request itself
+					// is still in flight. Let the timer or the
+					// caller's defer drive cleanup.
+					continue
+				}
+				// Stop+drain pattern so a fired-but-unread timer
+				// doesn't double-cancel after the Reset.
+				if !timer.Stop() {
+					select {
+					case <-timer.C:
+					default:
+					}
+				}
+				timer.Reset(idle)
+			case <-timer.C:
+				cancel()
+				return
+			}
+		}
+	}()
+	return ctx, cancel
+}
diff --git a/workspace-server/internal/handlers/a2a_proxy_test.go b/workspace-server/internal/handlers/a2a_proxy_test.go
index dcad98e2..ff5b6968 100644
--- a/workspace-server/internal/handlers/a2a_proxy_test.go
+++ b/workspace-server/internal/handlers/a2a_proxy_test.go
@@ -5,6 +5,7 @@ import (
 	"context"
 	"database/sql"
 	"encoding/json"
+	"errors"
 	"fmt"
 	"io"
 	"net/http"
@@ -600,9 +601,21 @@ func TestIsUpstreamBusyError(t *testing.T) {
 	}{
 		{"nil", nil, false},
 		{"context.DeadlineExceeded", context.DeadlineExceeded, true},
+		// applyIdleTimeout cancels its child ctx via context.WithCancel
+		// when the broadcaster silence window elapses — surfaces here
+		// as context.Canceled. Same "upstream busy" classification.
+		{"context.Canceled", context.Canceled, true},
+		{"wrapped context.Canceled", fmt.Errorf("dispatch wrapped: %w", context.Canceled), true},
 		{"io.EOF", io.EOF, true},
 		{"io.ErrUnexpectedEOF", io.ErrUnexpectedEOF, true},
-		{"wrapped context deadline string", fmt.Errorf(`Post "http://ws-foo:8000": context deadline exceeded`), true},
+		// Real net/http wraps context.DeadlineExceeded via *url.Error.Unwrap,
+		// so errors.Is(err, context.DeadlineExceeded) catches it. The
+		// pre-892de784 substring "context deadline exceeded" fallback
+		// also accepted a string-only error like
+		// `fmt.Errorf("Post: context deadline exceeded")`; that fallback
+		// was dropped because errors.Is handles the real shape and the
+		// substring was indistinguishable from a user-content match.
+		{"wrapped context deadline (errors.Is path)", fmt.Errorf("Post: %w", context.DeadlineExceeded), true},
 		{"wrapped EOF string", fmt.Errorf(`Post "http://ws-foo:8000": EOF`), true},
 		{"connection reset", fmt.Errorf("read tcp 127.0.0.1:8080->127.0.0.1:12345: connection reset by peer"), true},
 		{"generic dns error", fmt.Errorf("no such host"), false},
@@ -1074,7 +1087,7 @@ func TestDispatchA2A_BuildRequestError(t *testing.T) {
 	handler := NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir())
 
 	// Malformed URL causes http.NewRequestWithContext to fail.
-	_, cancel, err := handler.dispatchA2A(context.Background(), "http://%%badhost", []byte("{}"), "")
+	_, cancel, err := handler.dispatchA2A(context.Background(), "ws-target", "http://%%badhost", []byte("{}"), "")
 	if cancel != nil {
 		cancel()
 	}
@@ -1097,13 +1110,13 @@ func TestDispatchA2A_CanvasTimeout(t *testing.T) {
 	}))
 	defer srv.Close()
 
-	resp, cancel, err := handler.dispatchA2A(context.Background(), srv.URL, []byte(`{}`), "")
+	resp, cancel, err := handler.dispatchA2A(context.Background(), "ws-target", srv.URL, []byte(`{}`), "")
 	if err != nil {
 		t.Fatalf("unexpected error: %v", err)
 	}
 	defer resp.Body.Close()
 	if cancel == nil {
-		t.Fatal("canvas caller (empty callerID) must set a timeout + return cancel")
+		t.Fatal("canvas caller must return a cancel func (idle-timeout cleanup)")
 	}
 	cancel() // restore
 }
@@ -1118,20 +1131,23 @@ func TestDispatchA2A_AgentTimeout(t *testing.T) {
 	}))
 	defer srv.Close()
 
-	resp, cancel, err := handler.dispatchA2A(context.Background(), srv.URL, []byte(`{}`), "ws-caller")
+	resp, cancel, err := handler.dispatchA2A(context.Background(), "ws-target", srv.URL, []byte(`{}`), "ws-caller")
 	if err != nil {
 		t.Fatalf("unexpected error: %v", err)
 	}
 	defer resp.Body.Close()
 	if cancel == nil {
-		t.Fatal("agent-to-agent caller must set a timeout + return cancel")
+		t.Fatal("agent-to-agent caller must return a cancel func (idle + ceiling cleanup)")
 	}
 	cancel()
 }
 
-func TestDispatchA2A_ContextDeadline_NoCancelAdded(t *testing.T) {
-	// When ctx already has a deadline, dispatchA2A must NOT layer its own
-	// timeout (cancel should be nil).
+func TestDispatchA2A_ContextDeadline_NoExtraCeiling(t *testing.T) {
+	// When ctx already has a deadline, dispatchA2A must not layer
+	// its own absolute ceiling on top — the caller's deadline wins.
+	// The idle-timer cleanup still produces a non-nil cancel func
+	// (introduced by the always-on idle timeout) but the cancel func
+	// is safe to call repeatedly and from a deferred path.
 	setupTestDB(t)
 	setupTestRedis(t)
 	handler := NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir())
@@ -1144,17 +1160,134 @@ func TestDispatchA2A_ContextDeadline_NoCancelAdded(t *testing.T) {
 	ctx, ctxCancel := context.WithTimeout(context.Background(), 5*time.Second)
 	defer ctxCancel()
 
-	resp, cancel, err := handler.dispatchA2A(ctx, srv.URL, []byte(`{}`), "")
+	resp, cancel, err := handler.dispatchA2A(ctx, "ws-target", srv.URL, []byte(`{}`), "")
 	if err != nil {
 		t.Fatalf("unexpected error: %v", err)
 	}
 	defer resp.Body.Close()
-	if cancel != nil {
-		t.Error("cancel should be nil when ctx already has a deadline")
-		cancel()
+	if cancel == nil {
+		t.Error("cancel must be non-nil (idle-timer cleanup)")
 	}
 }
 
+// --- applyIdleTimeout ---
+
+// TestApplyIdleTimeout_FiresOnSilence verifies the helper cancels its
+// child ctx when no broadcaster events arrive for `idle` duration.
+// Uses a short idle window (60ms) so the test runs fast.
+func TestApplyIdleTimeout_FiresOnSilence(t *testing.T) {
+	setupTestDB(t)
+	setupTestRedis(t)
+	b := newTestBroadcaster()
+
+	parent, parentCancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer parentCancel()
+
+	idleCtx, idleCancel := applyIdleTimeout(parent, b, "ws-silent", 60*time.Millisecond)
+	defer idleCancel()
+
+	select {
+	case <-idleCtx.Done():
+		// expected — no events ever arrived for ws-silent
+	case <-time.After(2 * time.Second):
+		t.Fatal("idleCtx never cancelled despite no events")
+	}
+	if !errors.Is(idleCtx.Err(), context.Canceled) {
+		t.Errorf("idleCtx err = %v, want context.Canceled", idleCtx.Err())
+	}
+}
+
+// TestApplyIdleTimeout_ResetsOnEvent verifies that a broadcaster event
+// for the workspace resets the timer. Sends one event mid-window and
+// confirms ctx is still alive after the original deadline would have
+// fired, but cancelled after a second silence window elapses.
+func TestApplyIdleTimeout_ResetsOnEvent(t *testing.T) {
+	setupTestDB(t)
+	setupTestRedis(t)
+	b := newTestBroadcaster()
+
+	parent, parentCancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer parentCancel()
+
+	idle := 80 * time.Millisecond
+	idleCtx, idleCancel := applyIdleTimeout(parent, b, "ws-active", idle)
+	defer idleCancel()
+
+	// Send a progress event halfway through the window — should
+	// extend the deadline by another `idle`.
+	time.Sleep(idle / 2)
+	b.BroadcastOnly("ws-active", "ACTIVITY_LOGGED", map[string]interface{}{"activity_type": "agent_log"})
+
+	// At t = idle (original deadline), ctx must still be alive
+	// because the event reset the clock.
+	select {
+	case <-idleCtx.Done():
+		t.Fatal("idleCtx cancelled despite mid-window event resetting the timer")
+	case <-time.After(idle - (idle / 2) + 10*time.Millisecond):
+		// ok — past the original deadline, still alive
+	}
+
+	// Now wait for the second silence window to actually fire.
+	select {
+	case <-idleCtx.Done():
+		// expected
+	case <-time.After(idle + 200*time.Millisecond):
+		t.Fatal("idleCtx never cancelled after the second silence window")
+	}
+}
+
+// TestApplyIdleTimeout_NilBroadcasterDegradesGracefully — nil
+// broadcaster (some test paths) returns the parent ctx unchanged.
+func TestApplyIdleTimeout_NilBroadcasterDegradesGracefully(t *testing.T) {
+	parent := context.Background()
+	idleCtx, cancel := applyIdleTimeout(parent, nil, "ws-x", 50*time.Millisecond)
+	defer cancel()
+	if idleCtx != parent {
+		t.Error("nil broadcaster must return the parent ctx unchanged")
+	}
+	// And calling cancel must be safe.
+	cancel()
+}
+
+// TestDispatchA2A_RejectsUnsafeURL is the #1483 defense-in-depth
+// regression. setupTestDB disables SSRF for normal tests so existing
+// dispatchA2A unit tests can hit httptest.NewServer (loopback) — we
+// re-enable it here to verify the new in-function isSafeURL guard.
+// Production callers go through resolveAgentURL which already
+// validates; this test pins that dispatchA2A is now safe even when
+// called directly by a future caller that skips resolveAgentURL.
+//
+// Note: dispatchA2A's signature includes workspaceID (added by the
+// idle-timeout work) so this test passes a stub value — the SSRF check
+// fires before workspaceID is referenced.
+func TestDispatchA2A_RejectsUnsafeURL(t *testing.T) {
+	setupTestDB(t)
+	setupTestRedis(t)
+	restoreSSRF := setSSRFCheckForTest(true)
+	t.Cleanup(restoreSSRF)
+	handler := NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir())
+
+	// Cloud metadata IP — must be rejected before any HTTP call goes out.
+	_, cancel, err := handler.dispatchA2A(
+		context.Background(),
+		"ws-target",
+		"http://169.254.169.254/latest/meta-data/",
+		[]byte(`{}`),
+		"",
+	)
+	if cancel != nil {
+		cancel()
+		t.Error("cancel must be nil when the URL is rejected pre-request")
+	}
+	if err == nil {
+		t.Fatal("expected SSRF rejection error, got nil")
+	}
+	if _, ok := err.(*proxyDispatchBuildError); !ok {
+		t.Errorf("expected *proxyDispatchBuildError (caller maps to 500), got %T: %v", err, err)
+	}
+}
+
+
 // --- handleA2ADispatchError ---
 
 func TestHandleA2ADispatchError_ContextDeadline(t *testing.T) {
diff --git a/workspace-server/internal/handlers/a2a_queue.go b/workspace-server/internal/handlers/a2a_queue.go
index b747fac4..fb0e3b80 100644
--- a/workspace-server/internal/handlers/a2a_queue.go
+++ b/workspace-server/internal/handlers/a2a_queue.go
@@ -288,7 +288,7 @@ func (h *WorkspaceHandler) DrainQueueForWorkspace(ctx context.Context, workspace
 	}
 	// logActivity=false: the original EnqueueA2A callsite already logged
 	// the dispatch attempt; re-logging here would double-count events.
-	status, _, proxyErr := h.proxyA2ARequest(ctx, workspaceID, item.Body, callerID, false)
+	status, respBody, proxyErr := h.proxyA2ARequest(ctx, workspaceID, item.Body, callerID, false)
 
 	// 202 Accepted = the dispatch was itself queued again (target still busy).
 	// That's not a failure — the queued item just stays queued naturally on
@@ -321,4 +321,89 @@ func (h *WorkspaceHandler) DrainQueueForWorkspace(ctx context.Context, workspace
 	MarkQueueItemCompleted(ctx, item.ID)
 	log.Printf("A2AQueue drain: dispatched %s to workspace %s (attempt=%d)",
 		item.ID, workspaceID, item.Attempts)
+
+	// Stitch the response back to the originating delegation row, if this
+	// queue item was a delegation. Without this, check_task_status would
+	// see status='queued' (set by the executeDelegation queued-branch) and
+	// the LLM would think the work was never done. We embed delegation_id
+	// in params.message.metadata at Delegate-handler time; pull it out
+	// here and UPDATE the delegate_result row so the original caller can
+	// observe the real reply.
+	if delegationID := extractDelegationIDFromBody(item.Body); delegationID != "" {
+		h.stitchDrainResponseToDelegation(ctx, callerID, item.WorkspaceID, delegationID, respBody)
+	}
+}
+
+// extractDelegationIDFromBody pulls params.message.metadata.delegation_id
+// out of an A2A JSON-RPC body. Empty string when absent — drain treats
+// that as "this queue item didn't originate from /workspaces/:id/delegate"
+// and skips the stitch, so non-delegation queue uses (cross-workspace
+// peer-direct A2A) aren't affected.
+func extractDelegationIDFromBody(body []byte) string {
+	var envelope struct {
+		Params struct {
+			Message struct {
+				Metadata struct {
+					DelegationID string `json:"delegation_id"`
+				} `json:"metadata"`
+			} `json:"message"`
+		} `json:"params"`
+	}
+	if err := json.Unmarshal(body, &envelope); err != nil {
+		return ""
+	}
+	return envelope.Params.Message.Metadata.DelegationID
+}
+
+// stitchDrainResponseToDelegation writes the drained response into the
+// delegation's existing delegate_result row (created with status='queued'
+// by executeDelegation when the proxy first returned queued). This is the
+// other half of the loop that closes "queued → completed" so the LLM's
+// check_task_status reflects ground truth.
+//
+// Errors are logged-only — drain is fire-and-forget from Heartbeat, and a
+// stitch failure shouldn't block other queued items. The delegation will
+// just remain stuck at 'queued' in this case, which is the pre-fix
+// behaviour (no regression vs. shipping nothing).
+func (h *WorkspaceHandler) stitchDrainResponseToDelegation(ctx context.Context, sourceID, targetID, delegationID string, respBody []byte) {
+	if sourceID == "" || delegationID == "" {
+		return
+	}
+	responseText := extractResponseText(respBody)
+	respJSON, _ := json.Marshal(map[string]interface{}{
+		"text":          responseText,
+		"delegation_id": delegationID,
+	})
+	res, err := db.DB.ExecContext(ctx, `
+		UPDATE activity_logs
+		   SET status        = 'completed',
+		       summary       = $1,
+		       response_body = $2::jsonb
+		 WHERE workspace_id   = $3
+		   AND method         = 'delegate_result'
+		   AND target_id      = $4
+		   AND response_body->>'delegation_id' = $5
+	`, "Delegation completed ("+truncate(responseText, 80)+")", string(respJSON),
+		sourceID, targetID, delegationID)
+	if err != nil {
+		log.Printf("A2AQueue drain stitch: update failed for delegation %s: %v", delegationID, err)
+		return
+	}
+	if rows, _ := res.RowsAffected(); rows == 0 {
+		log.Printf("A2AQueue drain stitch: no delegate_result row for delegation %s (queued-row may not exist yet)", delegationID)
+		return
+	}
+	log.Printf("A2AQueue drain stitch: delegation %s queued → completed (%d chars)", delegationID, len(responseText))
+
+	// Broadcast DELEGATION_COMPLETE so the canvas chat feed flips the
+	// "⏸ queued" line to "✓ completed" in real time. Without this the
+	// transition only surfaces after the user reloads or polls activity.
+	if h.broadcaster != nil {
+		h.broadcaster.RecordAndBroadcast(ctx, "DELEGATION_COMPLETE", sourceID, map[string]interface{}{
+			"delegation_id":    delegationID,
+			"target_id":        targetID,
+			"response_preview": truncate(responseText, 200),
+			"via":              "queue_drain",
+		})
+	}
 }
diff --git a/workspace-server/internal/handlers/a2a_queue_test.go b/workspace-server/internal/handlers/a2a_queue_test.go
index 940f7f2f..57000910 100644
--- a/workspace-server/internal/handlers/a2a_queue_test.go
+++ b/workspace-server/internal/handlers/a2a_queue_test.go
@@ -80,6 +80,39 @@ func TestExtractIdempotencyKey_emptyOnMissing(t *testing.T) {
 	}
 }
 
+func TestExtractDelegationIDFromBody(t *testing.T) {
+	cases := []struct {
+		name string
+		body string
+		want string
+	}{
+		{
+			name: "delegation body — metadata.delegation_id present",
+			body: `{"method":"message/send","params":{"message":{"role":"user","messageId":"abc-123","metadata":{"delegation_id":"abc-123"},"parts":[{"type":"text","text":"hi"}]}}}`,
+			want: "abc-123",
+		},
+		{
+			name: "non-delegation body — no metadata (peer-direct A2A)",
+			body: `{"method":"message/send","params":{"message":{"role":"user","messageId":"m-1","parts":[{"type":"text","text":"hi"}]}}}`,
+			want: "",
+		},
+		{
+			name: "metadata present but no delegation_id key",
+			body: `{"params":{"message":{"metadata":{"trace_id":"t-1"}}}}`,
+			want: "",
+		},
+		{"malformed JSON", `not json`, ""},
+		{"empty body", ``, ""},
+	}
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			if got := extractDelegationIDFromBody([]byte(tc.body)); got != tc.want {
+				t.Errorf("extractDelegationIDFromBody = %q, want %q", got, tc.want)
+			}
+		})
+	}
+}
+
 // ──────────────────────────────────────────────────────────────────────────────
 // DrainQueueForWorkspace — nil-safe error extraction regression tests
 //
diff --git a/workspace-server/internal/handlers/activity.go b/workspace-server/internal/handlers/activity.go
index 4d98e9fa..ba6d9f0f 100644
--- a/workspace-server/internal/handlers/activity.go
+++ b/workspace-server/internal/handlers/activity.go
@@ -286,6 +286,37 @@ func (h *ActivityHandler) Notify(c *gin.Context) {
 		"name":         wsName,
 	})
 
+	// Persist to activity_logs so the chat history loader restores this
+	// message after a page reload. Pre-fix, send_message_to_user pushes
+	// were broadcast-only — survived the WebSocket session but vanished
+	// when the user refreshed because nothing wrote them to the DB.
+	//
+	// Shape chosen to match the existing loader query
+	// (`type=a2a_receive&source=canvas`):
+	//   - activity_type='a2a_receive' so it joins the same query path
+	//   - source_id=NULL so the canvas-source filter accepts it
+	//   - method='notify' to distinguish from real A2A receives in audits
+	//   - request_body=NULL so the loader doesn't append a duplicate
+	//     "user message" bubble for it
+	//   - response_body={"result": "<text>"} matches extractResponseText's
+	//     simplest branch ({result: string} → take verbatim)
+	//
+	// Errors are logged-only — broadcast already succeeded, the user
+	// sees the message; persistence failure just means the message
+	// won't survive reload (pre-fix behavior). Don't fail the whole
+	// notify on a DB hiccup.
+	respJSON, _ := json.Marshal(map[string]interface{}{"result": body.Message})
+	preview := body.Message
+	if len(preview) > 80 {
+		preview = preview[:80] + "…"
+	}
+	if _, err := db.DB.ExecContext(c.Request.Context(), `
+		INSERT INTO activity_logs (workspace_id, activity_type, method, summary, response_body, status)
+		VALUES ($1, 'a2a_receive', 'notify', $2, $3::jsonb, 'ok')
+	`, workspaceID, "Agent message: "+preview, string(respJSON)); err != nil {
+		log.Printf("Notify: failed to persist message for %s: %v", workspaceID, err)
+	}
+
 	c.JSON(http.StatusOK, gin.H{"status": "sent"})
 }
 
@@ -373,7 +404,9 @@ func (h *ActivityHandler) Report(c *gin.Context) {
 }
 
 // LogActivity inserts an activity log and optionally broadcasts via WebSocket.
-func LogActivity(ctx context.Context, broadcaster *events.Broadcaster, params ActivityParams) {
+// Takes events.EventEmitter (#1814) so callers passing a stub broadcaster
+// in tests no longer need to construct the full *events.Broadcaster.
+func LogActivity(ctx context.Context, broadcaster events.EventEmitter, params ActivityParams) {
 	reqJSON, reqErr := json.Marshal(params.RequestBody)
 	if reqErr != nil {
 		log.Printf("LogActivity: failed to marshal request_body for %s: %v", params.WorkspaceID, reqErr)
diff --git a/workspace-server/internal/handlers/activity_test.go b/workspace-server/internal/handlers/activity_test.go
index 1780be3b..9cba5873 100644
--- a/workspace-server/internal/handlers/activity_test.go
+++ b/workspace-server/internal/handlers/activity_test.go
@@ -217,6 +217,86 @@ func TestActivityReport_RejectsUnknownType(t *testing.T) {
 	}
 }
 
+func TestNotify_PersistsToActivityLogsForReloadRecovery(t *testing.T) {
+	// Regression guard for the "responses gone on reload" bug. send_message_to_user
+	// pushes (which route through Notify) used to be broadcast-only — they
+	// rendered in the canvas but vanished on page reload because nothing
+	// wrote them to activity_logs. The chat history loader queries
+	// `type=a2a_receive&source=canvas`, so the persisted row must:
+	//   - Use activity_type='a2a_receive' (loader's filter)
+	//   - Have source_id NULL (canvas-source filter)
+	//   - Carry the message text in response_body so extractResponseText
+	//     can reconstruct the agent reply on reload
+	mockDB, mock, _ := sqlmock.New()
+	defer mockDB.Close()
+	db.DB = mockDB
+
+	// Workspace existence check
+	mock.ExpectQuery(`SELECT name FROM workspaces`).
+		WithArgs("ws-notify").
+		WillReturnRows(sqlmock.NewRows([]string{"name"}).AddRow("DD"))
+
+	// Persistence INSERT — verify shape
+	mock.ExpectExec(`INSERT INTO activity_logs`).
+		WithArgs(
+			"ws-notify",
+			sqlmock.AnyArg(), // summary
+			sqlmock.AnyArg(), // response_body JSON
+		).
+		WillReturnResult(sqlmock.NewResult(1, 1))
+
+	broadcaster := newTestBroadcaster()
+	handler := NewActivityHandler(broadcaster)
+
+	gin.SetMode(gin.TestMode)
+	w := httptest.NewRecorder()
+	c, _ := gin.CreateTestContext(w)
+	c.Params = gin.Params{{Key: "id", Value: "ws-notify"}}
+	body := `{"message":"agent reply that arrived after the sync POST timed out"}`
+	c.Request = httptest.NewRequest("POST", "/workspaces/ws-notify/notify", strings.NewReader(body))
+	c.Request.Header.Set("Content-Type", "application/json")
+	handler.Notify(c)
+
+	if w.Code != http.StatusOK {
+		t.Errorf("expected 200, got %d: %s", w.Code, w.Body.String())
+	}
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Errorf("DB expectations not met: %v", err)
+	}
+}
+
+func TestNotify_DBFailure_StillBroadcastsAnd200(t *testing.T) {
+	// Persistence is best-effort — a DB hiccup must NOT block the
+	// WebSocket push (which the user is already seeing in their open
+	// canvas). Pre-fix the WS push always succeeded; we don't want
+	// the new persistence step to regress that path.
+	mockDB, mock, _ := sqlmock.New()
+	defer mockDB.Close()
+	db.DB = mockDB
+
+	mock.ExpectQuery(`SELECT name FROM workspaces`).
+		WithArgs("ws-x").
+		WillReturnRows(sqlmock.NewRows([]string{"name"}).AddRow("DD"))
+	mock.ExpectExec(`INSERT INTO activity_logs`).
+		WillReturnError(fmt.Errorf("simulated db hiccup"))
+
+	broadcaster := newTestBroadcaster()
+	handler := NewActivityHandler(broadcaster)
+
+	gin.SetMode(gin.TestMode)
+	w := httptest.NewRecorder()
+	c, _ := gin.CreateTestContext(w)
+	c.Params = gin.Params{{Key: "id", Value: "ws-x"}}
+	body := `{"message":"hi"}`
+	c.Request = httptest.NewRequest("POST", "/workspaces/ws-x/notify", strings.NewReader(body))
+	c.Request.Header.Set("Content-Type", "application/json")
+	handler.Notify(c)
+
+	if w.Code != http.StatusOK {
+		t.Errorf("DB failure must not break the response; got %d", w.Code)
+	}
+}
+
 // ==================== Direct unit tests for SessionSearch helpers ====================
 
 // --- parseSessionSearchParams ---
diff --git a/workspace-server/internal/handlers/admin_workspace_images.go b/workspace-server/internal/handlers/admin_workspace_images.go
new file mode 100644
index 00000000..147bf8ad
--- /dev/null
+++ b/workspace-server/internal/handlers/admin_workspace_images.go
@@ -0,0 +1,227 @@
+package handlers
+
+import (
+	"context"
+	"encoding/base64"
+	"encoding/json"
+	"fmt"
+	"io"
+	"log"
+	"net/http"
+	"os"
+	"strings"
+	"time"
+
+	"github.com/docker/docker/api/types/container"
+	"github.com/docker/docker/api/types/filters"
+	dockerimage "github.com/docker/docker/api/types/image"
+	dockerclient "github.com/docker/docker/client"
+	"github.com/gin-gonic/gin"
+
+	"github.com/Molecule-AI/molecule-monorepo/platform/internal/provisioner"
+)
+
+// AdminWorkspaceImagesHandler serves POST /admin/workspace-images/refresh — the
+// production-side end of the runtime CD chain. Operators (or post-publish
+// automation) hit this to (1) pull the latest workspace template images from
+// GHCR via the Docker SDK and (2) recreate any running ws-* containers so
+// they adopt the new image. Without this, a freshly-published runtime sits
+// in the registry but containers keep running the old image until the next
+// manual restart.
+//
+// On a SaaS deployment the deploy pipeline already pulls on every release,
+// so the pull step is a no-op there; the recreate step is still the way to
+// make running workspaces adopt the new image without a full host restart.
+//
+// POST /admin/workspace-images/refresh
+//
+//	?runtime=claude-code   (optional; default = all 8 templates)
+//	&recreate=true|false   (default true; false = pull only)
+//
+// Returns JSON {pulled: [...], failed: [...], recreated: [...]}
+type AdminWorkspaceImagesHandler struct {
+	docker *dockerclient.Client
+}
+
+func NewAdminWorkspaceImagesHandler(docker *dockerclient.Client) *AdminWorkspaceImagesHandler {
+	return &AdminWorkspaceImagesHandler{docker: docker}
+}
+
+// allRuntimes is the canonical list mirroring docs/workspace-runtime-package.md.
+// Update both when a new template is added.
+var allRuntimes = []string{
+	"claude-code", "langgraph", "crewai", "autogen",
+	"deepagents", "hermes", "gemini-cli", "openclaw",
+}
+
+type refreshResult struct {
+	Pulled    []string `json:"pulled"`
+	Failed    []string `json:"failed"`
+	Recreated []string `json:"recreated"`
+}
+
+// ghcrAuthHeader returns the base64-encoded JSON auth payload Docker's
+// ImagePull expects in PullOptions.RegistryAuth, or empty string when no
+// GHCR_USER/GHCR_TOKEN env is set (lets public images pull through).
+//
+// The Docker SDK doesn't read ~/.docker/config.json — every authenticated
+// pull needs an explicit RegistryAuth string. Format per the Docker
+// engine API: {"username":"…","password":"…","serveraddress":"ghcr.io"}
+// → base64-encoded JSON with no trailing padding stripped (engine handles
+// either form).
+func ghcrAuthHeader() string {
+	user := strings.TrimSpace(os.Getenv("GHCR_USER"))
+	token := strings.TrimSpace(os.Getenv("GHCR_TOKEN"))
+	if user == "" || token == "" {
+		return ""
+	}
+	payload := map[string]string{
+		"username":      user,
+		"password":      token,
+		"serveraddress": "ghcr.io",
+	}
+	js, err := json.Marshal(payload)
+	if err != nil {
+		// Should be unreachable for a static map[string]string. Log so a
+		// future contributor adding a non-marshallable field notices.
+		log.Printf("workspace-images: failed to marshal GHCR auth: %v", err)
+		return ""
+	}
+	return base64.URLEncoding.EncodeToString(js)
+}
+
+func (h *AdminWorkspaceImagesHandler) Refresh(c *gin.Context) {
+	runtimes := allRuntimes
+	if r := c.Query("runtime"); r != "" {
+		// Accept a single runtime; reject anything not in the canonical list
+		// so a typo doesn't silently no-op.
+		found := false
+		for _, known := range allRuntimes {
+			if known == r {
+				found = true
+				break
+			}
+		}
+		if !found {
+			c.JSON(http.StatusBadRequest, gin.H{
+				"error":          fmt.Sprintf("unknown runtime: %s", r),
+				"known_runtimes": allRuntimes,
+			})
+			return
+		}
+		runtimes = []string{r}
+	}
+	recreate := c.DefaultQuery("recreate", "true") == "true"
+
+	res := refreshResult{Pulled: []string{}, Failed: []string{}, Recreated: []string{}}
+	auth := ghcrAuthHeader()
+
+	// 1. Pull each template image via the Docker SDK. Soft-fail per-runtime
+	//    so one missing image (e.g. unpublished template) doesn't abort
+	//    the others. Each pull's progress stream is drained to completion
+	//    — the engine treats early-close as "abandon", leaving partial
+	//    layers around with no reference.
+	pullCtx, cancel := context.WithTimeout(c.Request.Context(), 5*time.Minute)
+	defer cancel()
+	for _, rt := range runtimes {
+		image := fmt.Sprintf("ghcr.io/molecule-ai/workspace-template-%s:latest", rt)
+		opts := dockerimage.PullOptions{Platform: provisioner.DefaultImagePlatform()}
+		if auth != "" {
+			opts.RegistryAuth = auth
+		}
+		rc, err := h.docker.ImagePull(pullCtx, image, opts)
+		if err != nil {
+			log.Printf("workspace-images/refresh: pull %s failed: %v", rt, err)
+			res.Failed = append(res.Failed, rt)
+			continue
+		}
+		// Drain to completion. We discard progress payload because no
+		// caller renders it; the platform log already records pulled/failed
+		// per runtime. If a future caller wants live progress, decode the
+		// JSON-line stream into events here.
+		if _, err := io.Copy(io.Discard, rc); err != nil {
+			rc.Close()
+			log.Printf("workspace-images/refresh: drain %s failed: %v", rt, err)
+			res.Failed = append(res.Failed, rt)
+			continue
+		}
+		rc.Close()
+		res.Pulled = append(res.Pulled, rt)
+	}
+
+	if !recreate {
+		c.JSON(http.StatusOK, res)
+		return
+	}
+
+	// 2. Find ws-* containers running an image we just pulled. Recreate
+	//    them — kill+remove and let the platform's normal provisioning
+	//    flow re-create on next canvas interaction.
+	listCtx, listCancel := context.WithTimeout(c.Request.Context(), 30*time.Second)
+	defer listCancel()
+	containers, err := h.docker.ContainerList(listCtx, container.ListOptions{
+		All:     true,
+		Filters: filters.NewArgs(filters.Arg("name", "ws-")),
+	})
+	if err != nil {
+		log.Printf("workspace-images/refresh: container list failed: %v", err)
+		c.JSON(http.StatusInternalServerError, gin.H{"error": "container list failed", "partial_result": res})
+		return
+	}
+
+	pulledSet := map[string]struct{}{}
+	for _, rt := range res.Pulled {
+		pulledSet[rt] = struct{}{}
+	}
+	for _, ctr := range containers {
+		// ContainerList's ctr.Image is the *resolved digest* (sha256:…),
+		// not the human-readable tag. Use ContainerInspect to get the
+		// original Config.Image (e.g. "ghcr.io/molecule-ai/workspace-
+		// template-claude-code:latest") so we can match against the
+		// pulled-runtime set. The cost is one extra round-trip per
+		// ws-* container — there are at most 8 typically, so this is
+		// well below any UX threshold.
+		inspectCtx, inspectCancel := context.WithTimeout(c.Request.Context(), 10*time.Second)
+		full, err := h.docker.ContainerInspect(inspectCtx, ctr.ID)
+		inspectCancel()
+		if err != nil {
+			log.Printf("workspace-images/refresh: inspect %s failed: %v", ctr.ID[:12], err)
+			continue
+		}
+		imageRef := ""
+		if full.Config != nil {
+			imageRef = full.Config.Image
+		}
+		matched := ""
+		for rt := range pulledSet {
+			if strings.Contains(imageRef, "workspace-template-"+rt) {
+				matched = rt
+				break
+			}
+		}
+		if matched == "" {
+			continue
+		}
+		name := strings.TrimPrefix(ctr.Names[0], "/")
+		// Remove with force — the workspace will re-provision on the next
+		// canvas interaction. This drops in-flight conversations on the
+		// removed container; document via the response so callers can
+		// schedule the refresh during a quiet window.
+		rmCtx, rmCancel := context.WithTimeout(c.Request.Context(), 30*time.Second)
+		err = h.docker.ContainerRemove(rmCtx, ctr.ID, container.RemoveOptions{Force: true})
+		rmCancel()
+		if err != nil {
+			log.Printf("workspace-images/refresh: remove %s failed: %v", name, err)
+			continue
+		}
+		res.Recreated = append(res.Recreated, name)
+	}
+
+	authStatus := "no GHCR auth (public images only)"
+	if auth != "" {
+		authStatus = "GHCR_USER/GHCR_TOKEN auth"
+	}
+	log.Printf("workspace-images/refresh: pulled=%d failed=%d recreated=%d (%s)",
+		len(res.Pulled), len(res.Failed), len(res.Recreated), authStatus)
+	c.JSON(http.StatusOK, res)
+}
diff --git a/workspace-server/internal/handlers/admin_workspace_images_test.go b/workspace-server/internal/handlers/admin_workspace_images_test.go
new file mode 100644
index 00000000..26e61f95
--- /dev/null
+++ b/workspace-server/internal/handlers/admin_workspace_images_test.go
@@ -0,0 +1,73 @@
+package handlers
+
+import (
+	"encoding/base64"
+	"encoding/json"
+	"testing"
+)
+
+func TestGHCRAuthHeader_NoEnvReturnsEmpty(t *testing.T) {
+	t.Setenv("GHCR_USER", "")
+	t.Setenv("GHCR_TOKEN", "")
+	if got := ghcrAuthHeader(); got != "" {
+		t.Errorf("expected empty (no auth → public-only), got %q", got)
+	}
+}
+
+func TestGHCRAuthHeader_PartialEnvReturnsEmpty(t *testing.T) {
+	// Both must be set — defensive against half-configured env.
+	t.Setenv("GHCR_USER", "alice")
+	t.Setenv("GHCR_TOKEN", "")
+	if got := ghcrAuthHeader(); got != "" {
+		t.Errorf("user-only env should disable auth, got %q", got)
+	}
+	t.Setenv("GHCR_USER", "")
+	t.Setenv("GHCR_TOKEN", "fake-tok-xxx")
+	if got := ghcrAuthHeader(); got != "" {
+		t.Errorf("token-only env should disable auth, got %q", got)
+	}
+}
+
+func TestGHCRAuthHeader_EncodesDockerEnginePayload(t *testing.T) {
+	t.Setenv("GHCR_USER", "alice")
+	t.Setenv("GHCR_TOKEN", "fake-tok-value")
+	got := ghcrAuthHeader()
+	if got == "" {
+		t.Fatal("expected non-empty auth header")
+	}
+	raw, err := base64.URLEncoding.DecodeString(got)
+	if err != nil {
+		t.Fatalf("auth header is not valid base64-url: %v", err)
+	}
+	var payload map[string]string
+	if err := json.Unmarshal(raw, &payload); err != nil {
+		t.Fatalf("decoded auth is not valid JSON: %v (raw=%s)", err, raw)
+	}
+	if payload["username"] != "alice" {
+		t.Errorf("username: got %q, want alice", payload["username"])
+	}
+	if payload["password"] != "fake-tok-value" {
+		t.Errorf("password: got %q, want fake-tok-value", payload["password"])
+	}
+	if payload["serveraddress"] != "ghcr.io" {
+		t.Errorf("serveraddress: got %q, want ghcr.io", payload["serveraddress"])
+	}
+}
+
+func TestGHCRAuthHeader_TrimsWhitespace(t *testing.T) {
+	// .env lines often have trailing newlines or accidental spaces. Without
+	// trimming, a stray space would produce an auth payload the engine
+	// rejects with a confusing 401.
+	t.Setenv("GHCR_USER", "  alice  ")
+	t.Setenv("GHCR_TOKEN", "\tfake-tok-value\n")
+	got := ghcrAuthHeader()
+	raw, _ := base64.URLEncoding.DecodeString(got)
+	var payload map[string]string
+	_ = json.Unmarshal(raw, &payload)
+	if payload["username"] != "alice" {
+		t.Errorf("username not trimmed: got %q", payload["username"])
+	}
+	if payload["password"] != "fake-tok-value" {
+		t.Errorf("password not trimmed: got %q", payload["password"])
+	}
+}
diff --git a/workspace-server/internal/handlers/chat_files.go b/workspace-server/internal/handlers/chat_files.go
new file mode 100644
index 00000000..f8533e28
--- /dev/null
+++ b/workspace-server/internal/handlers/chat_files.go
@@ -0,0 +1,415 @@
+package handlers
+
+// chat_files.go — file upload/download for workspace chat.
+//
+// Split from templates.go because these endpoints have a different
+// security model (no /configs write, no template fallback) and a
+// different wire format (multipart in, binary-stream out). Template
+// files are agent workspace configuration; chat files are user-agent
+// conversation payloads.
+
+import (
+	"archive/tar"
+	"bytes"
+	"context"
+	"crypto/rand"
+	"encoding/hex"
+	"fmt"
+	"io"
+	"log"
+	"mime"
+	"mime/multipart"
+	"net/http"
+	"path/filepath"
+	"regexp"
+	"strings"
+
+	"github.com/docker/docker/api/types/container"
+	"github.com/gin-gonic/gin"
+)
+
+// ChatFilesHandler serves file upload + download for chat. It
+// composes the existing TemplatesHandler's Docker plumbing
+// (findContainer, execInContainer, copyFilesToContainer) rather than
+// duplicating them, so a bug fix in the Docker layer propagates to
+// both endpoints.
+type ChatFilesHandler struct {
+	templates *TemplatesHandler
+}
+
+func NewChatFilesHandler(t *TemplatesHandler) *ChatFilesHandler {
+	return &ChatFilesHandler{templates: t}
+}
+
+// chatUploadMaxBytes caps the full multipart request body so a
+// malicious / runaway client can't OOM the server. 50 MB covers most
+// documents + a handful of images per message; larger artefacts
+// should go through git/S3 rather than chat.
+const chatUploadMaxBytes = 50 * 1024 * 1024
+
+// chatUploadMaxFileBytes caps individual files in a multi-file upload.
+// Keeping the per-file cap below the total lets a user send, say, a
+// 5 MB PDF + 10 screenshots without tripping the batch limit on any
+// single attachment.
+const chatUploadMaxFileBytes = 25 * 1024 * 1024
+
+// chatUploadDir is the in-container path where user-uploaded chat
+// attachments land. Under /workspace so the file persists with the
+// workspace volume and is readable by the agent without any extra
+// plumbing — the agent just reads from the URI path we return.
+const chatUploadDir = "/workspace/.molecule/chat-uploads"
+
+// unsafeFilenameChars matches anything outside the conservative
+// {alnum, dot, underscore, dash} set. Filenames get rewritten
+// character-class at a time, so embedded paths, control chars,
+// newlines, quotes, and shell metachars never reach the filesystem.
+var unsafeFilenameChars = regexp.MustCompile(`[^a-zA-Z0-9._\-]`)
+
+// contentDispositionAttachment produces a safe `attachment; filename=...`
+// header. Quotes, CR, and LF in the filename are escaped per RFC 6266 /
+// RFC 5987: control chars dropped, backslash and double-quote
+// backslash-escaped inside the quoted-string. Also emits the
+// percent-encoded filename* parameter so non-ASCII names survive.
+// This matters because agents can write arbitrary filenames into
+// /workspace, and anything they produce reaches this header via
+// `filepath.Base(path)` — not all agents sanitize on their side.
+func contentDispositionAttachment(name string) string {
+	safeQ := make([]rune, 0, len(name))
+	for _, r := range name {
+		switch {
+		case r == '\r' || r == '\n':
+			// Drop — any CR/LF would terminate the header early.
+			continue
+		case r == '"' || r == '\\':
+			// Escape per RFC 6266 §4.1 quoted-string.
+			safeQ = append(safeQ, '\\', r)
+		case r < 0x20 || r == 0x7f:
+			// Drop other control chars.
+			continue
+		default:
+			safeQ = append(safeQ, r)
+		}
+	}
+	asciiSafe := string(safeQ)
+	// filename=  — double-quoted, escaped. Gives legacy clients a value.
+	// filename*= — RFC 5987 percent-encoded UTF-8, preferred when present.
+	return fmt.Sprintf(`attachment; filename="%s"; filename*=UTF-8''%s`,
+		asciiSafe, urlPathEscape(name))
+}
+
+// urlPathEscape percent-encodes every byte outside the RFC 3986
+// unreserved set — stricter than net/url.PathEscape (which leaves
+// "/" unescaped because it's legal in URL paths). Filenames must
+// never contain "/" anyway, so escaping it is defence-in-depth
+// against an agent that writes a path-like name.
+func urlPathEscape(s string) string {
+	const unreserved = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-._~"
+	var b strings.Builder
+	for _, c := range []byte(s) {
+		if strings.IndexByte(unreserved, c) >= 0 {
+			b.WriteByte(c)
+		} else {
+			fmt.Fprintf(&b, "%%%02X", c)
+		}
+	}
+	return b.String()
+}
+
+func sanitizeFilename(in string) string {
+	base := filepath.Base(in)
+	base = strings.ReplaceAll(base, " ", "_")
+	base = unsafeFilenameChars.ReplaceAllString(base, "_")
+	if len(base) > 100 {
+		ext := filepath.Ext(base)
+		if len(ext) > 16 {
+			ext = ""
+		}
+		base = base[:100-len(ext)] + ext
+	}
+	if base == "" || base == "." || base == ".." {
+		return "file"
+	}
+	return base
+}
+
+// ChatUploadedFile is the per-file response returned from POST
+// /workspaces/:id/chat/uploads. Clients include this payload (or a
+// trimmed subset) in their outgoing A2A `message/send` parts.
+type ChatUploadedFile struct {
+	// URI uses a custom "workspace:" scheme so clients can resolve it
+	// against the streaming Download endpoint regardless of where the
+	// canvas itself is hosted. The path component is always absolute
+	// within the workspace container.
+	URI      string `json:"uri"`
+	Name     string `json:"name"`
+	MimeType string `json:"mimeType,omitempty"`
+	Size     int64  `json:"size"`
+}
+
+// Upload handles POST /workspaces/:id/chat/uploads.
+// Accepts multipart/form-data with one or more `files` fields, stages
+// each under /workspace/.molecule/chat-uploads with a UUID prefix,
+// and returns the list of URIs for the caller to attach to an A2A
+// message.
+func (h *ChatFilesHandler) Upload(c *gin.Context) {
+	workspaceID := c.Param("id")
+	if err := validateWorkspaceID(workspaceID); err != nil {
+		c.JSON(http.StatusBadRequest, gin.H{"error": "invalid workspace ID"})
+		return
+	}
+
+	// Hard cap the request body BEFORE ParseMultipartForm — otherwise
+	// a client could chunk-upload past the cap before Go notices.
+	c.Request.Body = http.MaxBytesReader(c.Writer, c.Request.Body, chatUploadMaxBytes)
+	if err := c.Request.ParseMultipartForm(chatUploadMaxBytes); err != nil {
+		c.JSON(http.StatusBadRequest, gin.H{"error": "failed to parse multipart form"})
+		return
+	}
+
+	form := c.Request.MultipartForm
+	var headers []*multipart.FileHeader
+	if form != nil && form.File != nil {
+		headers = form.File["files"]
+	}
+	if len(headers) == 0 {
+		c.JSON(http.StatusBadRequest, gin.H{"error": "expected at least one 'files' field"})
+		return
+	}
+
+	ctx := c.Request.Context()
+	containerName := h.templates.findContainer(ctx, workspaceID)
+	if containerName == "" {
+		c.JSON(http.StatusServiceUnavailable, gin.H{"error": "workspace container not running"})
+		return
+	}
+
+	// Build the archive in memory. Files are byte-preserving through
+	// Go's string<->[]byte (the tar helper takes map[string]string but
+	// the conversion is a literal copy, not a UTF-8 reinterpretation).
+	archive := map[string]string{}
+	uploaded := make([]ChatUploadedFile, 0, len(headers))
+	for _, fh := range headers {
+		if fh.Size > chatUploadMaxFileBytes {
+			c.JSON(http.StatusRequestEntityTooLarge, gin.H{
+				"error": fmt.Sprintf("%s exceeds per-file limit (%d MB)", fh.Filename, chatUploadMaxFileBytes/(1024*1024)),
+			})
+			return
+		}
+		f, err := fh.Open()
+		if err != nil {
+			c.JSON(http.StatusBadRequest, gin.H{"error": "failed to read upload"})
+			return
+		}
+		// LimitReader guards against a truthful-but-lying Size header:
+		// if the multipart stream carries more bytes than declared, we
+		// stop at the cap instead of growing the buffer.
+		data, err := io.ReadAll(io.LimitReader(f, chatUploadMaxFileBytes+1))
+		f.Close()
+		if err != nil {
+			c.JSON(http.StatusBadRequest, gin.H{"error": "failed to read upload"})
+			return
+		}
+		if int64(len(data)) > chatUploadMaxFileBytes {
+			c.JSON(http.StatusRequestEntityTooLarge, gin.H{
+				"error": fmt.Sprintf("%s exceeds per-file limit (%d MB)", fh.Filename, chatUploadMaxFileBytes/(1024*1024)),
+			})
+			return
+		}
+
+		name := sanitizeFilename(fh.Filename)
+		// 16-byte (UUID-equivalent) random prefix. Within a single
+		// batch we also check for collisions — birthday on 128 bits
+		// is astronomical, but a bad PRNG or single re-used draw
+		// would silently overwrite a sibling upload with its own
+		// content and return two URIs pointing at one file.
+		var stored string
+		for attempt := 0; attempt < 4; attempt++ {
+			idBytes := make([]byte, 16)
+			if _, err := rand.Read(idBytes); err != nil {
+				c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to allocate upload ID"})
+				return
+			}
+			candidate := hex.EncodeToString(idBytes) + "-" + name
+			if _, taken := archive[candidate]; !taken {
+				stored = candidate
+				break
+			}
+		}
+		if stored == "" {
+			c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to allocate unique upload ID"})
+			return
+		}
+		archive[stored] = string(data)
+
+		mt := fh.Header.Get("Content-Type")
+		if mt == "" {
+			mt = mime.TypeByExtension(filepath.Ext(name))
+		}
+		uploaded = append(uploaded, ChatUploadedFile{
+			URI:      "workspace:" + chatUploadDir + "/" + stored,
+			Name:     name,
+			MimeType: mt,
+			Size:     int64(len(data)),
+		})
+	}
+
+	// mkdir -p is idempotent; we fire it every upload instead of
+	// caching state here so container restarts don't surprise us.
+	_, _ = h.templates.execInContainer(ctx, containerName, []string{"mkdir", "-p", chatUploadDir})
+
+	// Defence in depth: pre-remove each target path before extracting
+	// the tar. An agent with write access to /workspace could in
+	// theory race-create a symlink at <chatUploadDir>/<stored-name>
+	// pointing at a sensitive in-container path (its own /etc/*,
+	// mounted secrets). Docker's tar extraction on some drivers
+	// follows pre-existing symlinks at the destination. `rm -f` the
+	// exact stored-name closes that window — the UUID prefix on the
+	// name makes a successful race effectively impossible, but this
+	// guard costs nothing and documents the intent.
+	rmArgs := []string{"rm", "-f", "--"}
+	for stored := range archive {
+		rmArgs = append(rmArgs, chatUploadDir+"/"+stored)
+	}
+	_, _ = h.templates.execInContainer(ctx, containerName, rmArgs)
+
+	if err := h.copyFlatToContainer(ctx, containerName, chatUploadDir, archive); err != nil {
+		log.Printf("Chat upload copy failed for %s: %v", workspaceID, err)
+		c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to stage files in workspace"})
+		return
+	}
+
+	c.JSON(http.StatusOK, gin.H{"files": uploaded})
+}
+
+// copyFlatToContainer extracts one tar of flat files into destPath
+// inside the container. Unlike the shared copyFilesToContainer helper
+// (which prepends destPath into tar entry names — correct for its
+// callers whose files relative-live inside a nested tree), this
+// helper writes tar entries with ONLY the flat filename so Docker's
+// extraction at destPath lands them directly in destPath, not at
+// destPath/destPath/... as the shared helper would.
+// Filenames are validated to contain no path separator so nothing
+// can escape destPath via an embedded "../" or a leading "/".
+func (h *ChatFilesHandler) copyFlatToContainer(ctx context.Context, containerName, destPath string, files map[string]string) error {
+	if h.templates.docker == nil {
+		return fmt.Errorf("docker not available")
+	}
+	var buf bytes.Buffer
+	tw := tar.NewWriter(&buf)
+	for name, content := range files {
+		if strings.ContainsAny(name, "/\\") || name == ".." || name == "." || name == "" {
+			return fmt.Errorf("unsafe flat filename: %q", name)
+		}
+		data := []byte(content)
+		if err := tw.WriteHeader(&tar.Header{
+			Name:     name, // relative — Docker resolves against destPath
+			Mode:     0644,
+			Size:     int64(len(data)),
+			Typeflag: tar.TypeReg,
+		}); err != nil {
+			return fmt.Errorf("tar header %q: %w", name, err)
+		}
+		if _, err := tw.Write(data); err != nil {
+			return fmt.Errorf("tar write %q: %w", name, err)
+		}
+	}
+	if err := tw.Close(); err != nil {
+		return fmt.Errorf("tar close: %w", err)
+	}
+	return h.templates.docker.CopyToContainer(ctx, containerName, destPath, &buf, container.CopyToContainerOptions{})
+}
+
+// Download handles GET /workspaces/:id/chat/download?path=<abs path>.
+// Streams the file bytes from the container with a correct
+// Content-Type and attachment Content-Disposition. Binary-safe —
+// unlike the existing JSON ReadFile endpoint which carries content
+// as a string (lossy for non-UTF-8 bytes).
+func (h *ChatFilesHandler) Download(c *gin.Context) {
+	workspaceID := c.Param("id")
+	if err := validateWorkspaceID(workspaceID); err != nil {
+		c.JSON(http.StatusBadRequest, gin.H{"error": "invalid workspace ID"})
+		return
+	}
+
+	path := c.Query("path")
+	if path == "" {
+		c.JSON(http.StatusBadRequest, gin.H{"error": "path query required"})
+		return
+	}
+	if !filepath.IsAbs(path) {
+		c.JSON(http.StatusBadRequest, gin.H{"error": "path must be absolute"})
+		return
+	}
+	// Path must land under one of the allowed roots — mirrors the
+	// ReadFile security model and prevents arbitrary reads of /etc
+	// or other system paths via this endpoint.
+	rooted := false
+	for root := range allowedRoots {
+		if path == root || strings.HasPrefix(path, root+"/") {
+			rooted = true
+			break
+		}
+	}
+	if !rooted {
+		c.JSON(http.StatusBadRequest, gin.H{"error": "path must be under /configs, /workspace, /home, or /plugins"})
+		return
+	}
+	// Reject anything that canonicalises differently or contains a
+	// traversal segment. Defence-in-depth on top of the prefix check.
+	if filepath.Clean(path) != path || strings.Contains(path, "..") {
+		c.JSON(http.StatusBadRequest, gin.H{"error": "invalid path"})
+		return
+	}
+
+	ctx := c.Request.Context()
+	if h.templates.docker == nil {
+		c.JSON(http.StatusServiceUnavailable, gin.H{"error": "docker unavailable"})
+		return
+	}
+	containerName := h.templates.findContainer(ctx, workspaceID)
+	if containerName == "" {
+		c.JSON(http.StatusServiceUnavailable, gin.H{"error": "workspace container not running"})
+		return
+	}
+
+	// docker cp returns a tar stream containing the requested path.
+	// For a regular file that's a single tar entry; we extract and
+	// stream the body through.
+	reader, _, err := h.templates.docker.CopyFromContainer(ctx, containerName, path)
+	if err != nil {
+		c.JSON(http.StatusNotFound, gin.H{"error": "file not found"})
+		return
+	}
+	defer reader.Close()
+
+	tr := tar.NewReader(reader)
+	hdr, err := tr.Next()
+	if err != nil {
+		c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to read archive"})
+		return
+	}
+	if hdr.Typeflag != tar.TypeReg {
+		c.JSON(http.StatusBadRequest, gin.H{"error": "path is not a regular file"})
+		return
+	}
+
+	name := filepath.Base(path)
+	mt := mime.TypeByExtension(filepath.Ext(name))
+	if mt == "" {
+		mt = "application/octet-stream"
+	}
+	c.Header("Content-Type", mt)
+	c.Header("Content-Length", fmt.Sprintf("%d", hdr.Size))
+	c.Header("Content-Disposition", contentDispositionAttachment(name))
+	c.Status(http.StatusOK)
+
+	// Stream exactly hdr.Size bytes. CopyN was chosen over LimitReader
+	// because it returns an error when the source is short — that
+	// surfaces a bug in the tar extraction path immediately instead
+	// of silently truncating. Agents can legitimately produce files
+	// larger than the 50 MB upload cap (that's a per-request inbound
+	// cap, not a per-artifact one), so we cannot clamp here.
+	if _, err := io.CopyN(c.Writer, tr, hdr.Size); err != nil {
+		log.Printf("Chat download stream error for %s (%s): %v", workspaceID, path, err)
+	}
+}
diff --git a/workspace-server/internal/handlers/chat_files_test.go b/workspace-server/internal/handlers/chat_files_test.go
new file mode 100644
index 00000000..870308f9
--- /dev/null
+++ b/workspace-server/internal/handlers/chat_files_test.go
@@ -0,0 +1,194 @@
+package handlers
+
+// Unit tests for chat_files.go. The Docker-touching paths (Upload
+// actually copying into a container, Download actually streaming tar)
+// are exercised via integration tests — docker-in-docker is out of
+// scope for the unit suite. These tests cover the validation + error
+// surfaces that a caller can reach without a running container.
+
+import (
+	"bytes"
+	"mime/multipart"
+	"net/http"
+	"net/http/httptest"
+	"strings"
+	"testing"
+
+	"github.com/gin-gonic/gin"
+)
+
+func TestSanitizeFilename(t *testing.T) {
+	cases := []struct {
+		in, want string
+	}{
+		{"report.pdf", "report.pdf"},
+		{"my file.pdf", "my_file.pdf"},
+		{"../../etc/passwd", "passwd"},
+		{"weird;$name`.txt", "weird__name_.txt"},
+		{"", "file"},
+		{".", "file"},
+		{"..", "file"},
+	}
+	for _, tc := range cases {
+		got := sanitizeFilename(tc.in)
+		if got != tc.want {
+			t.Errorf("sanitizeFilename(%q) = %q, want %q", tc.in, got, tc.want)
+		}
+	}
+}
+
+func TestSanitizeFilename_LongNamePreservesExtension(t *testing.T) {
+	// 120-char base + .pdf — the helper should truncate the base but
+	// keep the extension intact so content-type inference still works.
+	longBase := strings.Repeat("a", 120)
+	got := sanitizeFilename(longBase + ".pdf")
+	if len(got) > 100 {
+		t.Errorf("filename not truncated: len=%d", len(got))
+	}
+	if !strings.HasSuffix(got, ".pdf") {
+		t.Errorf("extension stripped: %q", got)
+	}
+}
+
+func TestChatUpload_InvalidWorkspaceID(t *testing.T) {
+	setupTestDB(t)
+	setupTestRedis(t)
+
+	tmplh := NewTemplatesHandler(t.TempDir(), nil)
+	h := NewChatFilesHandler(tmplh)
+
+	w := httptest.NewRecorder()
+	c, _ := gin.CreateTestContext(w)
+	c.Params = gin.Params{{Key: "id", Value: "not-a-uuid"}}
+	c.Request = httptest.NewRequest("POST", "/workspaces/not-a-uuid/chat/uploads", nil)
+
+	h.Upload(c)
+
+	if w.Code != http.StatusBadRequest {
+		t.Errorf("expected 400 on invalid workspace id, got %d: %s", w.Code, w.Body.String())
+	}
+}
+
+func TestChatUpload_MissingFiles(t *testing.T) {
+	setupTestDB(t)
+	setupTestRedis(t)
+
+	tmplh := NewTemplatesHandler(t.TempDir(), nil)
+	h := NewChatFilesHandler(tmplh)
+
+	// Multipart body with no `files` field — only a text field.
+	var buf bytes.Buffer
+	mw := multipart.NewWriter(&buf)
+	_ = mw.WriteField("other", "value")
+	mw.Close()
+
+	w := httptest.NewRecorder()
+	c, _ := gin.CreateTestContext(w)
+	c.Params = gin.Params{{Key: "id", Value: "00000000-0000-0000-0000-000000000001"}}
+	req := httptest.NewRequest("POST", "/workspaces/00000000-0000-0000-0000-000000000001/chat/uploads", &buf)
+	req.Header.Set("Content-Type", mw.FormDataContentType())
+	c.Request = req
+
+	h.Upload(c)
+
+	if w.Code != http.StatusBadRequest {
+		t.Errorf("expected 400 when files field missing, got %d: %s", w.Code, w.Body.String())
+	}
+	if !strings.Contains(w.Body.String(), "files") {
+		t.Errorf("expected error to mention files field: %s", w.Body.String())
+	}
+}
+
+func TestChatDownload_InvalidPath(t *testing.T) {
+	setupTestDB(t)
+	setupTestRedis(t)
+
+	tmplh := NewTemplatesHandler(t.TempDir(), nil)
+	h := NewChatFilesHandler(tmplh)
+
+	cases := []struct {
+		name, path, wantSubstr string
+	}{
+		{"empty", "", "path query required"},
+		{"relative", "workspace/foo.txt", "must be absolute"},
+		{"wrong root", "/etc/passwd", "must be under"},
+		{"traversal", "/workspace/../etc/passwd", "invalid path"},
+	}
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			w := httptest.NewRecorder()
+			c, _ := gin.CreateTestContext(w)
+			c.Params = gin.Params{{Key: "id", Value: "00000000-0000-0000-0000-000000000001"}}
+			req := httptest.NewRequest("GET", "/workspaces/xxx/chat/download?path="+tc.path, nil)
+			c.Request = req
+
+			h.Download(c)
+
+			if w.Code != http.StatusBadRequest {
+				t.Errorf("expected 400 for %s, got %d: %s", tc.name, w.Code, w.Body.String())
+			}
+			if !strings.Contains(w.Body.String(), tc.wantSubstr) {
+				t.Errorf("expected error to contain %q, got: %s", tc.wantSubstr, w.Body.String())
+			}
+		})
+	}
+}
+
+func TestContentDispositionAttachment_Escapes(t *testing.T) {
+	cases := []struct {
+		name, input, wantSubstr string
+	}{
+		{
+			name:       "plain ASCII passes through",
+			input:      "report.pdf",
+			wantSubstr: `filename="report.pdf"`,
+		},
+		{
+			name:       "double-quote is backslash-escaped",
+			input:      `weird".pdf`,
+			wantSubstr: `filename="weird\".pdf"`,
+		},
+		{
+			name:       "CR and LF dropped to prevent header injection",
+			input:      "bad\r\nX-Leak: 1\r\n.txt",
+			wantSubstr: `filename="badX-Leak: 1.txt"`,
+		},
+		{
+			name:       "non-ASCII emits filename* percent-encoded",
+			input:      "résumé.pdf",
+			wantSubstr: "filename*=UTF-8''r%C3%A9sum%C3%A9.pdf",
+		},
+	}
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			got := contentDispositionAttachment(tc.input)
+			if !strings.Contains(got, tc.wantSubstr) {
+				t.Errorf("contentDispositionAttachment(%q) = %q, missing substring %q", tc.input, got, tc.wantSubstr)
+			}
+			// Must never contain a bare CR or LF — either would end the header.
+			if strings.ContainsAny(got, "\r\n") {
+				t.Errorf("header contains CR/LF: %q", got)
+			}
+		})
+	}
+}
+
+func TestChatDownload_DockerUnavailable(t *testing.T) {
+	setupTestDB(t)
+	setupTestRedis(t)
+
+	tmplh := NewTemplatesHandler(t.TempDir(), nil) // docker=nil
+	h := NewChatFilesHandler(tmplh)
+
+	w := httptest.NewRecorder()
+	c, _ := gin.CreateTestContext(w)
+	c.Params = gin.Params{{Key: "id", Value: "00000000-0000-0000-0000-000000000001"}}
+	req := httptest.NewRequest("GET", "/workspaces/xxx/chat/download?path=/workspace/report.pdf", nil)
+	c.Request = req
+
+	h.Download(c)
+
+	if w.Code != http.StatusServiceUnavailable {
+		t.Errorf("expected 503 when docker is nil, got %d: %s", w.Code, w.Body.String())
+	}
+}
diff --git a/workspace-server/internal/handlers/delegation.go b/workspace-server/internal/handlers/delegation.go
index 8c0d681f..59da198c 100644
--- a/workspace-server/internal/handlers/delegation.go
+++ b/workspace-server/internal/handlers/delegation.go
@@ -78,13 +78,21 @@ func (h *DelegationHandler) Delegate(c *gin.Context) {
 	// reason (logged); we still dispatch the A2A request and surface the
 	// warning in the response.
 
-	// Build A2A payload
+	// Build A2A payload. Embedding delegation_id in metadata gives the
+	// queue drain path a way to look up the originating delegation row
+	// when stitching the response back (issue: previously the drain
+	// dispatched successfully but discarded the response, so
+	// check_task_status returned status='queued' forever even after a
+	// real reply landed). messageId mirrors delegation_id so the
+	// platform's idempotency-key extraction also keys off the same id.
 	a2aBody, _ := json.Marshal(map[string]interface{}{
 		"method": "message/send",
 		"params": map[string]interface{}{
 			"message": map[string]interface{}{
-				"role":  "user",
-				"parts": []map[string]interface{}{{"type": "text", "text": body.Task}},
+				"role":      "user",
+				"messageId": delegationID,
+				"parts":     []map[string]interface{}{{"type": "text", "text": body.Task}},
+				"metadata":  map[string]interface{}{"delegation_id": delegationID},
 			},
 		},
 	})
@@ -284,6 +292,40 @@ func (h *DelegationHandler) executeDelegation(sourceID, targetID, delegationID s
 		return
 	}
 
+	// 202 + {queued: true} means the target was busy and the proxy
+	// enqueued the request for the next drain tick — NOT a completion.
+	// Treat it as such: write a clean 'queued' activity row with no
+	// JSON-as-text leakage into the summary, broadcast a status update,
+	// and return. The eventual drain doesn't (yet) feed a result back
+	// into this delegation, so callers polling check_task_status will
+	// see status='queued' and know to retry instead of believing the
+	// queued JSON is the agent's reply. Fixes the chat-leak where the
+	// LLM echoed "Delegation completed (workspace agent busy ...)" to
+	// the user.
+	if status == http.StatusAccepted && isQueuedProxyResponse(respBody) {
+		log.Printf("Delegation %s: target %s busy — queued for drain", delegationID, targetID)
+		h.updateDelegationStatus(sourceID, delegationID, "queued", "")
+		// Store delegation_id in response_body so DrainQueueForWorkspace's
+		// stitch step can find this row by JSON-path key after the queued
+		// dispatch eventually succeeds. Without the key, the drain finds
+		// the row by (workspace_id, target_id, method) but can't tell
+		// multiple-queued-delegations-to-same-target apart.
+		queuedJSON, _ := json.Marshal(map[string]interface{}{
+			"delegation_id": delegationID,
+			"queued":        true,
+		})
+		if _, err := db.DB.ExecContext(ctx, `
+			INSERT INTO activity_logs (workspace_id, activity_type, method, source_id, target_id, summary, response_body, status)
+			VALUES ($1, 'delegation', 'delegate_result', $2, $3, $4, $5::jsonb, 'queued')
+		`, sourceID, sourceID, targetID, "Delegation queued — target at capacity", string(queuedJSON)); err != nil {
+			log.Printf("Delegation %s: failed to insert queued log: %v", delegationID, err)
+		}
+		h.broadcaster.RecordAndBroadcast(ctx, "DELEGATION_STATUS", sourceID, map[string]interface{}{
+			"delegation_id": delegationID, "target_id": targetID, "status": "queued",
+		})
+		return
+	}
+
 	// A2A returned 200 — target received and processed the task
 	// Status: dispatched → received → completed (we don't have a separate "received" signal from the target yet)
 	responseText := extractResponseText(respBody)
@@ -517,6 +559,21 @@ func isTransientProxyError(err *proxyA2AError) bool {
 	return false
 }
 
+// isQueuedProxyResponse reports whether the proxy returned a body shaped like
+// `{"queued": true, "queue_id": ..., "queue_depth": ..., "message": ...}` —
+// the busy-target enqueue path in a2a_proxy_helpers.go. Caller checks this
+// alongside HTTP 202 to distinguish a successful agent reply from a deferred
+// dispatch; without the distinction we'd write the queued-message JSON into
+// the delegation result row and the LLM would surface it as agent output.
+func isQueuedProxyResponse(body []byte) bool {
+	var resp map[string]interface{}
+	if json.Unmarshal(body, &resp) != nil {
+		return false
+	}
+	queued, _ := resp["queued"].(bool)
+	return queued
+}
+
 func extractResponseText(body []byte) string {
 	var resp map[string]interface{}
 	if json.Unmarshal(body, &resp) != nil {
diff --git a/workspace-server/internal/handlers/delegation_test.go b/workspace-server/internal/handlers/delegation_test.go
index caa5118d..21cc3a90 100644
--- a/workspace-server/internal/handlers/delegation_test.go
+++ b/workspace-server/internal/handlers/delegation_test.go
@@ -376,6 +376,39 @@ func TestIsTransientProxyError_RetriesOnRestartRaceStatuses(t *testing.T) {
 	}
 }
 
+func TestIsQueuedProxyResponse(t *testing.T) {
+	// Regression guard for the chat-leak bug: when the proxy returns
+	// 202 with a queued-shape body, executeDelegation must classify it
+	// as "queued" — not "completed". Mis-classifying it causes the
+	// queued JSON to land in activity_logs.summary, which the LLM then
+	// echoes verbatim into the agent chat as
+	// "Delegation completed: Delegation completed (workspace agent
+	// busy — request queued, will dispatch...)".
+	cases := []struct {
+		name string
+		body string
+		want bool
+	}{
+		{
+			name: "real proxy busy-enqueue body",
+			body: `{"queued":true,"queue_id":"d0993390-5f5a-4f5d-90a2-66639e53e3c9","queue_depth":1,"message":"workspace agent busy — request queued, will dispatch when capacity available"}`,
+			want: true,
+		},
+		{"queued false explicitly", `{"queued":false}`, false},
+		{"queued field absent (real A2A reply)", `{"jsonrpc":"2.0","id":"1","result":{"kind":"message","parts":[{"kind":"text","text":"hi"}]}}`, false},
+		{"non-bool queued value (defensive)", `{"queued":"true"}`, false},
+		{"malformed JSON", `not-json`, false},
+		{"empty body", ``, false},
+	}
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			if got := isQueuedProxyResponse([]byte(tc.body)); got != tc.want {
+				t.Errorf("isQueuedProxyResponse(%q) = %v, want %v", tc.body, got, tc.want)
+			}
+		})
+	}
+}
+
 func TestDelegationRetryDelay_IsSaneWindow(t *testing.T) {
 	// Regression guard: the retry delay must be long enough for the
 	// reactive URL refresh in proxyA2ARequest to kick in (which involves
diff --git a/workspace-server/internal/handlers/handlers_additional_test.go b/workspace-server/internal/handlers/handlers_additional_test.go
index cb638a6f..ca3df0de 100644
--- a/workspace-server/internal/handlers/handlers_additional_test.go
+++ b/workspace-server/internal/handlers/handlers_additional_test.go
@@ -11,6 +11,7 @@ import (
 	"time"
 
 	"github.com/DATA-DOG/go-sqlmock"
+	"github.com/Molecule-AI/molecule-monorepo/platform/internal/models"
 	"github.com/gin-gonic/gin"
 )
 
@@ -31,7 +32,7 @@ func TestWorkspaceCreate_WithParentID(t *testing.T) {
 	mock.ExpectBegin()
 	// Default tier is 3 (Privileged) — see workspace.go create-handler comment.
 	mock.ExpectExec("INSERT INTO workspaces").
-		WithArgs(sqlmock.AnyArg(), "Child Agent", nil, 3, "langgraph", sqlmock.AnyArg(), &parentID, nil, "none", (*int64)(nil)).
+		WithArgs(sqlmock.AnyArg(), "Child Agent", nil, 3, "langgraph", sqlmock.AnyArg(), &parentID, nil, "none", (*int64)(nil), models.DefaultMaxConcurrentTasks).
 		WillReturnResult(sqlmock.NewResult(0, 1))
 	mock.ExpectCommit()
 	mock.ExpectExec("INSERT INTO canvas_layouts").
@@ -66,7 +67,7 @@ func TestWorkspaceCreate_ExplicitClaudeCodeRuntime(t *testing.T) {
 
 	mock.ExpectBegin()
 	mock.ExpectExec("INSERT INTO workspaces").
-		WithArgs(sqlmock.AnyArg(), "CC Agent", nil, 2, "claude-code", sqlmock.AnyArg(), (*string)(nil), nil, "none", (*int64)(nil)).
+		WithArgs(sqlmock.AnyArg(), "CC Agent", nil, 2, "claude-code", sqlmock.AnyArg(), (*string)(nil), nil, "none", (*int64)(nil), models.DefaultMaxConcurrentTasks).
 		WillReturnResult(sqlmock.NewResult(0, 1))
 	mock.ExpectCommit()
 	mock.ExpectExec("INSERT INTO canvas_layouts").
@@ -277,6 +278,40 @@ func TestWorkspaceList_WithData(t *testing.T) {
 	}
 }
 
+// ---------- workspace.go: Create with explicit max_concurrent_tasks ----------
+
+func TestWorkspaceCreate_MaxConcurrentTasksOverride(t *testing.T) {
+	mock := setupTestDB(t)
+	setupTestRedis(t)
+	broadcaster := newTestBroadcaster()
+	handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir())
+
+	mock.ExpectBegin()
+	mock.ExpectExec("INSERT INTO workspaces").
+		WithArgs(sqlmock.AnyArg(), "Leader Agent", nil, 3, "claude-code", sqlmock.AnyArg(), (*string)(nil), nil, "none", (*int64)(nil), 3).
+		WillReturnResult(sqlmock.NewResult(0, 1))
+	mock.ExpectCommit()
+	mock.ExpectExec("INSERT INTO canvas_layouts").
+		WillReturnResult(sqlmock.NewResult(0, 1))
+	mock.ExpectExec("INSERT INTO structure_events").
+		WillReturnResult(sqlmock.NewResult(0, 1))
+
+	w := httptest.NewRecorder()
+	c, _ := gin.CreateTestContext(w)
+	body := `{"name":"Leader Agent","runtime":"claude-code","max_concurrent_tasks":3}`
+	c.Request = httptest.NewRequest("POST", "/workspaces", bytes.NewBufferString(body))
+	c.Request.Header.Set("Content-Type", "application/json")
+
+	handler.Create(c)
+
+	if w.Code != http.StatusCreated {
+		t.Errorf("expected 201, got %d: %s", w.Code, w.Body.String())
+	}
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Errorf("unmet expectations: %v", err)
+	}
+}
+
 // ---------- registry.go: Register with provisioner URL preserved ----------
 
 func TestRegister_ProvisionerURLPreserved(t *testing.T) {
diff --git a/workspace-server/internal/handlers/handlers_test.go b/workspace-server/internal/handlers/handlers_test.go
index c490b2be..be1fd1c8 100644
--- a/workspace-server/internal/handlers/handlers_test.go
+++ b/workspace-server/internal/handlers/handlers_test.go
@@ -291,7 +291,7 @@ func TestWorkspaceCreate(t *testing.T) {
 	// Expect workspace INSERT (uuid is dynamic, use AnyArg for id, runtime, awareness_namespace).
 	// Default tier is 3 (Privileged) — see workspace.go create-handler comment.
 	mock.ExpectExec("INSERT INTO workspaces").
-		WithArgs(sqlmock.AnyArg(), "Test Agent", nil, 3, "langgraph", sqlmock.AnyArg(), (*string)(nil), nil, "none", (*int64)(nil)).
+		WithArgs(sqlmock.AnyArg(), "Test Agent", nil, 3, "langgraph", sqlmock.AnyArg(), (*string)(nil), nil, "none", (*int64)(nil), models.DefaultMaxConcurrentTasks).
 		WillReturnResult(sqlmock.NewResult(0, 1))
 
 	// Expect transaction commit (no secrets in this payload)
diff --git a/workspace-server/internal/handlers/org.go b/workspace-server/internal/handlers/org.go
index 9a7f6692..83c2c6f5 100644
--- a/workspace-server/internal/handlers/org.go
+++ b/workspace-server/internal/handlers/org.go
@@ -5,6 +5,7 @@ package handlers
 
 import (
 	"context"
+	"encoding/json"
 	"fmt"
 	"log"
 	"net/http"
@@ -180,6 +181,108 @@ func NewOrgHandler(wh *WorkspaceHandler, b *events.Broadcaster, p *provisioner.P
 	}
 }
 
+// EnvRequirement is either a single env var name (strict: that exact
+// var must be configured) or an any-of group (any one of the listed
+// names satisfies the requirement).
+//
+// YAML shapes accepted:
+//
+//	required_env:
+//	  - GITHUB_TOKEN                              # single
+//	  - any_of: [ANTHROPIC_API_KEY, CLAUDE_CODE_OAUTH_TOKEN]   # OR group
+//
+// The any-of form exists because some runtimes accept either of two
+// credential shapes — Claude Code takes ANTHROPIC_API_KEY or an OAuth
+// token interchangeably, and forcing an org template to pick one
+// would falsely block the other. For JSON (GET /org/templates),
+// the same shapes round-trip: strings stay strings, groups stay
+// {any_of: [...]}.
+type EnvRequirement struct {
+	// Name is non-empty for a single required env var.
+	Name string
+	// AnyOf is non-empty for an OR group; any one member satisfies.
+	AnyOf []string
+}
+
+// Members returns every env name this requirement considers —
+// [Name] for single, AnyOf for groups. Used by preflight, collect,
+// and the name-validation regex gate.
+func (e EnvRequirement) Members() []string {
+	if e.Name != "" {
+		return []string{e.Name}
+	}
+	return e.AnyOf
+}
+
+// IsSatisfied reports whether any member of the requirement is
+// present in `configured`. Single: exact-match. AnyOf: at least
+// one hit.
+func (e EnvRequirement) IsSatisfied(configured map[string]struct{}) bool {
+	for _, m := range e.Members() {
+		if _, ok := configured[m]; ok {
+			return true
+		}
+	}
+	return false
+}
+
+// UnmarshalYAML accepts either a scalar (string → single) or a map
+// with an `any_of` list (→ group).
+func (e *EnvRequirement) UnmarshalYAML(value *yaml.Node) error {
+	if value.Kind == yaml.ScalarNode {
+		var s string
+		if err := value.Decode(&s); err != nil {
+			return err
+		}
+		e.Name = s
+		return nil
+	}
+	var alt struct {
+		AnyOf []string `yaml:"any_of"`
+	}
+	if err := value.Decode(&alt); err != nil {
+		return fmt.Errorf("env requirement must be a string or {any_of: [...]}: %w", err)
+	}
+	if len(alt.AnyOf) == 0 {
+		return fmt.Errorf("env requirement any_of must contain at least one env var")
+	}
+	e.AnyOf = alt.AnyOf
+	return nil
+}
+
+// MarshalJSON emits the dual shape so GET /org/templates callers get
+// {"required_env": ["GITHUB_TOKEN", {"any_of": [...]}]}, matching
+// the YAML syntax.
+func (e EnvRequirement) MarshalJSON() ([]byte, error) {
+	if e.Name != "" {
+		return json.Marshal(e.Name)
+	}
+	return json.Marshal(struct {
+		AnyOf []string `json:"any_of"`
+	}{AnyOf: e.AnyOf})
+}
+
+// UnmarshalJSON is the inverse — accepts the same dual shape so
+// POST /org/import with an inline `template` body works too.
+func (e *EnvRequirement) UnmarshalJSON(data []byte) error {
+	var s string
+	if err := json.Unmarshal(data, &s); err == nil {
+		e.Name = s
+		return nil
+	}
+	var alt struct {
+		AnyOf []string `json:"any_of"`
+	}
+	if err := json.Unmarshal(data, &alt); err != nil {
+		return fmt.Errorf("env requirement must be a string or {any_of: [...]}: %w", err)
+	}
+	if len(alt.AnyOf) == 0 {
+		return fmt.Errorf("env requirement any_of must contain at least one env var")
+	}
+	e.AnyOf = alt.AnyOf
+	return nil
+}
+
 // OrgTemplate is the YAML structure for an org hierarchy.
 type OrgTemplate struct {
 	Name           string              `yaml:"name" json:"name"`
@@ -189,6 +292,18 @@ type OrgTemplate struct {
 	// GlobalMemories is a list of org-wide memories seeded as GLOBAL scope
 	// on the first root workspace (PM) during org import. Issue #1050.
 	GlobalMemories []models.MemorySeed `yaml:"global_memories" json:"global_memories"`
+	// RequiredEnv lists env vars that MUST be configured globally (or
+	// on every workspace in the subtree that needs them) before import
+	// succeeds. Each entry is either a plain string (strict) or an
+	// {any_of: [...]} group (at least one member must be set). Declared
+	// at the org level for shared creds; also extensible per-workspace
+	// via OrgWorkspace.RequiredEnv for team-scoped credentials.
+	RequiredEnv []EnvRequirement `yaml:"required_env" json:"required_env"`
+	// RecommendedEnv is the "nice-to-have" tier — import still succeeds
+	// without them, but features degrade. Same single|any_of shape as
+	// RequiredEnv so a recommended OR group reads "set any one of these
+	// to unlock the feature; all missing = warning".
+	RecommendedEnv []EnvRequirement `yaml:"recommended_env" json:"recommended_env"`
 }
 
 type OrgDefaults struct {
@@ -287,15 +402,27 @@ type OrgWorkspace struct {
 	// InitialMemories are memories seeded into this workspace at creation
 	// time. If empty, defaults.initial_memories are used. Issue #1050.
 	InitialMemories []models.MemorySeed `yaml:"initial_memories" json:"initial_memories"`
-	Schedules       []OrgSchedule       `yaml:"schedules" json:"schedules"`
-	Channels        []OrgChannel        `yaml:"channels" json:"channels"`
+	// MaxConcurrentTasks: see models.CreateWorkspacePayload.
+	MaxConcurrentTasks int                 `yaml:"max_concurrent_tasks" json:"max_concurrent_tasks"`
+	Schedules          []OrgSchedule       `yaml:"schedules" json:"schedules"`
+	Channels           []OrgChannel        `yaml:"channels" json:"channels"`
 	External        bool                `yaml:"external" json:"external"`
 	URL             string              `yaml:"url" json:"url"`
 	Canvas          struct {
 		X float64 `yaml:"x" json:"x"`
 		Y float64 `yaml:"y" json:"y"`
 	} `yaml:"canvas" json:"canvas"`
-	Children []OrgWorkspace `yaml:"children" json:"children"`
+	// RequiredEnv / RecommendedEnv declared at the workspace level
+	// narrow down what a specific team needs beyond the org-wide union.
+	// When GET /org/templates walks the tree, these flow up into
+	// OrgTemplate.RequiredEnv / RecommendedEnv. A workspace's subtree
+	// inherits: a parent declaring ANTHROPIC_API_KEY as required
+	// means every descendant considers it required too (no override
+	// needed at each leaf). Same single|any_of shape as the org-level
+	// lists.
+	RequiredEnv    []EnvRequirement `yaml:"required_env" json:"required_env"`
+	RecommendedEnv []EnvRequirement `yaml:"recommended_env" json:"recommended_env"`
+	Children       []OrgWorkspace   `yaml:"children" json:"children"`
 }
 
 // ListTemplates handles GET /org/templates — lists available org templates.
@@ -354,11 +481,18 @@ func (h *OrgHandler) ListTemplates(c *gin.Context) {
 			continue
 		}
 		count := countWorkspaces(tmpl.Workspaces)
+		// Walk the tree to collect required + recommended env union.
+		// Canvas uses these to render a preflight modal BEFORE firing
+		// the import — saves the user from a 15-workspace import that
+		// dies one container at a time on missing creds.
+		required, recommended := collectOrgEnv(&tmpl)
 		templates = append(templates, map[string]interface{}{
-			"dir":         e.Name(),
-			"name":        tmpl.Name,
-			"description": tmpl.Description,
-			"workspaces":  count,
+			"dir":             e.Name(),
+			"name":            tmpl.Name,
+			"description":     tmpl.Description,
+			"workspaces":      count,
+			"required_env":    required,
+			"recommended_env": recommended,
 		})
 	}
 
@@ -370,6 +504,13 @@ func (h *OrgHandler) Import(c *gin.Context) {
 	var body struct {
 		Dir      string      `json:"dir"`      // org template directory name
 		Template OrgTemplate `json:"template"` // or inline template
+		// Force skips the required-env preflight. Used by tooling
+		// that already computed the preflight client-side and wants
+		// to proceed despite missing creds (usually because the
+		// user explicitly acknowledged the tradeoff). Default behavior
+		// refuses the import with a 412 and the missing-key list so
+		// the canvas can surface them in its preflight modal.
+		Force bool `json:"force"`
 	}
 	if err := c.ShouldBindJSON(&body); err != nil {
 		c.JSON(http.StatusBadRequest, gin.H{"error": "invalid request body"})
@@ -415,6 +556,59 @@ func (h *OrgHandler) Import(c *gin.Context) {
 		return
 	}
 
+	// Required-env preflight — refuses import when any required_env is
+	// missing from global_secrets (unless `force: true` overrides). The
+	// canvas runs the same check client-side against GET /org/templates
+	// output and shows a modal so users set keys before clicking Import;
+	// this server-side check is the authoritative guard in case a caller
+	// bypasses the UI (CLI, API clients, etc.). 412 Precondition Failed
+	// carries the missing-key list so tooling can render the same
+	// add-key flow.
+	required, _ := collectOrgEnv(&tmpl)
+	if body.Force {
+		// Log the bypass so a post-incident search can find who
+		// imported an org with missing creds. The common audit flow
+		// treats log.Printf at INFO as the low-cost trail for
+		// explicit-override actions — keeps force as a supported
+		// knob but makes it investigable.
+		log.Printf("Org import: force=true bypass — template=%q, required_env=%v", tmpl.Name, required)
+	} else if len(required) > 0 {
+		ctx := c.Request.Context()
+		configured, err := loadConfiguredGlobalSecretKeys(ctx)
+		if err != nil {
+			// Fail closed. Previously this fell through and imported
+			// anyway, defeating the preflight for exactly the case
+			// it's meant to cover. A DB hiccup should look like a
+			// retryable 500, not a silent green light for an import
+			// that will fail at container-start time on every node.
+			log.Printf("Org import preflight: global secrets lookup failed: %v", err)
+			c.JSON(http.StatusInternalServerError, gin.H{
+				"error": "could not verify required environment variables; try again or pass force=true to override",
+			})
+			return
+		}
+		var missing []EnvRequirement
+		for _, req := range required {
+			// For a single requirement this is exact-match; for an
+			// any-of group, any one member satisfies. Groups whose
+			// alternative is already configured drop out here — the
+			// user doesn't need to re-configure them.
+			if !req.IsSatisfied(configured) {
+				missing = append(missing, req)
+			}
+		}
+		if len(missing) > 0 {
+			c.JSON(http.StatusPreconditionFailed, gin.H{
+				"error":        "missing required environment variables",
+				"missing_env":  missing,
+				"required_env": required,
+				"template":     tmpl.Name,
+				"suggestion":   "set these as global secrets (POST /settings/secrets) or pass force=true to override",
+			})
+			return
+		}
+	}
+
 	results := []map[string]interface{}{}
 	var createErr error
 
@@ -426,7 +620,8 @@ func (h *OrgHandler) Import(c *gin.Context) {
 	// using subtree-aware grid slots (children that are themselves
 	// parents get a bigger slot so they don't overflow into siblings).
 	for _, ws := range tmpl.Workspaces {
-		if err := h.createWorkspaceTree(ws, nil, ws.Canvas.X, ws.Canvas.Y, tmpl.Defaults, orgBaseDir, &results, provisionSem); err != nil {
+		// Root: relX/relY == absX/absY (no parent to be relative to).
+		if err := h.createWorkspaceTree(ws, nil, ws.Canvas.X, ws.Canvas.Y, ws.Canvas.X, ws.Canvas.Y, tmpl.Defaults, orgBaseDir, &results, provisionSem); err != nil {
 			createErr = err
 			break
 		}
diff --git a/workspace-server/internal/handlers/org_import.go b/workspace-server/internal/handlers/org_import.go
index 5e50e7ae..fd4a39a2 100644
--- a/workspace-server/internal/handlers/org_import.go
+++ b/workspace-server/internal/handlers/org_import.go
@@ -10,6 +10,8 @@ import (
 	"log"
 	"os"
 	"path/filepath"
+	"regexp"
+	"sort"
 	"strings"
 	"time"
 
@@ -28,7 +30,13 @@ import (
 // parent.abs + childSlotInGrid(index, siblingSizes) computed by the
 // caller. Storing already-absolute coords means a child that is itself
 // a parent can simply compound the grid without any per-call math.
-func (h *OrgHandler) createWorkspaceTree(ws OrgWorkspace, parentID *string, absX, absY float64, defaults OrgDefaults, orgBaseDir string, results *[]map[string]interface{}, provisionSem chan struct{}) error {
+// relX / relY are THIS workspace's position RELATIVE to its parent's
+// absolute origin (i.e. childSlotInGrid output for children; 0,0 for
+// roots since a root's absolute IS its relative). The broadcast
+// payload ships relative coords so the canvas can drop the node
+// straight into the parent's child-coordinate space without doing a
+// canvas-wide absolute-position walk.
+func (h *OrgHandler) createWorkspaceTree(ws OrgWorkspace, parentID *string, absX, absY, relX, relY float64, defaults OrgDefaults, orgBaseDir string, results *[]map[string]interface{}, provisionSem chan struct{}) error {
 	// Apply defaults
 	runtime := ws.Runtime
 	if runtime == "" {
@@ -103,10 +111,14 @@ func (h *OrgHandler) createWorkspaceTree(ws OrgWorkspace, parentID *string, absX
 	// (see canvas-topology.ts), so imports don't spray the viewport.
 	initialCollapsed := false
 
+	maxConcurrent := ws.MaxConcurrentTasks
+	if maxConcurrent <= 0 {
+		maxConcurrent = models.DefaultMaxConcurrentTasks
+	}
 	_, err := db.DB.ExecContext(ctx, `
-		INSERT INTO workspaces (id, name, role, tier, runtime, awareness_namespace, status, parent_id, workspace_dir, workspace_access)
-		VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10)
-	`, id, ws.Name, role, tier, runtime, awarenessNS, "provisioning", parentID, workspaceDir, workspaceAccess)
+		INSERT INTO workspaces (id, name, role, tier, runtime, awareness_namespace, status, parent_id, workspace_dir, workspace_access, max_concurrent_tasks)
+		VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11)
+	`, id, ws.Name, role, tier, runtime, awarenessNS, "provisioning", parentID, workspaceDir, workspaceAccess, maxConcurrent)
 	if err != nil {
 		log.Printf("Org import: failed to create %s: %v", ws.Name, err)
 		return fmt.Errorf("failed to create %s: %w", ws.Name, err)
@@ -128,10 +140,23 @@ func (h *OrgHandler) createWorkspaceTree(ws OrgWorkspace, parentID *string, absX
 	}
 
 	// Broadcast — include runtime so the canvas pill renders the right
-	// badge immediately instead of "unknown".
-	h.broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_PROVISIONING", id, map[string]interface{}{
+	// badge immediately instead of "unknown". parent_id + x/y let the
+	// canvas's org-deploy animation spawn the child from the parent's
+	// current coords and tween into its reserved slot, instead of
+	// landing in a default grid position first and snapping on the
+	// next hydrate.
+	payload := map[string]interface{}{
 		"name": ws.Name, "tier": tier, "runtime": runtime,
-	})
+		// Parent-relative coords — the canvas's React Flow node uses
+		// these as the node's position when parent_id is set (React
+		// Flow treats node.position as parent-relative when the node
+		// has a parentId). For roots, relX/relY == absX/absY.
+		"x": relX, "y": relY,
+	}
+	if parentID != nil {
+		payload["parent_id"] = *parentID
+	}
+	h.broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_PROVISIONING", id, payload)
 
 	// Seed initial memories from workspace config or defaults (issue #1050).
 	// Per-workspace initial_memories override defaults; if workspace has none,
@@ -509,7 +534,9 @@ func (h *OrgHandler) createWorkspaceTree(ws OrgWorkspace, parentID *string, absX
 			slotX, slotY := childSlotInGrid(i, siblingSizes)
 			childAbsX := absX + slotX
 			childAbsY := absY + slotY
-			if err := h.createWorkspaceTree(child, &id, childAbsX, childAbsY, defaults, orgBaseDir, results, provisionSem); err != nil {
+			// slotX/slotY are already parent-relative — that's
+			// exactly what childSlotInGrid returns.
+			if err := h.createWorkspaceTree(child, &id, childAbsX, childAbsY, slotX, slotY, defaults, orgBaseDir, results, provisionSem); err != nil {
 				return err
 			}
 			time.Sleep(workspaceCreatePacingMs * time.Millisecond)
@@ -519,6 +546,213 @@ func (h *OrgHandler) createWorkspaceTree(ws OrgWorkspace, parentID *string, absX
 	return nil
 }
 
+// envVarNamePattern guards template-supplied env var names against
+// pathological inputs. A malicious template could ship
+// required_env: ["'; DROP …"] or whitespace-only entries that would
+// flow through collectOrgEnv → into the 412 response body and,
+// worse, into the modal's PUT /settings/secrets input. Schema
+// already has `key TEXT NOT NULL UNIQUE` and our queries are
+// parameterised so SQL injection isn't the threat — the real risks
+// are UI rendering weirdness (newlines, NUL bytes, zero-width chars)
+// and downstream env-var semantics (POSIX requires uppercase +
+// underscore + digit). A strict regex filters both classes of
+// problem at a single choke point.
+var envVarNamePattern = regexp.MustCompile(`^[A-Z][A-Z0-9_]{0,127}$`)
+
+// sanitizeEnvMembers filters a requirement's member list through the
+// name-validation regex, logging rejections. Returns the filtered
+// list and a boolean indicating whether any valid members remain.
+// Used so a group containing one valid + one bogus name is kept
+// (valid member carries the group) rather than silently dropped.
+func sanitizeEnvMembers(members []string, where string) ([]string, bool) {
+	out := make([]string, 0, len(members))
+	for _, k := range members {
+		if !envVarNamePattern.MatchString(k) {
+			if k != "" {
+				log.Printf("collectOrgEnv: rejecting invalid env var name %q from %s (must match %s)", k, where, envVarNamePattern)
+			}
+			continue
+		}
+		out = append(out, k)
+	}
+	return out, len(out) > 0
+}
+
+// envRequirementKey canonicalises a requirement for dedup — sorted
+// member list joined with NUL so `any_of: [A, B]` and `any_of: [B, A]`
+// collapse to the same key. Single requirements are length-1 groups.
+func envRequirementKey(members []string) string {
+	cp := append([]string(nil), members...)
+	sort.Strings(cp)
+	return strings.Join(cp, "\x00")
+}
+
+// collectOrgEnv walks the whole template tree and returns the union of
+// required_env and recommended_env declared anywhere — at the org
+// level, on root workspaces, or on any nested child. Deduplicates by
+// group membership (same set of members = same requirement) and
+// sorts deterministically so the canvas sees a stable order.
+//
+// "Required wins" rules:
+//
+//   - A requirement that appears in BOTH required and recommended
+//     (same members) surfaces only as required.
+//   - A single-name requirement (e.g. "API_KEY") and a group that
+//     contains that same name (e.g. {any_of: [API_KEY, OTHER]}) are
+//     NOT deduplicated — they're semantically different (strict vs
+//     satisfiable-by-alternative) and the stricter "single" one wins,
+//     so the any-of group is dropped when its members overlap with a
+//     strict requirement declared elsewhere.
+//
+// Invalid names fail envVarNamePattern; the filter is applied per
+// group so a group with one bogus entry keeps the rest. A group
+// whose ALL members are invalid is dropped entirely with a log.
+func collectOrgEnv(tmpl *OrgTemplate) (required, recommended []EnvRequirement) {
+	reqByKey := map[string]EnvRequirement{}
+	recByKey := map[string]EnvRequirement{}
+	// Names covered by strict (single) required entries. A group in
+	// EITHER tier whose any-of contains ONE of these names is
+	// dominated by the strict requirement and gets dropped on the
+	// second pass.
+	strictRequiredNames := map[string]struct{}{}
+
+	accept := func(into map[string]EnvRequirement, src []EnvRequirement, where string, markStrict bool) {
+		for _, req := range src {
+			members, ok := sanitizeEnvMembers(req.Members(), where)
+			if !ok {
+				continue
+			}
+			key := envRequirementKey(members)
+			if _, exists := into[key]; exists {
+				continue
+			}
+			if req.Name != "" && len(members) == 1 {
+				into[key] = EnvRequirement{Name: members[0]}
+				if markStrict {
+					strictRequiredNames[members[0]] = struct{}{}
+				}
+			} else {
+				into[key] = EnvRequirement{AnyOf: members}
+			}
+		}
+	}
+	accept(reqByKey, tmpl.RequiredEnv, "template root", true)
+	accept(recByKey, tmpl.RecommendedEnv, "template root", false)
+	var walk func([]OrgWorkspace)
+	walk = func(ws []OrgWorkspace) {
+		for _, w := range ws {
+			accept(reqByKey, w.RequiredEnv, "workspace "+w.Name, true)
+			accept(recByKey, w.RecommendedEnv, "workspace "+w.Name, false)
+			walk(w.Children)
+		}
+	}
+	walk(tmpl.Workspaces)
+
+	// Required wins across tiers: any requirement whose members
+	// overlap with a strict required name gets dropped from
+	// recommended. Keeps the canvas modal from showing the same
+	// key in both sections.
+	prune := func(from map[string]EnvRequirement) {
+		for k, r := range from {
+			for _, m := range r.Members() {
+				if _, strict := strictRequiredNames[m]; strict {
+					delete(from, k)
+					break
+				}
+			}
+		}
+	}
+	prune(recByKey)
+
+	// Same-tier: a strict required X dominates any-of groups in
+	// required that CONTAIN X (a group saying "any of X, Y" is
+	// automatically satisfied when X is required anyway, so it's
+	// redundant). Same logic applied to recommended.
+	pruneSameTier := func(tier map[string]EnvRequirement) {
+		strictInTier := map[string]struct{}{}
+		for _, r := range tier {
+			if r.Name != "" {
+				strictInTier[r.Name] = struct{}{}
+			}
+		}
+		for k, r := range tier {
+			if len(r.AnyOf) == 0 {
+				continue
+			}
+			for _, m := range r.AnyOf {
+				if _, strict := strictInTier[m]; strict {
+					delete(tier, k)
+					break
+				}
+			}
+		}
+	}
+	pruneSameTier(reqByKey)
+	pruneSameTier(recByKey)
+
+	required = flattenAndSortRequirements(reqByKey)
+	recommended = flattenAndSortRequirements(recByKey)
+	return required, recommended
+}
+
+func flattenAndSortRequirements(by map[string]EnvRequirement) []EnvRequirement {
+	out := make([]EnvRequirement, 0, len(by))
+	for _, r := range by {
+		out = append(out, r)
+	}
+	sort.Slice(out, func(i, j int) bool {
+		// Sort singles first by name; groups after, ordered by
+		// joined-member string. Gives the canvas a deterministic
+		// render order so the same template always produces the
+		// same modal layout.
+		iSingle := out[i].Name != ""
+		jSingle := out[j].Name != ""
+		if iSingle != jSingle {
+			return iSingle
+		}
+		if iSingle {
+			return out[i].Name < out[j].Name
+		}
+		return envRequirementKey(out[i].AnyOf) < envRequirementKey(out[j].AnyOf)
+	})
+	return out
+}
+
+// loadConfiguredGlobalSecretKeys returns the set of key names present
+// in global_secrets WHERE the encrypted_value is non-empty. Filtering
+// on the payload size catches the failure mode where a row was
+// upserted with an empty value (historical rows predating the
+// binding:"required" guard on SetGlobal, or a future direct SQL
+// path that skips it) — the preflight would otherwise report the
+// key as "configured" and the per-container preflight would still
+// fail at start time, defeating the whole feature.
+// The LIMIT is a sanity cap: at realistic tenant sizes (< 1k
+// secrets) it's a no-op; at pathological sizes it stops one slow
+// query from wedging org imports. A hit gets logged so operators
+// can investigate.
+const globalSecretsPreflightLimit = 10000
+
+func loadConfiguredGlobalSecretKeys(ctx context.Context) (map[string]struct{}, error) {
+	rows, err := db.DB.QueryContext(ctx,
+		`SELECT key FROM global_secrets WHERE octet_length(encrypted_value) > 0 LIMIT $1`,
+		globalSecretsPreflightLimit)
+	if err != nil {
+		return nil, err
+	}
+	defer rows.Close()
+	out := map[string]struct{}{}
+	for rows.Next() {
+		var k string
+		if scanErr := rows.Scan(&k); scanErr == nil && k != "" {
+			out[k] = struct{}{}
+		}
+	}
+	if len(out) == globalSecretsPreflightLimit {
+		log.Printf("loadConfiguredGlobalSecretKeys: hit LIMIT %d — org-import preflight may be incomplete", globalSecretsPreflightLimit)
+	}
+	return out, rows.Err()
+}
+
 func countWorkspaces(workspaces []OrgWorkspace) int {
 	count := len(workspaces)
 	for _, ws := range workspaces {
diff --git a/workspace-server/internal/handlers/org_test.go b/workspace-server/internal/handlers/org_test.go
index 7af674b2..19dbece9 100644
--- a/workspace-server/internal/handlers/org_test.go
+++ b/workspace-server/internal/handlers/org_test.go
@@ -1,6 +1,7 @@
 package handlers
 
 import (
+	"sort"
 	"strings"
 	"testing"
 	"time"
@@ -650,3 +651,428 @@ func TestOrgImport_ScheduleComputeError(t *testing.T) {
 		})
 	}
 }
+
+// ============================================================================
+// Org env-preflight aggregation (collectOrgEnv)
+// ============================================================================
+
+// strictReq builds a slice of single-name EnvRequirements for test
+// fixtures. Equivalent to the old []string literal but wrapped in
+// the new union shape.
+func strictReq(names ...string) []EnvRequirement {
+	out := make([]EnvRequirement, 0, len(names))
+	for _, n := range names {
+		out = append(out, EnvRequirement{Name: n})
+	}
+	return out
+}
+
+// anyOfReq builds a single any-of EnvRequirement for test fixtures.
+func anyOfReq(names ...string) EnvRequirement {
+	return EnvRequirement{AnyOf: append([]string(nil), names...)}
+}
+
+// reqNames flattens a slice of EnvRequirements into a single comparable
+// slice: single-name reqs contribute their Name, any-of reqs contribute
+// "anyOf(A|B|C)" with members sorted for deterministic output. Lets
+// tests assert against a string form regardless of which kind each
+// entry takes.
+func reqNames(reqs []EnvRequirement) []string {
+	out := make([]string, 0, len(reqs))
+	for _, r := range reqs {
+		if r.Name != "" {
+			out = append(out, r.Name)
+			continue
+		}
+		members := append([]string(nil), r.AnyOf...)
+		sort.Strings(members)
+		out = append(out, "anyOf("+strings.Join(members, "|")+")")
+	}
+	return out
+}
+
+func TestCollectOrgEnv_UnionAcrossLevels(t *testing.T) {
+	tmpl := &OrgTemplate{
+		RequiredEnv:    strictReq("ANTHROPIC_API_KEY"),
+		RecommendedEnv: strictReq("SLACK_WEBHOOK_URL"),
+		Workspaces: []OrgWorkspace{
+			{
+				Name:        "Root",
+				RequiredEnv: strictReq("GITHUB_TOKEN"),
+				Children: []OrgWorkspace{
+					{
+						Name:           "Leaf",
+						RequiredEnv:    strictReq("OPENROUTER_API_KEY"),
+						RecommendedEnv: strictReq("DISCORD_WEBHOOK_URL"),
+					},
+				},
+			},
+		},
+	}
+	req, rec := collectOrgEnv(tmpl)
+	// Required is the union of top-level + root + leaf.
+	wantReq := []string{"ANTHROPIC_API_KEY", "GITHUB_TOKEN", "OPENROUTER_API_KEY"}
+	if !stringSlicesEqual(reqNames(req), wantReq) {
+		t.Errorf("required mismatch: got %v, want %v", reqNames(req), wantReq)
+	}
+	wantRec := []string{"DISCORD_WEBHOOK_URL", "SLACK_WEBHOOK_URL"}
+	if !stringSlicesEqual(reqNames(rec), wantRec) {
+		t.Errorf("recommended mismatch: got %v, want %v", reqNames(rec), wantRec)
+	}
+}
+
+func TestCollectOrgEnv_RequiredWinsOverRecommended(t *testing.T) {
+	// Same key declared at one layer as recommended and another as
+	// required MUST surface only on the required side — a required
+	// declaration is strictly stricter than a recommended one, and
+	// listing it in both tiers would confuse the preflight modal.
+	tmpl := &OrgTemplate{
+		RecommendedEnv: strictReq("API_KEY"),
+		Workspaces: []OrgWorkspace{
+			{Name: "X", RequiredEnv: strictReq("API_KEY")},
+		},
+	}
+	req, rec := collectOrgEnv(tmpl)
+	if len(req) != 1 || req[0].Name != "API_KEY" {
+		t.Errorf("required should contain API_KEY, got %v", reqNames(req))
+	}
+	for _, r := range rec {
+		if r.Name == "API_KEY" {
+			t.Errorf("API_KEY must not appear in recommended once required elsewhere")
+		}
+	}
+}
+
+func TestCollectOrgEnv_Dedup(t *testing.T) {
+	// Same key declared twice at different levels should appear once.
+	tmpl := &OrgTemplate{
+		RequiredEnv: strictReq("K", "K"),
+		Workspaces: []OrgWorkspace{
+			{Name: "A", RequiredEnv: strictReq("K")},
+			{Name: "B", RequiredEnv: strictReq("K"), Children: []OrgWorkspace{
+				{Name: "C", RequiredEnv: strictReq("K")},
+			}},
+		},
+	}
+	req, _ := collectOrgEnv(tmpl)
+	if len(req) != 1 || req[0].Name != "K" {
+		t.Errorf("dedup failed: got %v, want [K]", reqNames(req))
+	}
+}
+
+func TestCollectOrgEnv_Empty(t *testing.T) {
+	tmpl := &OrgTemplate{}
+	req, rec := collectOrgEnv(tmpl)
+	if len(req) != 0 || len(rec) != 0 {
+		t.Errorf("empty template should produce empty slices, got req=%v rec=%v", reqNames(req), reqNames(rec))
+	}
+}
+
+// stringSlicesEqual checks ordered equality — collectOrgEnv sorts its
+// output so callers can do deterministic comparisons.
+func stringSlicesEqual(a, b []string) bool {
+	if len(a) != len(b) {
+		return false
+	}
+	for i := range a {
+		if a[i] != b[i] {
+			return false
+		}
+	}
+	return true
+}
+
+func TestCollectOrgEnv_RequiredWinsOnSameStruct(t *testing.T) {
+	// The same key declared required AND recommended on the SAME
+	// workspace node (rare but legal to parse) must still dedup
+	// correctly and end up required-only.
+	tmpl := &OrgTemplate{
+		Workspaces: []OrgWorkspace{
+			{
+				Name:           "X",
+				RequiredEnv:    strictReq("API_KEY"),
+				RecommendedEnv: strictReq("API_KEY"),
+			},
+		},
+	}
+	req, rec := collectOrgEnv(tmpl)
+	if len(req) != 1 || req[0].Name != "API_KEY" {
+		t.Errorf("required should contain API_KEY once, got %v", reqNames(req))
+	}
+	for _, r := range rec {
+		if r.Name == "API_KEY" {
+			t.Errorf("API_KEY must not appear in recommended when also required on same struct")
+		}
+	}
+}
+
+func TestCollectOrgEnv_RejectsInvalidNames(t *testing.T) {
+	// Names failing envVarNamePattern (lowercase, traversal, whitespace,
+	// shell metachars) must be dropped silently — the log line is not
+	// asserted here; the output slice assertion is enough to prove the
+	// filter fires.
+	tmpl := &OrgTemplate{
+		RequiredEnv: strictReq(
+			"VALID_ONE",
+			"lowercase_bad",
+			"../../etc/passwd",
+			"name with spaces",
+			"WITH-DASH",
+			"'; DROP TABLE users;--",
+			"",
+			"A", // single char — still valid per regex
+		),
+	}
+	req, _ := collectOrgEnv(tmpl)
+	if !stringSlicesEqual(reqNames(req), []string{"A", "VALID_ONE"}) {
+		t.Errorf("expected only valid names, got %v", reqNames(req))
+	}
+}
+
+// TestOrgTemplate_ClaudeAnyOfAuthPreflight exercises the shape the
+// ux-ab-lab template ships with: a single any-of group at the org
+// level covering ANTHROPIC_API_KEY vs. CLAUDE_CODE_OAUTH_TOKEN, plus
+// two strict recommended entries (SERPER_API_KEY, VERCEL_TOKEN).
+// Proves the end-to-end YAML → OrgTemplate → collectOrgEnv → IsSatisfied
+// pipeline works for the canonical "Claude sub OR API key" pattern
+// without depending on the on-disk template file (org-templates/ is
+// populated by the clone-manifest, not tracked in this monorepo).
+func TestOrgTemplate_ClaudeAnyOfAuthPreflight(t *testing.T) {
+	src := `
+name: UX A/B Lab
+required_env:
+  - any_of:
+      - ANTHROPIC_API_KEY
+      - CLAUDE_CODE_OAUTH_TOKEN
+recommended_env:
+  - SERPER_API_KEY
+  - VERCEL_TOKEN
+workspaces:
+  - name: Design Director
+    children:
+      - name: UX Researcher
+      - name: Visual Designer
+      - name: React Engineer
+      - name: Deploy Engineer
+      - name: A11y + SEO Auditor
+      - name: Perf Auditor
+`
+	var tmpl OrgTemplate
+	if err := yaml.Unmarshal([]byte(src), &tmpl); err != nil {
+		t.Fatalf("unmarshal: %v", err)
+	}
+	if len(tmpl.Workspaces) != 1 || len(tmpl.Workspaces[0].Children) != 6 {
+		t.Fatalf("expected 1 root with 6 children, got shape %+v", tmpl.Workspaces)
+	}
+
+	required, recommended := collectOrgEnv(&tmpl)
+	if len(required) != 1 {
+		t.Fatalf("expected 1 required requirement (the any-of group), got %d: %v", len(required), reqNames(required))
+	}
+	if required[0].Name != "" {
+		t.Errorf("expected any-of group, got strict name %q", required[0].Name)
+	}
+	wantMembers := []string{"ANTHROPIC_API_KEY", "CLAUDE_CODE_OAUTH_TOKEN"}
+	got := append([]string(nil), required[0].AnyOf...)
+	sort.Strings(got)
+	if !stringSlicesEqual(got, wantMembers) {
+		t.Errorf("any-of members mismatch: got %v, want %v", got, wantMembers)
+	}
+
+	// Either member should independently satisfy the group.
+	if !required[0].IsSatisfied(map[string]struct{}{"ANTHROPIC_API_KEY": {}}) {
+		t.Errorf("ANTHROPIC_API_KEY alone should satisfy the group")
+	}
+	if !required[0].IsSatisfied(map[string]struct{}{"CLAUDE_CODE_OAUTH_TOKEN": {}}) {
+		t.Errorf("CLAUDE_CODE_OAUTH_TOKEN alone should satisfy the group")
+	}
+	if required[0].IsSatisfied(map[string]struct{}{"OPENAI_API_KEY": {}}) {
+		t.Errorf("unrelated key should NOT satisfy the group")
+	}
+
+	wantRec := []string{"SERPER_API_KEY", "VERCEL_TOKEN"}
+	if !stringSlicesEqual(reqNames(recommended), wantRec) {
+		t.Errorf("recommended mismatch: got %v, want %v", reqNames(recommended), wantRec)
+	}
+}
+
+// TestEnvRequirement_UnmarshalYAML proves the on-disk YAML shape
+// (scalar OR `{any_of: [...]}` block) round-trips into EnvRequirement
+// correctly. The preflight pipeline reads user-authored org.yaml
+// files; a regression here would silently drop requirements.
+func TestEnvRequirement_UnmarshalYAML(t *testing.T) {
+	src := `
+required_env:
+  - GITHUB_TOKEN
+  - any_of:
+      - ANTHROPIC_API_KEY
+      - CLAUDE_CODE_OAUTH_TOKEN
+`
+	var parsed struct {
+		RequiredEnv []EnvRequirement `yaml:"required_env"`
+	}
+	if err := yaml.Unmarshal([]byte(src), &parsed); err != nil {
+		t.Fatalf("unmarshal failed: %v", err)
+	}
+	if len(parsed.RequiredEnv) != 2 {
+		t.Fatalf("want 2 requirements, got %d", len(parsed.RequiredEnv))
+	}
+	if parsed.RequiredEnv[0].Name != "GITHUB_TOKEN" {
+		t.Errorf("first should be strict GITHUB_TOKEN, got %+v", parsed.RequiredEnv[0])
+	}
+	if parsed.RequiredEnv[1].Name != "" || len(parsed.RequiredEnv[1].AnyOf) != 2 {
+		t.Errorf("second should be any-of group, got %+v", parsed.RequiredEnv[1])
+	}
+}
+
+// TestEnvRequirement_UnmarshalYAML_RejectsEmptyAnyOf guards against a
+// template that ships `any_of: []` — ambiguous semantics (impossible
+// to satisfy), so the parser must fail loudly rather than silently
+// pass a never-satisfiable requirement through the preflight.
+func TestEnvRequirement_UnmarshalYAML_RejectsEmptyAnyOf(t *testing.T) {
+	src := `
+required_env:
+  - any_of: []
+`
+	var parsed struct {
+		RequiredEnv []EnvRequirement `yaml:"required_env"`
+	}
+	err := yaml.Unmarshal([]byte(src), &parsed)
+	if err == nil {
+		t.Errorf("expected error for empty any_of, got nil: %+v", parsed)
+	}
+}
+
+// ---------------------------------------------------------------------
+// any_of group tests — the new EnvRequirement union shape allows a
+// single requirement to be satisfied by any of a list of members (e.g.
+// ANTHROPIC_API_KEY OR CLAUDE_CODE_OAUTH_TOKEN). collectOrgEnv +
+// IsSatisfied together must handle this correctly.
+// ---------------------------------------------------------------------
+
+func TestEnvRequirement_IsSatisfied(t *testing.T) {
+	configured := map[string]struct{}{
+		"ANTHROPIC_API_KEY": {},
+		"GITHUB_TOKEN":      {},
+	}
+	tests := []struct {
+		name string
+		req  EnvRequirement
+		want bool
+	}{
+		{"strict present", EnvRequirement{Name: "ANTHROPIC_API_KEY"}, true},
+		{"strict absent", EnvRequirement{Name: "MISSING_KEY"}, false},
+		{"any-of first member present", anyOfReq("ANTHROPIC_API_KEY", "CLAUDE_CODE_OAUTH_TOKEN"), true},
+		{"any-of second member present", anyOfReq("CLAUDE_CODE_OAUTH_TOKEN", "ANTHROPIC_API_KEY"), true},
+		{"any-of none present", anyOfReq("OPENAI_API_KEY", "CLAUDE_CODE_OAUTH_TOKEN"), false},
+		{"any-of single member present", anyOfReq("GITHUB_TOKEN"), true},
+	}
+	for _, tt := range tests {
+		if got := tt.req.IsSatisfied(configured); got != tt.want {
+			t.Errorf("%s: got %v, want %v", tt.name, got, tt.want)
+		}
+	}
+}
+
+func TestCollectOrgEnv_AnyOfGroupPreserved(t *testing.T) {
+	// A group with two alternatives should come through as a single
+	// EnvRequirement carrying both members.
+	tmpl := &OrgTemplate{
+		RequiredEnv: []EnvRequirement{
+			anyOfReq("ANTHROPIC_API_KEY", "CLAUDE_CODE_OAUTH_TOKEN"),
+		},
+	}
+	req, _ := collectOrgEnv(tmpl)
+	if len(req) != 1 {
+		t.Fatalf("expected 1 requirement, got %d: %v", len(req), reqNames(req))
+	}
+	if req[0].Name != "" {
+		t.Errorf("expected any-of group, got strict name %q", req[0].Name)
+	}
+	wantMembers := []string{"ANTHROPIC_API_KEY", "CLAUDE_CODE_OAUTH_TOKEN"}
+	got := append([]string(nil), req[0].AnyOf...)
+	sort.Strings(got)
+	if !stringSlicesEqual(got, wantMembers) {
+		t.Errorf("any-of members mismatch: got %v, want %v", got, wantMembers)
+	}
+}
+
+func TestCollectOrgEnv_AnyOfGroupDedup(t *testing.T) {
+	// Two identical groups (members in different order) declared at
+	// different levels must collapse to one.
+	tmpl := &OrgTemplate{
+		RequiredEnv: []EnvRequirement{
+			anyOfReq("ANTHROPIC_API_KEY", "CLAUDE_CODE_OAUTH_TOKEN"),
+		},
+		Workspaces: []OrgWorkspace{
+			{
+				Name: "Root",
+				RequiredEnv: []EnvRequirement{
+					anyOfReq("CLAUDE_CODE_OAUTH_TOKEN", "ANTHROPIC_API_KEY"),
+				},
+			},
+		},
+	}
+	req, _ := collectOrgEnv(tmpl)
+	if len(req) != 1 {
+		t.Errorf("expected 1 requirement after dedup, got %d: %v", len(req), reqNames(req))
+	}
+}
+
+func TestCollectOrgEnv_StrictDominatesGroup(t *testing.T) {
+	// If a strict requirement X is declared anywhere, any-of groups
+	// that CONTAIN X are redundant — the strict requirement will force
+	// X to be configured, which satisfies any group mentioning it too.
+	// Same-tier pruning drops the group.
+	tmpl := &OrgTemplate{
+		RequiredEnv: []EnvRequirement{
+			{Name: "ANTHROPIC_API_KEY"},
+			anyOfReq("ANTHROPIC_API_KEY", "CLAUDE_CODE_OAUTH_TOKEN"),
+		},
+	}
+	req, _ := collectOrgEnv(tmpl)
+	if len(req) != 1 || req[0].Name != "ANTHROPIC_API_KEY" {
+		t.Errorf("strict should dominate group, got %v", reqNames(req))
+	}
+}
+
+func TestCollectOrgEnv_StrictRequiredDominatesRecommendedGroup(t *testing.T) {
+	// Cross-tier: a strict required X drops any-of groups in the
+	// recommended tier that mention X.
+	tmpl := &OrgTemplate{
+		RequiredEnv: strictReq("ANTHROPIC_API_KEY"),
+		RecommendedEnv: []EnvRequirement{
+			anyOfReq("ANTHROPIC_API_KEY", "CLAUDE_CODE_OAUTH_TOKEN"),
+			{Name: "SLACK_WEBHOOK_URL"},
+		},
+	}
+	req, rec := collectOrgEnv(tmpl)
+	if len(req) != 1 || req[0].Name != "ANTHROPIC_API_KEY" {
+		t.Errorf("required mismatch: got %v", reqNames(req))
+	}
+	// The any-of group should have been pruned; only SLACK remains.
+	if len(rec) != 1 || rec[0].Name != "SLACK_WEBHOOK_URL" {
+		t.Errorf("recommended mismatch: got %v, want [SLACK_WEBHOOK_URL]", reqNames(rec))
+	}
+}
+
+func TestCollectOrgEnv_AnyOfWithInvalidMemberKeepsValidOnes(t *testing.T) {
+	// A group with one valid + one invalid member should keep the
+	// valid one (group carried by any remaining legitimate name). A
+	// group where ALL members are invalid is dropped entirely.
+	tmpl := &OrgTemplate{
+		RequiredEnv: []EnvRequirement{
+			anyOfReq("VALID_ONE", "lowercase_bad"),
+			anyOfReq("'; DROP TABLE;--", ""),
+		},
+	}
+	req, _ := collectOrgEnv(tmpl)
+	if len(req) != 1 {
+		t.Fatalf("expected 1 requirement, got %d: %v", len(req), reqNames(req))
+	}
+	// The remaining group has only one valid member, so it gets
+	// promoted to a single-name requirement (len(members)==1 path).
+	if req[0].Name != "VALID_ONE" && !stringSlicesEqual(req[0].AnyOf, []string{"VALID_ONE"}) {
+		t.Errorf("expected VALID_ONE to survive, got %v", reqNames(req))
+	}
+}
diff --git a/workspace-server/internal/handlers/registry.go b/workspace-server/internal/handlers/registry.go
index e5be5553..2a07264e 100644
--- a/workspace-server/internal/handlers/registry.go
+++ b/workspace-server/internal/handlers/registry.go
@@ -454,6 +454,29 @@ func (h *RegistryHandler) evaluateStatus(c *gin.Context, payload models.Heartbea
 		return
 	}
 
+	// Self-reported runtime wedge: takes precedence over the error_rate
+	// path. The heartbeat task lives in its own asyncio task and keeps
+	// firing 200s even after claude_agent_sdk locks up on
+	// `Control request timeout: initialize` — so error_rate stays at 0
+	// (no calls have been recorded as errors yet) while every actual
+	// /a2a POST hangs. The workspace tells us about that case via
+	// runtime_state="wedged"; we honor it directly. Sample_error from
+	// the heartbeat carries the human-readable reason ("SDK init
+	// timeout — restart workspace"), which the canvas surfaces in the
+	// degraded card without the operator scraping container logs.
+	if payload.RuntimeState == "wedged" && currentStatus == "online" {
+		_, err := db.DB.ExecContext(ctx,
+			`UPDATE workspaces SET status = 'degraded', updated_at = now() WHERE id = $1 AND status = 'online'`,
+			payload.WorkspaceID)
+		if err != nil {
+			log.Printf("Heartbeat: failed to mark %s degraded (wedged): %v", payload.WorkspaceID, err)
+		}
+		h.broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_DEGRADED", payload.WorkspaceID, map[string]interface{}{
+			"runtime_state": "wedged",
+			"sample_error":  payload.SampleError,
+		})
+	}
+
 	if currentStatus == "online" && payload.ErrorRate >= 0.5 {
 		if _, err := db.DB.ExecContext(ctx, `UPDATE workspaces SET status = 'degraded', updated_at = now() WHERE id = $1`, payload.WorkspaceID); err != nil {
 			log.Printf("Heartbeat: failed to mark %s degraded: %v", payload.WorkspaceID, err)
@@ -464,7 +487,13 @@ func (h *RegistryHandler) evaluateStatus(c *gin.Context, payload models.Heartbea
 		})
 	}
 
-	if currentStatus == "degraded" && payload.ErrorRate < 0.1 {
+	// Recovery from degraded → online when BOTH the error rate has
+	// fallen back AND the workspace is no longer reporting a wedge.
+	// The wedge condition is sticky for the process lifetime
+	// (claude_sdk_executor only clears it on restart), so when the
+	// container restarts and starts heartbeating fresh — RuntimeState
+	// is empty, error_rate is 0 — this branch flips us back to online.
+	if currentStatus == "degraded" && payload.ErrorRate < 0.1 && payload.RuntimeState == "" {
 		if _, err := db.DB.ExecContext(ctx, `UPDATE workspaces SET status = 'online', updated_at = now() WHERE id = $1`, payload.WorkspaceID); err != nil {
 			log.Printf("Heartbeat: failed to recover %s to online: %v", payload.WorkspaceID, err)
 		}
diff --git a/workspace-server/internal/handlers/registry_test.go b/workspace-server/internal/handlers/registry_test.go
index 62c9e984..9f33b140 100644
--- a/workspace-server/internal/handlers/registry_test.go
+++ b/workspace-server/internal/handlers/registry_test.go
@@ -298,6 +298,163 @@ func TestHeartbeatHandler_OnlineStaysOnline(t *testing.T) {
 	}
 }
 
+// ==================== Heartbeat — runtime wedge (claude_agent_sdk init timeout) ====================
+
+// TestHeartbeatHandler_RuntimeWedged_FlipsOnlineToDegraded verifies the
+// runtime_state="wedged" path. Heartbeat task in the workspace lives in
+// its own asyncio task and keeps reporting online while the Claude SDK
+// is wedged on Control request timeout; the workspace tells us about
+// the wedge via this field, and we honor it by flipping status →
+// degraded with the wedge reason in last_sample_error.
+func TestHeartbeatHandler_RuntimeWedged_FlipsOnlineToDegraded(t *testing.T) {
+	mock := setupTestDB(t)
+	setupTestRedis(t)
+	broadcaster := newTestBroadcaster()
+	handler := NewRegistryHandler(broadcaster)
+
+	wedgeMsg := "claude_agent_sdk wedge: Control request timeout: initialize — restart workspace to recover"
+
+	mock.ExpectQuery("SELECT COALESCE\\(current_task").
+		WithArgs("ws-wedged").
+		WillReturnRows(sqlmock.NewRows([]string{"current_task"}).AddRow(""))
+
+	// Heartbeat UPDATE — sample_error carries the wedge reason from the
+	// workspace's _runtime_state_payload() helper.
+	mock.ExpectExec("UPDATE workspaces SET").
+		WithArgs("ws-wedged", 0.0, wedgeMsg, 0, 600, "").
+		WillReturnResult(sqlmock.NewResult(0, 1))
+
+	// evaluateStatus: currentStatus = online
+	mock.ExpectQuery("SELECT status FROM workspaces WHERE id =").
+		WithArgs("ws-wedged").
+		WillReturnRows(sqlmock.NewRows([]string{"status"}).AddRow("online"))
+
+	// The wedge-handling branch fires the degraded UPDATE with the
+	// `AND status = 'online'` guard (race-safe against concurrent
+	// removal). Match the SQL with the guard included.
+	mock.ExpectExec("UPDATE workspaces SET status = 'degraded'.*status = 'online'").
+		WithArgs("ws-wedged").
+		WillReturnResult(sqlmock.NewResult(0, 1))
+
+	// RecordAndBroadcast for WORKSPACE_DEGRADED
+	mock.ExpectExec("INSERT INTO structure_events").
+		WillReturnResult(sqlmock.NewResult(0, 1))
+
+	w := httptest.NewRecorder()
+	c, _ := gin.CreateTestContext(w)
+
+	body := `{"workspace_id":"ws-wedged","error_rate":0.0,"sample_error":"` + wedgeMsg + `","active_tasks":0,"uptime_seconds":600,"runtime_state":"wedged"}`
+	c.Request = httptest.NewRequest("POST", "/registry/heartbeat", bytes.NewBufferString(body))
+	c.Request.Header.Set("Content-Type", "application/json")
+
+	handler.Heartbeat(c)
+
+	if w.Code != http.StatusOK {
+		t.Errorf("expected status 200, got %d: %s", w.Code, w.Body.String())
+	}
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Errorf("unmet sqlmock expectations: %v", err)
+	}
+}
+
+// TestHeartbeatHandler_DegradedRecoversOnlyAfterWedgeClears verifies that
+// the degraded → online recovery path requires BOTH error_rate < 0.1
+// AND runtime_state cleared. A workspace still reporting wedged stays
+// degraded even when error_rate happens to be 0 (no calls have been
+// recorded as errors yet — the wedge is captured as a runtime state,
+// not an error count).
+func TestHeartbeatHandler_DegradedRecoversOnlyAfterWedgeClears(t *testing.T) {
+	mock := setupTestDB(t)
+	setupTestRedis(t)
+	broadcaster := newTestBroadcaster()
+	handler := NewRegistryHandler(broadcaster)
+
+	mock.ExpectQuery("SELECT COALESCE\\(current_task").
+		WithArgs("ws-still-wedged").
+		WillReturnRows(sqlmock.NewRows([]string{"current_task"}).AddRow(""))
+
+	mock.ExpectExec("UPDATE workspaces SET").
+		WithArgs("ws-still-wedged", 0.0, "still broken", 0, 800, "").
+		WillReturnResult(sqlmock.NewResult(0, 1))
+
+	// currentStatus = degraded
+	mock.ExpectQuery("SELECT status FROM workspaces WHERE id =").
+		WithArgs("ws-still-wedged").
+		WillReturnRows(sqlmock.NewRows([]string{"status"}).AddRow("degraded"))
+
+	// No additional UPDATE expected — the recovery branch's
+	// `runtime_state == ""` guard blocks the flip back to online.
+	// (sqlmock fails the test if any unmocked Exec runs.)
+
+	w := httptest.NewRecorder()
+	c, _ := gin.CreateTestContext(w)
+
+	body := `{"workspace_id":"ws-still-wedged","error_rate":0.0,"sample_error":"still broken","active_tasks":0,"uptime_seconds":800,"runtime_state":"wedged"}`
+	c.Request = httptest.NewRequest("POST", "/registry/heartbeat", bytes.NewBufferString(body))
+	c.Request.Header.Set("Content-Type", "application/json")
+
+	handler.Heartbeat(c)
+
+	if w.Code != http.StatusOK {
+		t.Errorf("expected status 200, got %d: %s", w.Code, w.Body.String())
+	}
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Errorf("unmet sqlmock expectations: %v", err)
+	}
+}
+
+// TestHeartbeatHandler_DegradedToOnline_AfterWedgeClears verifies the
+// happy-path recovery: a workspace previously marked degraded is
+// post-restart, error_rate is back to 0, and runtime_state is empty
+// (the new process re-imported claude_sdk_executor with the flag
+// fresh). Status flips back to online and a WORKSPACE_ONLINE event
+// fires.
+func TestHeartbeatHandler_DegradedToOnline_AfterWedgeClears(t *testing.T) {
+	mock := setupTestDB(t)
+	setupTestRedis(t)
+	broadcaster := newTestBroadcaster()
+	handler := NewRegistryHandler(broadcaster)
+
+	mock.ExpectQuery("SELECT COALESCE\\(current_task").
+		WithArgs("ws-recovered").
+		WillReturnRows(sqlmock.NewRows([]string{"current_task"}).AddRow(""))
+
+	mock.ExpectExec("UPDATE workspaces SET").
+		WithArgs("ws-recovered", 0.0, "", 0, 30, "").
+		WillReturnResult(sqlmock.NewResult(0, 1))
+
+	mock.ExpectQuery("SELECT status FROM workspaces WHERE id =").
+		WithArgs("ws-recovered").
+		WillReturnRows(sqlmock.NewRows([]string{"status"}).AddRow("degraded"))
+
+	// Recovery UPDATE fires (degraded → online).
+	mock.ExpectExec("UPDATE workspaces SET status = 'online'").
+		WithArgs("ws-recovered").
+		WillReturnResult(sqlmock.NewResult(0, 1))
+
+	mock.ExpectExec("INSERT INTO structure_events").
+		WillReturnResult(sqlmock.NewResult(0, 1))
+
+	w := httptest.NewRecorder()
+	c, _ := gin.CreateTestContext(w)
+
+	// runtime_state intentionally absent (== ""); error_rate = 0; this
+	// is exactly what a freshly-restarted workspace's first heartbeat
+	// looks like.
+	body := `{"workspace_id":"ws-recovered","error_rate":0.0,"sample_error":"","active_tasks":0,"uptime_seconds":30}`
+	c.Request = httptest.NewRequest("POST", "/registry/heartbeat", bytes.NewBufferString(body))
+	c.Request.Header.Set("Content-Type", "application/json")
+
+	handler.Heartbeat(c)
+
+	if w.Code != http.StatusOK {
+		t.Errorf("expected status 200, got %d: %s", w.Code, w.Body.String())
+	}
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Errorf("unmet sqlmock expectations: %v", err)
+	}
+}
+
 // ==================== UpdateCard ====================
 
 func TestUpdateCard_Success(t *testing.T) {
diff --git a/workspace-server/internal/handlers/secrets.go b/workspace-server/internal/handlers/secrets.go
index 261222d9..3766068d 100644
--- a/workspace-server/internal/handlers/secrets.go
+++ b/workspace-server/internal/handlers/secrets.go
@@ -466,3 +466,70 @@ func (h *SecretsHandler) GetModel(c *gin.Context) {
 
 	c.JSON(http.StatusOK, gin.H{"model": string(decrypted), "source": "workspace_secrets"})
 }
+
+// SetModel handles PUT /workspaces/:id/model — writes the model slug
+// into workspace_secrets as MODEL_PROVIDER (the key GetModel reads).
+// For hermes, the value is a hermes-native slug like "minimax/MiniMax-M2.7";
+// for langgraph it's the legacy "provider:model" form. Either way it's just
+// an opaque string the runtime interprets on its next start.
+//
+// Empty string clears the override. Triggers auto-restart so the new
+// env (HERMES_DEFAULT_MODEL etc.) takes effect immediately — without
+// this the user clicks Save+Restart, the canvas PUT lands, but the
+// already-restarting container misses the window and boots with the
+// old value.
+func (h *SecretsHandler) SetModel(c *gin.Context) {
+	workspaceID := c.Param("id")
+	if !uuidRegex.MatchString(workspaceID) {
+		c.JSON(http.StatusBadRequest, gin.H{"error": "invalid workspace ID"})
+		return
+	}
+	ctx := c.Request.Context()
+
+	var body struct {
+		Model string `json:"model"`
+	}
+	if err := c.ShouldBindJSON(&body); err != nil {
+		c.JSON(http.StatusBadRequest, gin.H{"error": "invalid request body"})
+		return
+	}
+
+	if body.Model == "" {
+		if _, err := db.DB.ExecContext(ctx,
+			`DELETE FROM workspace_secrets WHERE workspace_id = $1 AND key = 'MODEL_PROVIDER'`,
+			workspaceID); err != nil {
+			log.Printf("SetModel delete error: %v", err)
+			c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to clear model"})
+			return
+		}
+		if h.restartFunc != nil {
+			go h.restartFunc(workspaceID)
+		}
+		c.JSON(http.StatusOK, gin.H{"status": "cleared"})
+		return
+	}
+
+	encrypted, err := crypto.Encrypt([]byte(body.Model))
+	if err != nil {
+		log.Printf("SetModel encrypt error: %v", err)
+		c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to encrypt model"})
+		return
+	}
+	version := crypto.CurrentEncryptionVersion()
+	_, err = db.DB.ExecContext(ctx, `
+		INSERT INTO workspace_secrets (workspace_id, key, encrypted_value, encryption_version)
+		VALUES ($1, 'MODEL_PROVIDER', $2, $3)
+		ON CONFLICT (workspace_id, key) DO UPDATE
+			SET encrypted_value = $2, encryption_version = $3, updated_at = now()
+	`, workspaceID, encrypted, version)
+	if err != nil {
+		log.Printf("SetModel upsert error: %v", err)
+		c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to save model"})
+		return
+	}
+
+	if h.restartFunc != nil {
+		go h.restartFunc(workspaceID)
+	}
+	c.JSON(http.StatusOK, gin.H{"status": "saved", "model": body.Model})
+}
diff --git a/workspace-server/internal/handlers/secrets_test.go b/workspace-server/internal/handlers/secrets_test.go
index 1e4da981..78e66a16 100644
--- a/workspace-server/internal/handlers/secrets_test.go
+++ b/workspace-server/internal/handlers/secrets_test.go
@@ -6,6 +6,7 @@ import (
 	"encoding/json"
 	"net/http"
 	"net/http/httptest"
+	"strings"
 	"testing"
 	"time"
 
@@ -535,6 +536,88 @@ func TestSecretsGetModel_DBError(t *testing.T) {
 	}
 }
 
+// ==================== SetModel ====================
+
+func TestSecretsSetModel_Upsert(t *testing.T) {
+	mock := setupTestDB(t)
+	setupTestRedis(t)
+	restartCalled := make(chan string, 1)
+	handler := NewSecretsHandler(func(id string) { restartCalled <- id })
+
+	mock.ExpectExec(`INSERT INTO workspace_secrets`).
+		WithArgs("00000000-0000-0000-0000-000000000001", sqlmock.AnyArg(), sqlmock.AnyArg()).
+		WillReturnResult(sqlmock.NewResult(1, 1))
+
+	w := httptest.NewRecorder()
+	c, _ := gin.CreateTestContext(w)
+	c.Params = gin.Params{{Key: "id", Value: "00000000-0000-0000-0000-000000000001"}}
+	c.Request = httptest.NewRequest("PUT", "/workspaces/00000000-0000-0000-0000-000000000001/model",
+		strings.NewReader(`{"model":"minimax/MiniMax-M2.7"}`))
+	c.Request.Header.Set("Content-Type", "application/json")
+
+	handler.SetModel(c)
+
+	if w.Code != http.StatusOK {
+		t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String())
+	}
+	select {
+	case id := <-restartCalled:
+		if id != "00000000-0000-0000-0000-000000000001" {
+			t.Errorf("restart called with wrong id: %s", id)
+		}
+	case <-time.After(500 * time.Millisecond):
+		t.Error("restart was not triggered")
+	}
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Errorf("unmet sqlmock expectations: %v", err)
+	}
+}
+
+func TestSecretsSetModel_EmptyClears(t *testing.T) {
+	mock := setupTestDB(t)
+	setupTestRedis(t)
+	handler := NewSecretsHandler(func(string) {})
+
+	mock.ExpectExec(`DELETE FROM workspace_secrets`).
+		WithArgs("00000000-0000-0000-0000-000000000002").
+		WillReturnResult(sqlmock.NewResult(0, 1))
+
+	w := httptest.NewRecorder()
+	c, _ := gin.CreateTestContext(w)
+	c.Params = gin.Params{{Key: "id", Value: "00000000-0000-0000-0000-000000000002"}}
+	c.Request = httptest.NewRequest("PUT", "/workspaces/00000000-0000-0000-0000-000000000002/model",
+		strings.NewReader(`{"model":""}`))
+	c.Request.Header.Set("Content-Type", "application/json")
+
+	handler.SetModel(c)
+
+	if w.Code != http.StatusOK {
+		t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String())
+	}
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Errorf("unmet sqlmock expectations: %v", err)
+	}
+}
+
+func TestSecretsSetModel_InvalidID(t *testing.T) {
+	setupTestDB(t)
+	setupTestRedis(t)
+	handler := NewSecretsHandler(nil)
+
+	w := httptest.NewRecorder()
+	c, _ := gin.CreateTestContext(w)
+	c.Params = gin.Params{{Key: "id", Value: "not-a-uuid"}}
+	c.Request = httptest.NewRequest("PUT", "/workspaces/not-a-uuid/model",
+		strings.NewReader(`{"model":"x"}`))
+	c.Request.Header.Set("Content-Type", "application/json")
+
+	handler.SetModel(c)
+
+	if w.Code != http.StatusBadRequest {
+		t.Errorf("expected 400 for bad UUID, got %d", w.Code)
+	}
+}
+
 // ==================== Values — Phase 30.2 decrypted pull ====================
 
 // These tests target the secrets.Values handler (GET /workspaces/:id/secrets/values)
diff --git a/workspace-server/internal/handlers/ssrf.go b/workspace-server/internal/handlers/ssrf.go
index a84426f1..2e795e90 100644
--- a/workspace-server/internal/handlers/ssrf.go
+++ b/workspace-server/internal/handlers/ssrf.go
@@ -127,7 +127,16 @@ var testAllowLoopback = false
 // container deployments the relaxation is off and every private range
 // stays blocked.
 func isPrivateOrMetadataIP(ip net.IP) bool {
-	saas := saasMode()
+	// MOLECULE_ENV=development is the dev-host pattern: platform and
+	// workspace containers share a docker bridge network (172.18.0.0/16,
+	// RFC-1918). Treat that the same as SaaS for private-range relaxation
+	// — both share the "trusted intra-network routing" property. Without
+	// this, every workspace registration via docker-internal hostname
+	// resolves to 172.18.x.x and gets rejected as
+	// "workspace URL is not publicly routable", breaking the entire
+	// docker-compose dev loop. Always-blocked categories (metadata link-
+	// local, TEST-NET, CGNAT) remain blocked regardless.
+	saas := saasMode() || devModeAllowsLoopback()
 
 	// IPv4 path.
 	if ip4 := ip.To4(); ip4 != nil {
diff --git a/workspace-server/internal/handlers/ssrf_test.go b/workspace-server/internal/handlers/ssrf_test.go
index 37b2b358..8e246ebc 100644
--- a/workspace-server/internal/handlers/ssrf_test.go
+++ b/workspace-server/internal/handlers/ssrf_test.go
@@ -387,6 +387,37 @@ func TestIsSafeURL_SaaSMode_StillBlocksMetadataEtAl(t *testing.T) {
 	}
 }
 
+// TestIsSafeURL_DevMode_AllowsRFC1918 pins the dev-mode RFC-1918 + ULA
+// relaxation that #2103 widened. The dev-host docker-compose pattern
+// puts the platform + workspaces on the same docker bridge (172.18.0.0/16),
+// so workspace registration via 172.18.x.x must NOT be rejected in dev.
+// SaaS already allowed this; dev mode now matches via
+// `saas := saasMode() || devModeAllowsLoopback()` in isPrivateOrMetadataIP.
+//
+// Without this test, a future refactor that quietly drops the
+// `|| devModeAllowsLoopback()` from line 130 wouldn't trip any test —
+// the existing `TestIsSafeURL_DevMode_StillBlocksOtherRanges` only
+// pins the security floor (metadata / TEST-NET / CGNAT), not the
+// behavior change.
+func TestIsSafeURL_DevMode_AllowsRFC1918(t *testing.T) {
+	// Make sure saasMode() returns false so the test exercises the
+	// devModeAllowsLoopback() branch specifically — not a SaaS-mode pass.
+	t.Setenv("MOLECULE_DEPLOY_MODE", "self-hosted")
+	t.Setenv("MOLECULE_ORG_ID", "")
+	t.Setenv("MOLECULE_ENV", "development")
+
+	for _, url := range []string{
+		"http://10.1.2.3/agent",
+		"http://172.18.0.42:8000/a2a",       // the docker-compose case from the issue
+		"http://192.168.1.100/agent",
+		"http://[fd00::1]/agent",            // IPv6 ULA fd00::/8 also relaxed
+	} {
+		if err := isSafeURL(url); err != nil {
+			t.Errorf("isSafeURL(%q) in dev mode: got %v, want nil", url, err)
+		}
+	}
+}
+
 // TestIsSafeURL_StrictMode_BlocksRFC1918 is the strict-mode counterpart to
 // TestIsSafeURL_SaaSMode_AllowsRFC1918.  In self-hosted / single-container
 // deployments there is no legitimate reason to reach RFC-1918 agents, so the
diff --git a/workspace-server/internal/handlers/workspace.go b/workspace-server/internal/handlers/workspace.go
index 43d6e877..87676b55 100644
--- a/workspace-server/internal/handlers/workspace.go
+++ b/workspace-server/internal/handlers/workspace.go
@@ -26,7 +26,13 @@ import (
 )
 
 type WorkspaceHandler struct {
-	broadcaster *events.Broadcaster
+	// broadcaster narrowed from `*events.Broadcaster` to the
+	// events.EventEmitter interface (#1814) so tests can substitute a
+	// capture-only stub without standing up the real Redis + WS-hub
+	// topology. Production callers still pass *events.Broadcaster, which
+	// satisfies the interface — see the compile-time assertion in
+	// internal/events/broadcaster.go.
+	broadcaster events.EventEmitter
 	provisioner *provisioner.Provisioner
 	cpProv      *provisioner.CPProvisioner
 	platformURL string
@@ -46,7 +52,7 @@ type WorkspaceHandler struct {
 	provisionTimeouts runtimeProvisionTimeoutsCache
 }
 
-func NewWorkspaceHandler(b *events.Broadcaster, p *provisioner.Provisioner, platformURL, configsDir string) *WorkspaceHandler {
+func NewWorkspaceHandler(b events.EventEmitter, p *provisioner.Provisioner, platformURL, configsDir string) *WorkspaceHandler {
 	return &WorkspaceHandler{
 		broadcaster: b,
 		provisioner: p,
@@ -204,11 +210,15 @@ func (h *WorkspaceHandler) Create(c *gin.Context) {
 		return
 	}
 
+	maxConcurrent := payload.MaxConcurrentTasks
+	if maxConcurrent <= 0 {
+		maxConcurrent = models.DefaultMaxConcurrentTasks
+	}
 	// Insert workspace with runtime persisted in DB (inside transaction)
 	_, err := tx.ExecContext(ctx, `
-		INSERT INTO workspaces (id, name, role, tier, runtime, awareness_namespace, status, parent_id, workspace_dir, workspace_access, budget_limit)
-		VALUES ($1, $2, $3, $4, $5, $6, 'provisioning', $7, $8, $9, $10)
-	`, id, payload.Name, role, payload.Tier, payload.Runtime, awarenessNamespace, payload.ParentID, workspaceDir, workspaceAccess, payload.BudgetLimit)
+		INSERT INTO workspaces (id, name, role, tier, runtime, awareness_namespace, status, parent_id, workspace_dir, workspace_access, budget_limit, max_concurrent_tasks)
+		VALUES ($1, $2, $3, $4, $5, $6, 'provisioning', $7, $8, $9, $10, $11)
+	`, id, payload.Name, role, payload.Tier, payload.Runtime, awarenessNamespace, payload.ParentID, workspaceDir, workspaceAccess, payload.BudgetLimit, maxConcurrent)
 	if err != nil {
 		tx.Rollback() //nolint:errcheck
 		log.Printf("Create workspace error: %v", err)
diff --git a/workspace-server/internal/handlers/workspace_budget_test.go b/workspace-server/internal/handlers/workspace_budget_test.go
index 01a96db3..b38e984b 100644
--- a/workspace-server/internal/handlers/workspace_budget_test.go
+++ b/workspace-server/internal/handlers/workspace_budget_test.go
@@ -23,6 +23,7 @@ import (
 	"time"
 
 	"github.com/DATA-DOG/go-sqlmock"
+	"github.com/Molecule-AI/molecule-monorepo/platform/internal/models"
 	"github.com/gin-gonic/gin"
 )
 
@@ -150,6 +151,7 @@ func TestWorkspaceBudget_Create_WithLimit(t *testing.T) {
 			nil,              // workspace_dir
 			"none",           // workspace_access
 			&budgetVal,       // budget_limit ($10)
+			models.DefaultMaxConcurrentTasks, // max_concurrent_tasks default
 		).
 		WillReturnResult(sqlmock.NewResult(0, 1))
 	mock.ExpectCommit()
diff --git a/workspace-server/internal/handlers/workspace_crud.go b/workspace-server/internal/handlers/workspace_crud.go
index b7e83a74..043ece50 100644
--- a/workspace-server/internal/handlers/workspace_crud.go
+++ b/workspace-server/internal/handlers/workspace_crud.go
@@ -5,6 +5,7 @@ package handlers
 // Delete (cascade + purge), and input validation helpers.
 
 import (
+	"context"
 	"database/sql"
 	"errors"
 	"fmt"
@@ -12,6 +13,7 @@ import (
 	"net/http"
 	"path/filepath"
 	"strings"
+	"time"
 
 	"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
 	"github.com/Molecule-AI/molecule-monorepo/platform/internal/wsauth"
@@ -390,44 +392,69 @@ func (h *WorkspaceHandler) Delete(c *gin.Context) {
 	// Any concurrent heartbeat / registration / liveness-triggered restart
 	// will see status='removed' and bail out early.
 	//
-	// #1843: Stop() errors used to be silently swallowed. On the CP/EC2
-	// backend, Stop() calls the control plane's DELETE workspaces endpoint
-	// to terminate the EC2; if that errors (CP transient 5xx, network),
-	// the EC2 stays running with no DB row to track it — the
-	// "14 orphan workspace EC2s on a 0-customer account" scenario.
-	// Aggregate Stop failures and surface them as 500 so the client can
-	// retry. The retry replays Stop with the same instance_id (still
-	// readable from the row even after status='removed') — idempotent on
-	// the CP side. RemoveVolume errors stay log-and-continue: those are
-	// local cleanup of /var/data, not infra-leak class.
+	// Combines two concerns:
+	//
+	//  1. Detach cleanup from the request ctx via WithoutCancel + a 30s
+	//     timeout, so when the canvas's `api.del` resolves on our 200
+	//     (and gin cancels c.Request.Context()), in-flight Docker
+	//     stop/remove calls don't get cancelled mid-operation. The
+	//     previous shape leaked containers every time the canvas hung
+	//     up promptly: Stop returned "context canceled", the container
+	//     stayed up, and the next RemoveVolume failed with
+	//     "volume in use". 30s is generous for Docker daemon round-
+	//     trips (typical: <2s) and bounds a stuck daemon.
+	//
+	//  2. #1843: aggregate Stop() failures into stopErrs so the
+	//     post-deletion block surfaces them as 500. On the CP/EC2
+	//     backend, Stop() calls control plane's DELETE endpoint to
+	//     terminate the EC2; if that errors (transient 5xx, network),
+	//     the EC2 stays running with no DB row to track it (the
+	//     "orphan EC2 on a 0-customer account" scenario). Loud-fail
+	//     instead of silent-leak — clients retry, Stop's instance_id
+	//     lookup is idempotent against status='removed'. RemoveVolume
+	//     errors stay log-and-continue (local cleanup, not infra-leak).
+	cleanupCtx, cleanupCancel := context.WithTimeout(
+		context.WithoutCancel(ctx), 30*time.Second)
+	defer cleanupCancel()
+
 	var stopErrs []error
+	stopAndRemove := func(wsID string) {
+		if h.provisioner == nil {
+			return
+		}
+		// Check Stop's error before attempting RemoveVolume — the
+		// previous code discarded it and immediately tried the
+		// volume remove, which always fails with "volume in use"
+		// when Stop didn't actually kill the container. The orphan
+		// sweeper (registry/orphan_sweeper.go) catches what we
+		// skip here on the next reconcile pass.
+		if err := h.provisioner.Stop(cleanupCtx, wsID); err != nil {
+			log.Printf("Delete %s container stop failed: %v — leaving volume for orphan sweeper", wsID, err)
+			stopErrs = append(stopErrs, fmt.Errorf("stop %s: %w", wsID, err))
+			return
+		}
+		if err := h.provisioner.RemoveVolume(cleanupCtx, wsID); err != nil {
+			log.Printf("Delete %s volume removal warning: %v", wsID, err)
+		}
+	}
+
 	for _, descID := range descendantIDs {
-		if h.provisioner != nil {
-			if err := h.provisioner.Stop(ctx, descID); err != nil {
-				log.Printf("Delete descendant %s stop error: %v", descID, err)
-				stopErrs = append(stopErrs, fmt.Errorf("stop descendant %s: %w", descID, err))
-			}
-			if err := h.provisioner.RemoveVolume(ctx, descID); err != nil {
-				log.Printf("Delete descendant %s volume removal warning: %v", descID, err)
-			}
-		}
-		db.ClearWorkspaceKeys(ctx, descID)
-		h.broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_REMOVED", descID, map[string]interface{}{})
+		stopAndRemove(descID)
+		db.ClearWorkspaceKeys(cleanupCtx, descID)
+		// Detach broadcaster ctx for the same reason as the cleanup
+		// above — RecordAndBroadcast does an INSERT INTO
+		// structure_events + Redis Publish. If the canvas hangs up,
+		// a request-ctx-bound INSERT can be cancelled mid-write,
+		// leaving other WS clients ignorant of the cascade. The DB
+		// row is already 'removed' so it's recoverable, but the
+		// inconsistency is avoidable.
+		h.broadcaster.RecordAndBroadcast(cleanupCtx, "WORKSPACE_REMOVED", descID, map[string]interface{}{})
 	}
 
-	// Stop + remove volume for the workspace itself
-	if h.provisioner != nil {
-		if err := h.provisioner.Stop(ctx, id); err != nil {
-			log.Printf("Delete %s stop error: %v", id, err)
-			stopErrs = append(stopErrs, fmt.Errorf("stop %s: %w", id, err))
-		}
-		if err := h.provisioner.RemoveVolume(ctx, id); err != nil {
-			log.Printf("Delete %s volume removal warning: %v", id, err)
-		}
-	}
-	db.ClearWorkspaceKeys(ctx, id)
+	stopAndRemove(id)
+	db.ClearWorkspaceKeys(cleanupCtx, id)
 
-	h.broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_REMOVED", id, map[string]interface{}{
+	h.broadcaster.RecordAndBroadcast(cleanupCtx, "WORKSPACE_REMOVED", id, map[string]interface{}{
 		"cascade_deleted": len(descendantIDs),
 	})
 
diff --git a/workspace-server/internal/handlers/workspace_provision.go b/workspace-server/internal/handlers/workspace_provision.go
index dff410f6..c7952dd5 100644
--- a/workspace-server/internal/handlers/workspace_provision.go
+++ b/workspace-server/internal/handlers/workspace_provision.go
@@ -176,20 +176,33 @@ func (h *WorkspaceHandler) provisionWorkspaceOpts(workspaceID, templatePath stri
 			// Try to recover by applying the runtime-default template. payload.Runtime
 			// is populated by the caller (Restart handler / Create handler) from the
 			// DB row — same source of truth the apply_template=true path uses.
+			// Try `<runtime>-default` first (historical naming), then plain
+			// `<runtime>` (current naming in workspace-configs-templates/).
+			// Only claude-code has the `-default` suffix; every other
+			// runtime directory uses the bare name. Without the bare-name
+			// fallback, recovery only worked for claude-code and blank
+			// workspaces on every other runtime bricked on first start.
 			recovered := false
 			if payload.Runtime != "" {
-				runtimeTemplate := filepath.Join(h.configsDir, payload.Runtime+"-default")
-				if _, statErr := os.Stat(runtimeTemplate); statErr == nil {
-					log.Printf("Provisioner: auto-recover for %s — config volume empty, applying %s-default template (#1858)",
-						workspaceID, payload.Runtime)
-					templatePath = runtimeTemplate
-					// Rebuild cfg with the recovered template path so Start() sees it.
-					cfg = h.buildProvisionerConfig(workspaceID, templatePath, configFiles, payload, envVars, pluginsPath, awarenessNamespace)
-					cfg.ResetClaudeSession = resetClaudeSession
-					recovered = true
-				} else {
-					log.Printf("Provisioner: auto-recover for %s — runtime template %s not found: %v",
-						workspaceID, runtimeTemplate, statErr)
+				candidates := []string{
+					filepath.Join(h.configsDir, payload.Runtime+"-default"),
+					filepath.Join(h.configsDir, payload.Runtime),
+				}
+				for _, runtimeTemplate := range candidates {
+					if _, statErr := os.Stat(runtimeTemplate); statErr == nil {
+						log.Printf("Provisioner: auto-recover for %s — config volume empty, applying %s template (#1858)",
+							workspaceID, filepath.Base(runtimeTemplate))
+						templatePath = runtimeTemplate
+						// Rebuild cfg with the recovered template path so Start() sees it.
+						cfg = h.buildProvisionerConfig(workspaceID, templatePath, configFiles, payload, envVars, pluginsPath, awarenessNamespace)
+						cfg.ResetClaudeSession = resetClaudeSession
+						recovered = true
+						break
+					}
+				}
+				if !recovered {
+					log.Printf("Provisioner: auto-recover for %s — no template found under %s for runtime=%s",
+						workspaceID, h.configsDir, payload.Runtime)
 				}
 			}
 
@@ -616,6 +629,17 @@ func (h *WorkspaceHandler) ensureDefaultConfig(workspaceID string, payload model
 // payload.Model at boot), this is a no-op — no harm in the switch
 // being empty for those cases.
 func applyRuntimeModelEnv(envVars map[string]string, runtime, model string) {
+	// Fall back to the MODEL_PROVIDER workspace secret when the caller
+	// didn't pass one explicitly. This is the path that "Save+Restart"
+	// hits — Restart builds its payload from the workspaces row (no model
+	// column there) so payload.Model is always empty, but the user's
+	// canvas selection was stored as MODEL_PROVIDER via PUT /model and
+	// is already loaded into envVars here. Without this fallback hermes
+	// silently boots with the template default and errors "No LLM
+	// provider configured" even though the user picked a valid model.
+	if model == "" {
+		model = envVars["MODEL_PROVIDER"]
+	}
 	if model == "" {
 		return
 	}
diff --git a/workspace-server/internal/handlers/workspace_provision_test.go b/workspace-server/internal/handlers/workspace_provision_test.go
index 65960071..5297fd15 100644
--- a/workspace-server/internal/handlers/workspace_provision_test.go
+++ b/workspace-server/internal/handlers/workspace_provision_test.go
@@ -9,7 +9,6 @@ import (
 	"testing"
 
 	"github.com/DATA-DOG/go-sqlmock"
-	"github.com/Molecule-AI/molecule-monorepo/platform/internal/events"
 	"github.com/Molecule-AI/molecule-monorepo/platform/internal/models"
 	"github.com/Molecule-AI/molecule-monorepo/platform/internal/plugins"
 	"github.com/Molecule-AI/molecule-monorepo/platform/internal/provisioner"
@@ -1044,13 +1043,20 @@ var errInternalDB = fmt.Errorf("pq: connection refused")
 var errInternalOS = fmt.Errorf("operation failed: no such file or directory")
 
 // captureBroadcaster is a test broadcaster that captures the last data
-// payload passed to RecordAndBroadcast so tests can inspect it.
+// payload passed to RecordAndBroadcast so tests can inspect it. Now
+// satisfies events.EventEmitter (#1814) directly — RecordAndBroadcast
+// captures, BroadcastOnly is a no-op since none of the
+// WorkspaceHandler paths under test call it.
 type captureBroadcaster struct {
-	events.Broadcaster // embed to satisfy the interface — only RecordAndBroadcast is overridden
 	lastData map[string]interface{}
 	lastErr  error
 }
 
+// BroadcastOnly is required to satisfy events.EventEmitter. None of the
+// captureBroadcaster's exercising tests should land here — if a future
+// test does, it'll need to add capture state for that channel.
+func (c *captureBroadcaster) BroadcastOnly(_ string, _ string, _ interface{}) {}
+
 func (c *captureBroadcaster) RecordAndBroadcast(_ context.Context, _, _ string, data interface{}) error {
 	if m, ok := data.(map[string]interface{}); ok {
 		// Shallow-copy so the caller can't mutate our capture.
@@ -1107,14 +1113,14 @@ func containsUnsafeString(v interface{}) bool {
 // never leaks internal error details in WORKSPACE_PROVISION_FAILED broadcasts.
 // Regression test for issue #1206.
 func TestProvisionWorkspace_NoInternalErrorsInBroadcast(t *testing.T) {
-	t.Skip("TODO: captureBroadcaster type mismatch with WorkspaceHandler.broadcaster (*events.Broadcaster). Needs broadcaster interface refactor — currently blocking package compile on main (2026-04-21).")
+	t.Skip("TODO: write the test body. The interface blocker (#1814) is fixed — captureBroadcaster now satisfies events.EventEmitter and can be passed to NewWorkspaceHandler. The remaining work is constructing the provisionWorkspace failure path + asserting captured payload doesn't contain unsafeErrorStrings.")
 }
 
 // TestProvisionWorkspaceCP_NoInternalErrorsInBroadcast asserts that
 // provisionWorkspaceCP never leaks err.Error() in WORKSPACE_PROVISION_FAILED
 // broadcasts. Regression test for issue #1206.
 func TestProvisionWorkspaceCP_NoInternalErrorsInBroadcast(t *testing.T) {
-	t.Skip("TODO: captureBroadcaster type mismatch with WorkspaceHandler.broadcaster (*events.Broadcaster). Needs broadcaster interface refactor — currently blocking package compile on main (2026-04-21).")
+	t.Skip("TODO: write the test body. Same status as TestProvisionWorkspace_NoInternalErrorsInBroadcast — interface blocker fixed (#1814), need to construct the provisionWorkspaceCP failure path + assert payload sanitization.")
 }
 
 // mockEnvMutator is a provisionhook.Registry stub that always returns a fixed error.
diff --git a/workspace-server/internal/handlers/workspace_test.go b/workspace-server/internal/handlers/workspace_test.go
index 878af611..43faa8f9 100644
--- a/workspace-server/internal/handlers/workspace_test.go
+++ b/workspace-server/internal/handlers/workspace_test.go
@@ -11,6 +11,7 @@ import (
 	"testing"
 
 	"github.com/DATA-DOG/go-sqlmock"
+	"github.com/Molecule-AI/molecule-monorepo/platform/internal/models"
 	"github.com/gin-gonic/gin"
 )
 
@@ -154,7 +155,7 @@ func TestWorkspaceCreate_DBInsertError(t *testing.T) {
 	// Transaction begins, workspace INSERT fails, transaction is rolled back.
 	mock.ExpectBegin()
 	mock.ExpectExec("INSERT INTO workspaces").
-		WithArgs(sqlmock.AnyArg(), "Failing Agent", nil, 3, "langgraph", sqlmock.AnyArg(), (*string)(nil), nil, "none", (*int64)(nil)).
+		WithArgs(sqlmock.AnyArg(), "Failing Agent", nil, 3, "langgraph", sqlmock.AnyArg(), (*string)(nil), nil, "none", (*int64)(nil), models.DefaultMaxConcurrentTasks).
 		WillReturnError(sql.ErrConnDone)
 	mock.ExpectRollback()
 
@@ -187,7 +188,7 @@ func TestWorkspaceCreate_DefaultsApplied(t *testing.T) {
 	// Expect workspace INSERT with defaulted tier=3 (Privileged — the
 	// handler default in workspace.go), runtime="langgraph"
 	mock.ExpectExec("INSERT INTO workspaces").
-		WithArgs(sqlmock.AnyArg(), "Default Agent", nil, 3, "langgraph", sqlmock.AnyArg(), (*string)(nil), nil, "none", (*int64)(nil)).
+		WithArgs(sqlmock.AnyArg(), "Default Agent", nil, 3, "langgraph", sqlmock.AnyArg(), (*string)(nil), nil, "none", (*int64)(nil), models.DefaultMaxConcurrentTasks).
 		WillReturnResult(sqlmock.NewResult(0, 1))
 	mock.ExpectCommit()
 
@@ -238,7 +239,7 @@ func TestWorkspaceCreate_WithSecrets_Persists(t *testing.T) {
 
 	mock.ExpectBegin()
 	mock.ExpectExec("INSERT INTO workspaces").
-		WithArgs(sqlmock.AnyArg(), "Hermes Agent", nil, 3, "hermes", sqlmock.AnyArg(), (*string)(nil), nil, "none", (*int64)(nil)).
+		WithArgs(sqlmock.AnyArg(), "Hermes Agent", nil, 3, "hermes", sqlmock.AnyArg(), (*string)(nil), nil, "none", (*int64)(nil), models.DefaultMaxConcurrentTasks).
 		WillReturnResult(sqlmock.NewResult(0, 1))
 	// Secret inserted inside the same transaction.
 	mock.ExpectExec("INSERT INTO workspace_secrets").
@@ -1257,7 +1258,7 @@ runtime_config:
 	mock.ExpectExec("INSERT INTO workspaces").
 		WithArgs(
 			sqlmock.AnyArg(), "Hermes Agent", nil, 3, "hermes",
-			sqlmock.AnyArg(), (*string)(nil), nil, "none", (*int64)(nil)).
+			sqlmock.AnyArg(), (*string)(nil), nil, "none", (*int64)(nil), models.DefaultMaxConcurrentTasks).
 		WillReturnResult(sqlmock.NewResult(0, 1))
 	mock.ExpectCommit()
 	mock.ExpectExec("INSERT INTO canvas_layouts").
@@ -1314,7 +1315,7 @@ model: anthropic:claude-sonnet-4-5
 	mock.ExpectExec("INSERT INTO workspaces").
 		WithArgs(
 			sqlmock.AnyArg(), "Legacy Agent", nil, 3, "langgraph",
-			sqlmock.AnyArg(), (*string)(nil), nil, "none", (*int64)(nil)).
+			sqlmock.AnyArg(), (*string)(nil), nil, "none", (*int64)(nil), models.DefaultMaxConcurrentTasks).
 		WillReturnResult(sqlmock.NewResult(0, 1))
 	mock.ExpectCommit()
 	mock.ExpectExec("INSERT INTO canvas_layouts").
@@ -1367,7 +1368,7 @@ runtime_config:
 	mock.ExpectExec("INSERT INTO workspaces").
 		WithArgs(
 			sqlmock.AnyArg(), "Custom Hermes", nil, 3, "hermes",
-			sqlmock.AnyArg(), (*string)(nil), nil, "none", (*int64)(nil)).
+			sqlmock.AnyArg(), (*string)(nil), nil, "none", (*int64)(nil), models.DefaultMaxConcurrentTasks).
 		WillReturnResult(sqlmock.NewResult(0, 1))
 	mock.ExpectCommit()
 	mock.ExpectExec("INSERT INTO canvas_layouts").
diff --git a/workspace-server/internal/middleware/wsauth_middleware.go b/workspace-server/internal/middleware/wsauth_middleware.go
index 93538753..ef82d8e7 100644
--- a/workspace-server/internal/middleware/wsauth_middleware.go
+++ b/workspace-server/internal/middleware/wsauth_middleware.go
@@ -14,6 +14,30 @@ import (
 	"github.com/gin-gonic/gin"
 )
 
+// abortAuthLookupError is the single response shape for "the auth
+// middleware tried to validate a token but the underlying datastore
+// lookup failed." Returns 503 (not 500) because the right semantic
+// is "platform infrastructure unavailable, retry shortly" — not
+// "internal server error in our application logic". The structured
+// `code` lets the canvas distinguish this from generic 5xx and
+// surface a dedicated diagnostic ("Postgres/Redis unreachable —
+// check local services") instead of a confusing
+// `auth check failed` toast.
+//
+// `where` is included in the log line so the operator can grep
+// which call site fired (WorkspaceAuth vs AdminAuth, the
+// HasAnyLiveTokenGlobal probe vs orgtoken.Validate). The
+// user-visible body deliberately does NOT include the underlying
+// error string — that could leak DB hostnames, connection-string
+// fragments, or internal code paths.
+func abortAuthLookupError(c *gin.Context, where string, err error) {
+	log.Printf("wsauth: %s: datastore lookup failed (returning 503): %v", where, err)
+	c.AbortWithStatusJSON(http.StatusServiceUnavailable, gin.H{
+		"error": "platform datastore unavailable — retry shortly",
+		"code":  "platform_unavailable",
+	})
+}
+
 // WorkspaceAuth returns a Gin middleware that enforces per-workspace bearer-token
 // authentication on /workspaces/:id/* sub-routes.
 //
@@ -73,8 +97,7 @@ func WorkspaceAuth(database *sql.DB) gin.HandlerFunc {
 				c.Next()
 				return
 			} else if !errors.Is(err, orgtoken.ErrInvalidToken) {
-				log.Printf("wsauth: WorkspaceAuth: orgtoken.Validate: %v", err)
-				c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": "auth check failed"})
+				abortAuthLookupError(c, "WorkspaceAuth: orgtoken.Validate", err)
 				return
 			}
 			// Per-workspace token — narrowest scope, bound to this :id.
@@ -136,8 +159,7 @@ func AdminAuth(database *sql.DB) gin.HandlerFunc {
 
 		hasLive, err := wsauth.HasAnyLiveTokenGlobal(ctx, database)
 		if err != nil {
-			log.Printf("wsauth: AdminAuth: HasAnyLiveTokenGlobal failed: %v", err)
-			c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": "auth check failed"})
+			abortAuthLookupError(c, "AdminAuth: HasAnyLiveTokenGlobal", err)
 			return
 		}
 		if !hasLive {
@@ -214,8 +236,7 @@ func AdminAuth(database *sql.DB) gin.HandlerFunc {
 			return
 		} else if !errors.Is(err, orgtoken.ErrInvalidToken) {
 			// DB error — fail closed and log. Don't expose DB text.
-			log.Printf("wsauth: AdminAuth: orgtoken.Validate: %v", err)
-			c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": "auth check failed"})
+			abortAuthLookupError(c, "AdminAuth: orgtoken.Validate", err)
 			return
 		}
 
diff --git a/workspace-server/internal/middleware/wsauth_middleware_test.go b/workspace-server/internal/middleware/wsauth_middleware_test.go
index edfd2230..6c802a79 100644
--- a/workspace-server/internal/middleware/wsauth_middleware_test.go
+++ b/workspace-server/internal/middleware/wsauth_middleware_test.go
@@ -2,8 +2,11 @@ package middleware
 
 import (
 	"crypto/sha256"
+	"encoding/json"
+	"errors"
 	"net/http"
 	"net/http/httptest"
+	"strings"
 	"testing"
 
 	"github.com/DATA-DOG/go-sqlmock"
@@ -1699,3 +1702,57 @@ func TestAdminAuth_684_SpecificRoutes_NoBearer_Returns401(t *testing.T) {
 		})
 	}
 }
+
+// ==================== platform-unavailable classification ====================
+//
+// abortAuthLookupError replaces the prior opaque
+// `500 {"error":"auth check failed"}` with a 503 + structured code so
+// the canvas can render a dedicated diagnostic instead of a confusing
+// toast. Pin both the status code and the body shape against
+// regression — this is the contract the canvas's
+// PlatformUnavailableError classifier reads at api.ts.
+
+func TestAdminAuth_DatastoreError_Returns503PlatformUnavailable(t *testing.T) {
+	mockDB, mock, err := sqlmock.New()
+	if err != nil {
+		t.Fatalf("sqlmock.New: %v", err)
+	}
+	defer mockDB.Close()
+
+	// Simulate Postgres being down — HasAnyLiveTokenGlobal's COUNT
+	// query returns a connection error.
+	mock.ExpectQuery(hasAnyLiveTokenGlobalQuery).
+		WillReturnError(errors.New("dial tcp [::1]:5432: connect: connection refused"))
+
+	r := gin.New()
+	r.GET("/workspaces", AdminAuth(mockDB), func(c *gin.Context) {
+		c.JSON(http.StatusOK, gin.H{"ok": true})
+	})
+
+	w := httptest.NewRecorder()
+	req, _ := http.NewRequest(http.MethodGet, "/workspaces", nil)
+	r.ServeHTTP(w, req)
+
+	if w.Code != http.StatusServiceUnavailable {
+		t.Errorf("expected 503, got %d: %s", w.Code, w.Body.String())
+	}
+	var resp map[string]interface{}
+	if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
+		t.Fatalf("response body must be JSON: %v (body=%s)", err, w.Body.String())
+	}
+	if resp["code"] != "platform_unavailable" {
+		t.Errorf("response code = %v, want platform_unavailable (canvas reads this for the dedicated diagnostic)", resp["code"])
+	}
+	if _, ok := resp["error"].(string); !ok {
+		t.Errorf("response must include human-readable error string, got %v", resp["error"])
+	}
+	// The body must NOT leak the underlying DB error string —
+	// production hostnames / connection-string fragments could land
+	// in an error toast otherwise.
+	if errStr, _ := resp["error"].(string); strings.Contains(errStr, "dial tcp") {
+		t.Errorf("response leaks underlying DB error: %q", errStr)
+	}
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Errorf("unmet sqlmock expectations: %v", err)
+	}
+}
diff --git a/workspace-server/internal/models/workspace.go b/workspace-server/internal/models/workspace.go
index 26061a1f..b8732c9e 100644
--- a/workspace-server/internal/models/workspace.go
+++ b/workspace-server/internal/models/workspace.go
@@ -6,6 +6,12 @@ import (
 	"time"
 )
 
+// DefaultMaxConcurrentTasks mirrors the workspaces.max_concurrent_tasks
+// schema default. Handlers that resolve a 0/omitted payload value write
+// this constant so the read-side (scheduler capacity check) sees a
+// guaranteed non-zero column on every row.
+const DefaultMaxConcurrentTasks = 1
+
 type Workspace struct {
 	ID                 string          `json:"id" db:"id"`
 	Name               string          `json:"name" db:"name"`
@@ -51,6 +57,19 @@ type HeartbeatPayload struct {
 	// a previously-reported spend value. Any non-zero value is clamped to
 	// [0, maxMonthlySpend] before the DB write. (#615)
 	MonthlySpend int64 `json:"monthly_spend"`
+	// RuntimeState is a self-reported runtime health flag separate from
+	// "is the heartbeat task firing at all". The heartbeat task lives in
+	// its own asyncio task and keeps pinging even when the agent runtime
+	// is wedged (e.g. claude_agent_sdk's `Control request timeout:
+	// initialize` leaves the SDK in a permanent error state for the
+	// process lifetime). RuntimeState is how the workspace tells the
+	// platform "I'm alive but my Claude runtime is broken — flip me to
+	// degraded so the canvas can show a Restart hint."
+	//
+	// Empty string = healthy / no signal. The only currently-recognised
+	// non-empty value is "wedged"; future values can extend this without
+	// migration.
+	RuntimeState string `json:"runtime_state"`
 }
 
 type UpdateCardPayload struct {
@@ -85,6 +104,9 @@ type CreateWorkspacePayload struct {
 	// workspace secrets at creation time.  Stored encrypted (same path as
 	// POST /workspaces/:id/secrets).  Nil/empty map is a no-op.
 	Secrets map[string]string `json:"secrets"`
+	// MaxConcurrentTasks caps parallel A2A + cron dispatch. 0 means use
+	// DefaultMaxConcurrentTasks. Leaders typically set 3.
+	MaxConcurrentTasks int `json:"max_concurrent_tasks"`
 	Canvas   struct {
 		X float64 `json:"x"`
 		Y float64 `json:"y"`
diff --git a/workspace-server/internal/provisioner/provisioner.go b/workspace-server/internal/provisioner/provisioner.go
index a57a4cb6..c38b0d02 100644
--- a/workspace-server/internal/provisioner/provisioner.go
+++ b/workspace-server/internal/provisioner/provisioner.go
@@ -17,6 +17,7 @@ import (
 	"time"
 
 	"github.com/docker/docker/api/types/container"
+	"github.com/docker/docker/api/types/filters"
 	dockerimage "github.com/docker/docker/api/types/image"
 	"github.com/docker/docker/api/types/network"
 	"github.com/docker/docker/api/types/volume"
@@ -143,6 +144,62 @@ func ContainerName(workspaceID string) string {
 	return fmt.Sprintf("ws-%s", id)
 }
 
+// containerNamePrefix is the shared prefix every workspace container
+// name carries (`ws-`). Used by ListWorkspaceContainerIDPrefixes for
+// the Docker name-filter, and by the orphan sweeper to recognise our
+// own containers vs. anything else on the host.
+const containerNamePrefix = "ws-"
+
+// ListWorkspaceContainerIDPrefixes returns the 12-char workspace ID
+// prefixes of every running ws-* container the Docker daemon knows
+// about. The 12-char form matches ContainerName's truncation, so the
+// orphan sweeper can intersect this set against `SELECT
+// substring(id::text, 1, 12) FROM workspaces WHERE status = 'removed'`
+// without an extra round-trip per row.
+//
+// Returns an empty slice on any Docker error (sweeper treats that as
+// "skip this round" — better than a partial scan that misses leaks).
+func (p *Provisioner) ListWorkspaceContainerIDPrefixes(ctx context.Context) ([]string, error) {
+	if p == nil || p.cli == nil {
+		return nil, nil
+	}
+	containers, err := p.cli.ContainerList(ctx, container.ListOptions{
+		// All=true catches stopped-but-not-removed containers too —
+		// those still hold their volume references and would block
+		// RemoveVolume just like a running container would.
+		All:     true,
+		Filters: filters.NewArgs(filters.Arg("name", containerNamePrefix)),
+	})
+	if err != nil {
+		return nil, err
+	}
+	prefixes := make([]string, 0, len(containers))
+	for _, c := range containers {
+		// Container names from the API include a leading slash:
+		// "/ws-abc123def456". Strip both the slash and our prefix
+		// to recover the 12-char workspace ID.
+		//
+		// The Docker name filter is a SUBSTRING match (not a prefix
+		// match), so something like "my-ws-thing" would also be
+		// returned. The HasPrefix check below is load-bearing:
+		// without it those false positives would flow into the
+		// orphan sweeper's DB query as bogus LIKE patterns.
+		for _, name := range c.Names {
+			n := strings.TrimPrefix(name, "/")
+			if !strings.HasPrefix(n, containerNamePrefix) {
+				continue
+			}
+			id := strings.TrimPrefix(n, containerNamePrefix)
+			if id == "" {
+				continue
+			}
+			prefixes = append(prefixes, id)
+			break // one name is enough; multiple aliases would dup
+		}
+	}
+	return prefixes, nil
+}
+
 // InternalURL returns the Docker-internal URL for a workspace container.
 func InternalURL(workspaceID string) string {
 	return fmt.Sprintf("http://%s:%s", ContainerName(workspaceID), DefaultPort)
@@ -832,6 +889,14 @@ func (p *Provisioner) RemoveVolume(ctx context.Context, workspaceID string) erro
 // restart policy: if we ContainerStop first, the restart policy can
 // respawn the container before ContainerRemove runs, leaving a zombie
 // that re-registers via heartbeat after deletion.
+//
+// Returns nil on success AND on "container does not exist" (the cleanup
+// goal is achieved either way). Returns the underlying Docker error
+// only when the daemon actually failed to remove a live container —
+// callers that follow Stop with RemoveVolume MUST check the return
+// and skip volume removal on a real error, otherwise the volume
+// removal will fail with "volume in use" because the container is
+// still alive.
 func (p *Provisioner) Stop(ctx context.Context, workspaceID string) error {
 	if p == nil || p.cli == nil {
 		return ErrNoBackend
@@ -839,15 +904,23 @@ func (p *Provisioner) Stop(ctx context.Context, workspaceID string) error {
 	name := ContainerName(workspaceID)
 
 	// Force-remove kills and removes in one atomic operation, bypassing
-	// the restart policy entirely. If the container doesn't exist, the
-	// error is harmless.
-	if err := p.cli.ContainerRemove(ctx, name, container.RemoveOptions{Force: true}); err != nil {
-		// Container may already be gone — log but don't fail.
-		log.Printf("Provisioner: force-remove warning for %s: %v", name, err)
+	// the restart policy entirely.
+	err := p.cli.ContainerRemove(ctx, name, container.RemoveOptions{Force: true})
+	if err == nil {
+		log.Printf("Provisioner: stopped and removed container %s", name)
+		return nil
 	}
-
-	log.Printf("Provisioner: stopped and removed container %s", name)
-	return nil
+	if isContainerNotFound(err) {
+		// Container was already gone — the post-condition we want is
+		// satisfied. Don't surface as an error.
+		log.Printf("Provisioner: container %s already gone (no-op)", name)
+		return nil
+	}
+	// Real failure: daemon timeout, socket EOF, ctx cancellation, etc.
+	// Caller (workspace_crud.stopAndRemove, orphan_sweeper.sweepOnce)
+	// must propagate this so they can skip the follow-up RemoveVolume.
+	log.Printf("Provisioner: force-remove failed for %s: %v", name, err)
+	return fmt.Errorf("force-remove %s: %w", name, err)
 }
 
 // IsRunning checks if a workspace container is currently running.
@@ -1082,6 +1155,13 @@ func pullImageAndDrain(ctx context.Context, cli dockerImageClient, ref, platform
 //
 // Tracked in issue #1875; remove this fallback once the template repos
 // publish multi-arch manifests.
+// DefaultImagePlatform is the exported alias used by the admin
+// workspace-images handler so its ImagePull picks the same platform as
+// the provisioner's. Avoids duplicating the Apple-Silicon-needs-amd64
+// logic and keeps both call sites in sync if Docker manifest support
+// changes (e.g., when the templates start shipping multi-arch).
+func DefaultImagePlatform() string { return defaultImagePlatform() }
+
 func defaultImagePlatform() string {
 	if v, ok := os.LookupEnv("MOLECULE_IMAGE_PLATFORM"); ok {
 		return v
diff --git a/workspace-server/internal/registry/orphan_sweeper.go b/workspace-server/internal/registry/orphan_sweeper.go
new file mode 100644
index 00000000..23faf28d
--- /dev/null
+++ b/workspace-server/internal/registry/orphan_sweeper.go
@@ -0,0 +1,186 @@
+package registry
+
+// orphan_sweeper.go — periodic reconcile pass that cleans up Docker
+// containers whose corresponding workspace row in Postgres has
+// status='removed'. Defence in depth on top of the inline cleanup
+// in handlers/workspace_crud.go.
+//
+// Why this exists: the inline cleanup is one-shot — if Docker hiccups
+// (daemon restart, host load, transient API error), the container
+// silently stays alive while the DB row is already 'removed'. Without
+// a reconcile pass those leaks accumulate forever. With one, every
+// missed cleanup heals on the next sweep.
+//
+// Cost: O(running containers) per cycle, not O(historical removed
+// rows). The Docker name filter trims the candidate set to ws-* only
+// (typically the same handful as ContainerList without filter on a
+// dev host); the DB lookup is one indexed query against the
+// idx_workspaces_status btree.
+
+import (
+	"context"
+	"log"
+	"time"
+
+	"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
+	"github.com/lib/pq"
+)
+
+// OrphanReaper is the dependency the sweeper takes from provisioner.
+// Extracted as an interface so the sweeper is unit-testable without
+// a real Docker daemon — matches the ContainerChecker pattern in
+// healthsweep.go. *provisioner.Provisioner satisfies this naturally.
+type OrphanReaper interface {
+	ListWorkspaceContainerIDPrefixes(ctx context.Context) ([]string, error)
+	Stop(ctx context.Context, workspaceID string) error
+	RemoveVolume(ctx context.Context, workspaceID string) error
+}
+
+// isLikelyWorkspaceID accepts strings shaped like a UUID prefix —
+// hex chars and `-` only. Workspace IDs are full UUIDs and the
+// container-name truncation keeps the hex prefix intact, so any
+// container name that doesn't match this is by definition not one
+// of ours and should be skipped. Also doubles as a SQL LIKE
+// wildcard guard (rejects `_` and `%`).
+func isLikelyWorkspaceID(s string) bool {
+	if s == "" {
+		return false
+	}
+	for _, r := range s {
+		switch {
+		case r >= '0' && r <= '9':
+		case r >= 'a' && r <= 'f':
+		case r >= 'A' && r <= 'F':
+		case r == '-':
+		default:
+			return false
+		}
+	}
+	return true
+}
+
+// OrphanSweepInterval is the cadence of the reconcile loop. 60s
+// matches the heartbeat cadence (30s) × 2 — a single missed cleanup
+// surfaces within ~90s end-to-end (canvas delete → next sweep tick →
+// container gone). Faster cycles would just pay Docker API cost for
+// no UX win; slower would let leaks linger long enough to compound
+// CPU pressure on dev hosts.
+const OrphanSweepInterval = 60 * time.Second
+
+// orphanSweepDeadline bounds a single sweep cycle. A daemon at the
+// edge of timing out shouldn't accumulate goroutines. 30s is generous
+// for a dev host with dozens of containers and a busy daemon.
+const orphanSweepDeadline = 30 * time.Second
+
+// StartOrphanSweeper runs the reconcile loop until ctx is cancelled.
+// nil reaper makes the loop a no-op (matches handlers'
+// nil-provisioner-tolerant pattern — some test harnesses run without
+// Docker available).
+func StartOrphanSweeper(ctx context.Context, reaper OrphanReaper) {
+	if reaper == nil {
+		log.Println("Orphan sweeper: reaper is nil — sweeper disabled")
+		return
+	}
+	log.Printf("Orphan sweeper started — reconciling every %s", OrphanSweepInterval)
+	ticker := time.NewTicker(OrphanSweepInterval)
+	defer ticker.Stop()
+	// Run once immediately so a platform restart cleans up any
+	// containers leaked while we were down — don't make the user
+	// wait 60s for the first reconcile.
+	sweepOnce(ctx, reaper)
+	for {
+		select {
+		case <-ctx.Done():
+			log.Println("Orphan sweeper: shutdown")
+			return
+		case <-ticker.C:
+			sweepOnce(ctx, reaper)
+		}
+	}
+}
+
+func sweepOnce(parent context.Context, reaper OrphanReaper) {
+	ctx, cancel := context.WithTimeout(parent, orphanSweepDeadline)
+	defer cancel()
+
+	prefixes, err := reaper.ListWorkspaceContainerIDPrefixes(ctx)
+	if err != nil {
+		log.Printf("Orphan sweeper: ListWorkspaceContainerIDPrefixes failed: %v — skipping cycle", err)
+		return
+	}
+	if len(prefixes) == 0 {
+		return
+	}
+
+	// Resolve each prefix to a full workspace_id whose status is
+	// 'removed'. The platform's workspace IDs are full UUIDs but
+	// container names are truncated to 12 chars — an UPPER BOUND
+	// of one match per prefix is guaranteed by the DB (UUID v4
+	// collisions in the first 12 chars across active rows are
+	// statistically negligible). Use a single IN-style query so
+	// the cost is one round-trip regardless of leak count.
+	//
+	// Defence: drop any prefix whose contents fall outside the
+	// hex-and-dash UUID alphabet. Workspace IDs are UUIDs, so
+	// container names follow ws-<12 hex chars>. Anything else is
+	// either a non-workspace container that slipped past the
+	// substring-match Docker filter (workspace-runner, etc.) or a
+	// malformed entry — neither should be turned into a LIKE
+	// pattern. Also blocks SQL LIKE wildcards (`_` and `%`) from
+	// reaching the query, even though Docker's container-name
+	// validation would already have rejected them upstream.
+	likes := make([]string, 0, len(prefixes))
+	for _, p := range prefixes {
+		if !isLikelyWorkspaceID(p) {
+			continue
+		}
+		likes = append(likes, p+"%")
+	}
+	if len(likes) == 0 {
+		return
+	}
+	rows, err := db.DB.QueryContext(ctx, `
+		SELECT id::text
+		  FROM workspaces
+		 WHERE status = 'removed'
+		   AND id::text LIKE ANY($1::text[])
+	`, pq.Array(likes))
+	if err != nil {
+		log.Printf("Orphan sweeper: DB query failed: %v — skipping cycle", err)
+		return
+	}
+	defer rows.Close()
+
+	var orphanIDs []string
+	for rows.Next() {
+		var id string
+		if scanErr := rows.Scan(&id); scanErr != nil {
+			log.Printf("Orphan sweeper: row scan failed: %v", scanErr)
+			continue
+		}
+		orphanIDs = append(orphanIDs, id)
+	}
+	if err := rows.Err(); err != nil {
+		log.Printf("Orphan sweeper: rows iteration failed: %v", err)
+		return
+	}
+
+	for _, id := range orphanIDs {
+		log.Printf("Orphan sweeper: stopping leaked container for removed workspace %s", id)
+		if stopErr := reaper.Stop(ctx, id); stopErr != nil {
+			// Stop returns the wrapped Docker error (treating
+			// "container not found" as nil-success via
+			// isContainerNotFound), so a non-nil here means the
+			// container is genuinely still alive — daemon timeout,
+			// ctx cancellation, or a transient socket EOF.
+			// Skip RemoveVolume so we don't fall into the same
+			// Stop-failed-then-volume-in-use trap that motivated
+			// this sweeper. The next cycle (60s out) retries Stop.
+			log.Printf("Orphan sweeper: Stop failed for %s: %v — leaving volume for next cycle", id, stopErr)
+			continue
+		}
+		if rmErr := reaper.RemoveVolume(ctx, id); rmErr != nil {
+			log.Printf("Orphan sweeper: RemoveVolume warning for %s: %v", id, rmErr)
+		}
+	}
+}
diff --git a/workspace-server/internal/registry/orphan_sweeper_test.go b/workspace-server/internal/registry/orphan_sweeper_test.go
new file mode 100644
index 00000000..ec17a5c1
--- /dev/null
+++ b/workspace-server/internal/registry/orphan_sweeper_test.go
@@ -0,0 +1,255 @@
+package registry
+
+import (
+	"context"
+	"errors"
+	"sync"
+	"testing"
+	"time"
+
+	"github.com/DATA-DOG/go-sqlmock"
+)
+
+// fakeReaper is a hand-rolled OrphanReaper for the sweeper tests.
+// Records every Stop / RemoveVolume call so tests can assert which
+// workspace IDs got reconciled.
+type fakeReaper struct {
+	mu             sync.Mutex
+	listResponse   []string
+	listErr        error
+	stopErr        map[string]error
+	removeVolErr   map[string]error
+	stopCalls      []string
+	removeVolCalls []string
+}
+
+func (f *fakeReaper) ListWorkspaceContainerIDPrefixes(_ context.Context) ([]string, error) {
+	if f.listErr != nil {
+		return nil, f.listErr
+	}
+	return f.listResponse, nil
+}
+
+func (f *fakeReaper) Stop(_ context.Context, wsID string) error {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+	f.stopCalls = append(f.stopCalls, wsID)
+	return f.stopErr[wsID]
+}
+
+func (f *fakeReaper) RemoveVolume(_ context.Context, wsID string) error {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+	f.removeVolCalls = append(f.removeVolCalls, wsID)
+	return f.removeVolErr[wsID]
+}
+
+// TestSweepOnce_ReconcilesRunningRemovedRows — the core reconcile
+// behavior: a container running for a workspace whose DB row is
+// 'removed' gets stopped + volume removed.
+func TestSweepOnce_ReconcilesRunningRemovedRows(t *testing.T) {
+	mock := setupTestDB(t)
+	setupTestRedis(t)
+
+	// Docker reports two ws-* containers; one's row is 'removed'
+	// (the leak), the other's is 'online' (the DB rightly excludes
+	// it from the WHERE clause and we should NOT reap it).
+	reaper := &fakeReaper{
+		listResponse: []string{"abc123def456", "xyz789ghi012"},
+	}
+
+	// The query asks for status='removed' rows whose id matches the
+	// LIKE patterns built from the running container prefixes. Mock
+	// returns only the leaked one as a UUID-shaped full id.
+	mock.ExpectQuery(`SELECT id::text\s+FROM workspaces`).
+		WillReturnRows(sqlmock.NewRows([]string{"id"}).
+			AddRow("abc123def456-0000-0000-0000-000000000000"))
+
+	sweepOnce(context.Background(), reaper)
+
+	if len(reaper.stopCalls) != 1 || reaper.stopCalls[0] != "abc123def456-0000-0000-0000-000000000000" {
+		t.Errorf("Stop calls = %v, want exactly the leaked id", reaper.stopCalls)
+	}
+	if len(reaper.removeVolCalls) != 1 || reaper.removeVolCalls[0] != "abc123def456-0000-0000-0000-000000000000" {
+		t.Errorf("RemoveVolume calls = %v, want exactly the leaked id", reaper.removeVolCalls)
+	}
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Errorf("unmet sqlmock expectations: %v", err)
+	}
+}
+
+// TestSweepOnce_NoRunningContainers — Docker returns nothing, sweeper
+// short-circuits without a DB query (no leak possible if no
+// containers exist).
+func TestSweepOnce_NoRunningContainers(t *testing.T) {
+	mock := setupTestDB(t)
+	setupTestRedis(t)
+
+	reaper := &fakeReaper{listResponse: nil}
+
+	// No DB query expected — if sweepOnce makes one anyway the
+	// sqlmock will fail "unexpected query".
+	sweepOnce(context.Background(), reaper)
+
+	if len(reaper.stopCalls) != 0 {
+		t.Errorf("Stop should not fire when no containers exist; got %v", reaper.stopCalls)
+	}
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Errorf("unmet sqlmock expectations: %v", err)
+	}
+}
+
+// TestSweepOnce_DockerListErrorSkipsCycle — a Docker daemon hiccup
+// must not cascade into a DB query (otherwise we'd reap based on
+// stale information). Skip the cycle, retry next tick.
+func TestSweepOnce_DockerListErrorSkipsCycle(t *testing.T) {
+	mock := setupTestDB(t)
+	setupTestRedis(t)
+
+	reaper := &fakeReaper{listErr: errors.New("daemon unreachable")}
+	sweepOnce(context.Background(), reaper)
+
+	if len(reaper.stopCalls) != 0 {
+		t.Errorf("Stop must not fire when Docker list failed; got %v", reaper.stopCalls)
+	}
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Errorf("unmet sqlmock expectations: %v", err)
+	}
+}
+
+// TestSweepOnce_StopFailureLeavesVolume — if Stop fails, RemoveVolume
+// MUST NOT fire. This is the same trap that motivated the sweeper:
+// removing a volume held by a still-running container always errors
+// with "volume in use", and we'd accumulate noise in the log without
+// actually fixing anything. Leave the volume for the next sweep
+// (which will retry Stop).
+func TestSweepOnce_StopFailureLeavesVolume(t *testing.T) {
+	mock := setupTestDB(t)
+	setupTestRedis(t)
+
+	reaper := &fakeReaper{
+		listResponse: []string{"abc123def456"},
+		stopErr: map[string]error{
+			"abc123def456-0000-0000-0000-000000000000": errors.New("docker daemon timeout"),
+		},
+	}
+	mock.ExpectQuery(`SELECT id::text\s+FROM workspaces`).
+		WillReturnRows(sqlmock.NewRows([]string{"id"}).
+			AddRow("abc123def456-0000-0000-0000-000000000000"))
+
+	sweepOnce(context.Background(), reaper)
+
+	if len(reaper.stopCalls) != 1 {
+		t.Errorf("Stop should have been attempted exactly once, got %v", reaper.stopCalls)
+	}
+	if len(reaper.removeVolCalls) != 0 {
+		t.Errorf("RemoveVolume must not fire when Stop failed; got %v", reaper.removeVolCalls)
+	}
+}
+
+// TestSweepOnce_VolumeRemoveErrorIsNonFatal — RemoveVolume failures
+// are logged but don't prevent processing other orphans in the same
+// cycle. Belt + braces against a transient daemon issue mid-loop.
+func TestSweepOnce_VolumeRemoveErrorIsNonFatal(t *testing.T) {
+	mock := setupTestDB(t)
+	setupTestRedis(t)
+
+	reaper := &fakeReaper{
+		listResponse: []string{"aaa111bbb222", "ccc333ddd444"},
+		removeVolErr: map[string]error{
+			"aaa111bbb222-0000-0000-0000-000000000000": errors.New("volume not found"),
+		},
+	}
+	mock.ExpectQuery(`SELECT id::text\s+FROM workspaces`).
+		WillReturnRows(sqlmock.NewRows([]string{"id"}).
+			AddRow("aaa111bbb222-0000-0000-0000-000000000000").
+			AddRow("ccc333ddd444-0000-0000-0000-000000000000"))
+
+	sweepOnce(context.Background(), reaper)
+
+	if len(reaper.stopCalls) != 2 {
+		t.Errorf("both orphans should have been Stopped; got %v", reaper.stopCalls)
+	}
+	if len(reaper.removeVolCalls) != 2 {
+		t.Errorf("both orphans should have had RemoveVolume attempted; got %v", reaper.removeVolCalls)
+	}
+}
+
+// TestSweepOnce_FiltersNonWorkspacePrefixes — the Docker name filter
+// is a SUBSTRING match so containers like "my-ws-thing" can slip
+// through. The HasPrefix check in the provisioner trims those, but
+// the in-sweeper isLikelyWorkspaceID guard is the second line of
+// defence: anything outside the UUID alphabet (hex + dashes) is
+// rejected before being turned into a SQL LIKE pattern. Locks in
+// that no DB query fires when every prefix is filtered out.
+func TestSweepOnce_FiltersNonWorkspacePrefixes(t *testing.T) {
+	mock := setupTestDB(t)
+	setupTestRedis(t)
+
+	reaper := &fakeReaper{
+		listResponse: []string{
+			"not_a_uuid_at_all",            // underscore not in UUID alphabet
+			"contains%wildcard",            // SQL LIKE wildcard — must not reach the query
+			"contains_wildcard",            // SQL LIKE single-char wildcard
+			"",                             // empty
+			"valid-but-non-workspace-name", // dash + lowercase letters that aren't hex
+		},
+	}
+
+	// No DB query expected — every prefix is rejected before the
+	// query builds, so we short-circuit. sqlmock fails on any
+	// unexpected query.
+	sweepOnce(context.Background(), reaper)
+
+	if len(reaper.stopCalls) != 0 {
+		t.Errorf("Stop must not fire when all prefixes filtered; got %v", reaper.stopCalls)
+	}
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Errorf("unmet sqlmock expectations: %v", err)
+	}
+}
+
+// TestIsLikelyWorkspaceID — pin the alphabet directly. This is the
+// guard that prevents SQL LIKE wildcards (`%`, `_`) from reaching
+// the sweeper's query.
+func TestIsLikelyWorkspaceID(t *testing.T) {
+	cases := []struct {
+		in   string
+		want bool
+	}{
+		{"abc123def456", true},
+		{"abcdef-1234-5678-90ab-cdef00112233", true},
+		{"ABC123DEF456", true}, // uppercase hex still allowed
+		{"", false},
+		{"abc_123", false},      // underscore (SQL LIKE single-char wildcard)
+		{"abc%123", false},      // percent (SQL LIKE multi-char wildcard)
+		{"hello world", false},  // space, non-hex letters
+		{"valid-but-not", false}, // 'l', 't', 'n' aren't hex
+		{"abc 123", false},
+		{".../escape", false},
+	}
+	for _, tc := range cases {
+		got := isLikelyWorkspaceID(tc.in)
+		if got != tc.want {
+			t.Errorf("isLikelyWorkspaceID(%q) = %v, want %v", tc.in, got, tc.want)
+		}
+	}
+}
+
+// TestStartOrphanSweeper_NilReaperIsNoOp — tolerance for the
+// nil-provisioner path used by some test harnesses.
+func TestStartOrphanSweeper_NilReaperIsNoOp(t *testing.T) {
+	// Should return immediately without panicking. Wrap in a goroutine
+	// + done-channel so we can assert it didn't block.
+	done := make(chan struct{})
+	go func() {
+		StartOrphanSweeper(context.Background(), nil)
+		close(done)
+	}()
+	select {
+	case <-done:
+		// expected
+	case <-time.After(500 * time.Millisecond):
+		t.Fatal("StartOrphanSweeper(nil) blocked instead of returning immediately")
+	}
+}
diff --git a/workspace-server/internal/router/router.go b/workspace-server/internal/router/router.go
index 6242067a..abaa9b86 100644
--- a/workspace-server/internal/router/router.go
+++ b/workspace-server/internal/router/router.go
@@ -308,6 +308,7 @@ func Setup(hub *ws.Hub, broadcaster *events.Broadcaster, prov *provisioner.Provi
 		wsAuth.PUT("/secrets", sech.Set)
 		wsAuth.DELETE("/secrets/:key", sech.Delete)
 		wsAuth.GET("/model", sech.GetModel)
+		wsAuth.PUT("/model", sech.SetModel)
 
 		// Token usage metrics — cost transparency (#593).
 		// WorkspaceAuth middleware (on wsAuth) binds the bearer to :id.
@@ -402,6 +403,17 @@ func Setup(hub *ws.Hub, broadcaster *events.Broadcaster, prov *provisioner.Provi
 		r.POST("/admin/a2a-queue/drop-stale", middleware.AdminAuth(db.DB), qH.DropStale)
 	}
 
+	// Admin — workspace template image refresh. Pulls latest images from GHCR
+	// and recreates running ws-* containers so they adopt the new image.
+	// Final step of the runtime CD chain — see docs/workspace-runtime-package.md.
+	// Operators (or post-publish automation) hit this after a runtime release.
+	// Reuses the provisioner's Docker client; no-op when prov is nil
+	// (test / non-Docker deploy).
+	if prov != nil {
+		imgH := handlers.NewAdminWorkspaceImagesHandler(prov.DockerClient())
+		r.POST("/admin/workspace-images/refresh", middleware.AdminAuth(db.DB), imgH.Refresh)
+	}
+
 	// Admin — test token minting (issue #6). Hidden in production via TestTokensEnabled().
 	// NOT behind AdminAuth — this is the bootstrap endpoint E2E tests and
 	// fresh installs use to obtain their first admin bearer. Adding AdminAuth
@@ -470,6 +482,14 @@ func Setup(hub *ws.Hub, broadcaster *events.Broadcaster, prov *provisioner.Provi
 	wsAuth.PUT("/files/*path", tmplh.WriteFile)
 	wsAuth.DELETE("/files/*path", tmplh.DeleteFile)
 
+	// Chat attachments — file upload (user → agent) and binary-safe
+	// streaming download (agent → user). Namespaced under /chat/ so
+	// the security model is obviously distinct from /files/* (which
+	// handles workspace config/templates and has a different caller).
+	chatfh := handlers.NewChatFilesHandler(tmplh)
+	wsAuth.POST("/chat/uploads", chatfh.Upload)
+	wsAuth.GET("/chat/download", chatfh.Download)
+
 	// Plugins
 	pluginsDir := findPluginsDir(configsDir)
 	// Runtime lookup lets the plugins handler filter the registry to plugins
diff --git a/workspace-server/migrations/043_workspace_status_enum.down.sql b/workspace-server/migrations/043_workspace_status_enum.down.sql
new file mode 100644
index 00000000..4df43519
--- /dev/null
+++ b/workspace-server/migrations/043_workspace_status_enum.down.sql
@@ -0,0 +1,24 @@
+-- 043_workspace_status_enum.down.sql
+--
+-- Reverse 043_workspace_status_enum.up.sql: convert workspaces.status
+-- back to plain TEXT and drop the workspace_status enum type.
+
+BEGIN;
+
+-- Symmetric with the up migration: a rollback under the same load
+-- that motivated the up-file's 5s lock_timeout would otherwise stall
+-- writers indefinitely.
+SET LOCAL lock_timeout = '5s';
+
+ALTER TABLE workspaces
+    ALTER COLUMN status DROP DEFAULT;
+
+ALTER TABLE workspaces
+    ALTER COLUMN status TYPE TEXT USING status::TEXT;
+
+ALTER TABLE workspaces
+    ALTER COLUMN status SET DEFAULT 'provisioning';
+
+DROP TYPE workspace_status;
+
+COMMIT;
diff --git a/workspace-server/migrations/043_workspace_status_enum.up.sql b/workspace-server/migrations/043_workspace_status_enum.up.sql
new file mode 100644
index 00000000..ac8af646
--- /dev/null
+++ b/workspace-server/migrations/043_workspace_status_enum.up.sql
@@ -0,0 +1,84 @@
+-- 043_workspace_status_enum.up.sql
+--
+-- Convert workspaces.status from free-form TEXT to a real Postgres
+-- ENUM type. The previous shape (TEXT DEFAULT 'provisioning' with no
+-- CHECK constraint, set by 001_workspaces.sql) let any handler write
+-- any string, including typos and stale values from older code paths.
+-- Locking the value set forces every writer to use one of the agreed
+-- states and lets us add a new state (`degraded`, used by the SDK
+-- wedge detector landing in this same change) without losing type
+-- safety on the column.
+--
+-- Value set covers every status the production codebase actually writes:
+--
+--   provisioning  — workspace row exists, container is being created
+--                   (initial INSERT default)
+--   online        — heartbeat fresh + last response was successful
+--   offline       — Redis liveness key expired (ws-side dead) or
+--                   the proxy detected an unreachable upstream
+--   degraded      — runtime is alive but reporting trouble (heartbeat
+--                   error_rate >= 0.5, OR new in this change:
+--                   workspace explicitly reported runtime_state="wedged")
+--   failed        — provisioning never completed, or workspace marked
+--                   itself failed via bundle import / runtime crash
+--   removed       — soft-delete tombstone; the row stays so foreign-
+--                   key references survive but no operations target it
+--   paused        — operator-initiated suspend via workspace_restart's
+--                   pause path (workspace_restart.go:406)
+--   hibernated    — auto-suspended after idle threshold; container
+--                   stopped but row preserved (workspace_restart.go:283,
+--                   introduced by migration 029_workspace_hibernation)
+--
+-- Sweep of every `UPDATE workspaces SET status = 'X'` in the
+-- workspace-server/internal/ tree (excluding tests) verified the
+-- value set. Adding a new state in the future requires both updating
+-- this enum (a separate `ALTER TYPE … ADD VALUE` migration) AND any
+-- writers — the enum will reject unknown strings at insert/update
+-- time, which is the exact failure mode this migration is meant to
+-- give us.
+--
+-- Deployment: `ALTER TABLE … ALTER COLUMN TYPE` takes ACCESS
+-- EXCLUSIVE on workspaces. A long-running SELECT against the table
+-- will block the migration; the migration will then block every
+-- writer behind it. `SET lock_timeout` aborts the migration in 5s
+-- if it can't acquire the lock — preferable to stalling the whole
+-- workspace fleet behind one slow query.
+
+BEGIN;
+
+SET LOCAL lock_timeout = '5s';
+
+CREATE TYPE workspace_status AS ENUM (
+    'provisioning',
+    'online',
+    'offline',
+    'degraded',
+    'failed',
+    'removed',
+    'paused',
+    'hibernated'
+);
+
+-- The two-step ALTER (DROP DEFAULT then change type then SET DEFAULT)
+-- is required because Postgres rejects an ALTER COLUMN TYPE on a
+-- column that has a DEFAULT whose expression doesn't match the new
+-- type. The intermediate moment with no default is fine — no INSERT
+-- happens between these statements inside the same transaction.
+--
+-- The `USING status::workspace_status` cast is the type-conversion
+-- expression Postgres needs when the source and target types aren't
+-- assignment-compatible. If any existing row has a status value
+-- outside the enum's set, this statement aborts the transaction and
+-- the migration leaves the table untouched — that's the correct
+-- behavior (we'd want to know about the rogue value before locking
+-- the type).
+ALTER TABLE workspaces
+    ALTER COLUMN status DROP DEFAULT;
+
+ALTER TABLE workspaces
+    ALTER COLUMN status TYPE workspace_status USING status::workspace_status;
+
+ALTER TABLE workspaces
+    ALTER COLUMN status SET DEFAULT 'provisioning'::workspace_status;
+
+COMMIT;
diff --git a/workspace/a2a_client.py b/workspace/a2a_client.py
index 1a3e4f76..d740bd6a 100644
--- a/workspace/a2a_client.py
+++ b/workspace/a2a_client.py
@@ -10,7 +10,7 @@ import uuid
 
 import httpx
 
-from platform_auth import auth_headers
+from platform_auth import auth_headers, self_source_headers
 
 logger = logging.getLogger(__name__)
 
@@ -56,9 +56,15 @@ async def send_a2a_message(target_url: str, message: str) -> str:
         timeout=httpx.Timeout(connect=30.0, read=300.0, write=30.0, pool=30.0)
     ) as client:
         try:
+            # self_source_headers() includes X-Workspace-ID so the
+            # platform's a2a_receive logger records source_id =
+            # WORKSPACE_ID. Otherwise peer-A2A messages — including
+            # the case where target_url resolves to this workspace's
+            # own /a2a — get logged with source_id=NULL and surface
+            # in the recipient's My Chat tab as user-typed input.
             resp = await client.post(
                 target_url,
-                headers=auth_headers(),
+                headers=self_source_headers(WORKSPACE_ID),
                 json={
                     "jsonrpc": "2.0",
                     "id": str(uuid.uuid4()),
@@ -81,10 +87,40 @@ async def send_a2a_message(target_url: str, message: str) -> str:
                     return f"{_A2A_ERROR_PREFIX}{text}"
                 return text
             elif "error" in data:
-                return f"{_A2A_ERROR_PREFIX}{data['error'].get('message', 'unknown')}"
-            return str(data)
+                err = data["error"]
+                msg = (err.get("message") or "").strip()
+                code = err.get("code")
+                if msg and code is not None:
+                    detail = f"{msg} (code={code})"
+                elif msg:
+                    detail = msg
+                elif code is not None:
+                    detail = f"JSON-RPC error with no message (code={code})"
+                else:
+                    detail = "JSON-RPC error with no message"
+                return f"{_A2A_ERROR_PREFIX}{detail} [target={target_url}]"
+            return f"{_A2A_ERROR_PREFIX}unexpected response shape (no result, no error): {str(data)[:200]} [target={target_url}]"
         except Exception as e:
-            return f"{_A2A_ERROR_PREFIX}{e}"
+            # Some httpx exceptions stringify to empty (RemoteProtocolError,
+            # ConnectionReset variants) — the canvas would then render
+            # "[A2A_ERROR] " with no detail and the operator has no signal
+            # to act on. Always include the exception class name and the
+            # target URL so the activity log + Agent Comms panel have
+            # actionable information without a trip through container logs.
+            msg = str(e).strip()
+            type_name = type(e).__name__
+            if not msg:
+                detail = f"{type_name} (no message — likely connection reset or silent timeout)"
+            elif msg.startswith(f"{type_name}:") or msg.startswith(f"{type_name} "):
+                # Already prefixed with the type — don't double-prefix.
+                # Prefix-anchored check (not substring) so a message that
+                # happens to mention some OTHER class name mid-string
+                # (e.g. "got OSError on read") doesn't suppress our own
+                # type prefix and lose the diagnostic signal.
+                detail = msg
+            else:
+                detail = f"{type_name}: {msg}"
+            return f"{_A2A_ERROR_PREFIX}{detail} [target={target_url}]"
 
 
 async def get_peers() -> list[dict]:
diff --git a/workspace/a2a_executor.py b/workspace/a2a_executor.py
index 69c56398..590be813 100644
--- a/workspace/a2a_executor.py
+++ b/workspace/a2a_executor.py
@@ -48,6 +48,10 @@ from shared_runtime import (
     brief_task,
     set_current_task,
 )
+from executor_helpers import (
+    collect_outbound_files,
+    extract_attached_files,
+)
 from builtin_tools.telemetry import (
     A2A_TASK_ID,
     GEN_AI_OPERATION_NAME,
@@ -211,6 +215,18 @@ class LangGraphA2AExecutor(AgentExecutor):
           3. Message(final_text)                      — terminal event
         """
         user_input = extract_message_text(context)
+        # Pull attached files from A2A message parts (kind: "file") and
+        # append a manifest to the prompt so the agent knows they exist.
+        # LangGraph tools (filesystem, bash, skills) can then open the
+        # files by path — without this the agent silently ignores the
+        # attachments and replies "I'm not sure what you're referring to".
+        _attached_files = extract_attached_files(getattr(context, "message", None))
+        if _attached_files:
+            _manifest = "\n\nAttached files:\n" + "\n".join(
+                f"- {f['name']} ({f['mime_type'] or 'unknown type'}) at {f['path']}"
+                for f in _attached_files
+            )
+            user_input = (user_input + _manifest) if user_input else _manifest.lstrip()
         if not user_input:
             parts = getattr(getattr(context, "message", None), "parts", None)
             logger.warning("A2A execute: no text content in message parts: %s", parts)
@@ -415,7 +431,38 @@ class LangGraphA2AExecutor(AgentExecutor):
                 # Non-streaming: ResultAggregator.consume_all() returns this
                 #   immediately as the response (a2a_client.py reads .parts[0].text).
                 # Streaming: yielded as the last SSE event in the stream.
-                msg = new_agent_text_message(final_text, task_id=task_id, context_id=context_id)
+                #
+                # If the reply mentions /workspace/... paths, stage each one
+                # and emit as FileParts alongside the text so the canvas can
+                # render a download button. Same contract the hermes executor
+                # uses — every runtime going through this code path (langgraph,
+                # deepagents, future ReAct variants) inherits it.
+                _outbound = collect_outbound_files(final_text)
+                if _outbound:
+                    # NOTE: do NOT re-import `Part` here. It is already imported
+                    # at module scope (line 42). A function-scope `from a2a.types
+                    # import ... Part ...` would mark `Part` as a local name
+                    # throughout this function under Python's scoping rules,
+                    # making the earlier `Part(text=text)` call (line ~358, inside
+                    # the astream_events loop) raise UnboundLocalError because
+                    # the local binding is not yet in scope at that point.
+                    from a2a.types import FilePart, FileWithUri, Message, Role, TextPart
+                    _parts: list[Part] = [Part(root=TextPart(text=final_text))] if final_text else []
+                    for f in _outbound:
+                        _parts.append(Part(root=FilePart(file=FileWithUri(
+                            uri="workspace:" + f["path"],
+                            name=f["name"],
+                            mimeType=f["mime_type"],
+                        ))))
+                    msg = Message(
+                        messageId=uuid.uuid4().hex,
+                        role=Role.agent,
+                        parts=_parts,
+                        taskId=task_id,
+                        contextId=context_id,
+                    )
+                else:
+                    msg = new_agent_text_message(final_text, task_id=task_id, context_id=context_id)
                 # Attach tool_trace via metadata when supported. Guarded with
                 # hasattr because some test mocks return a plain string here.
                 if tool_trace and hasattr(msg, "metadata"):
diff --git a/workspace/a2a_tools.py b/workspace/a2a_tools.py
index 691491d7..3e3671bb 100644
--- a/workspace/a2a_tools.py
+++ b/workspace/a2a_tools.py
@@ -112,7 +112,7 @@ def _auth_headers_for_heartbeat() -> dict[str, str]:
 
 async def report_activity(
     activity_type: str, target_id: str = "", summary: str = "", status: str = "ok",
-    task_text: str = "", response_text: str = "",
+    task_text: str = "", response_text: str = "", error_detail: str = "",
 ):
     """Report activity to the platform for live progress tracking."""
     try:
@@ -129,6 +129,13 @@ async def report_activity(
                 payload["request_body"] = {"task": task_text}
             if response_text:
                 payload["response_body"] = {"result": response_text}
+            if error_detail:
+                # error_detail is a top-level activity row column on the
+                # platform (handlers/activity.go). Surfacing the cleaned
+                # exception string here lets the Activity tab render a
+                # red error chip + the cause without forcing the user
+                # to scroll into the raw response_body JSON.
+                payload["error_detail"] = error_detail
             await client.post(
                 f"{PLATFORM_URL}/workspaces/{WORKSPACE_ID}/activity",
                 json=payload,
@@ -178,11 +185,23 @@ async def tool_delegate_task(workspace_id: str, task: str) -> str:
     # Detect delegation failures — wrap them clearly so the calling agent
     # can decide to retry, use another peer, or handle the task itself.
     is_error = result.startswith(_A2A_ERROR_PREFIX)
+    # Strip the sentinel prefix so error_detail is the human-readable
+    # cause directly. The Activity tab's red error chip surfaces this
+    # without the user having to scroll into the raw response JSON.
+    #
+    # Cap at 4096 chars before sending — the platform's
+    # activity_logs.error_detail column is unbounded TEXT and a
+    # malicious or buggy peer could otherwise stream an arbitrarily
+    # large error message into the caller's activity log. 4096 is
+    # comfortably above any real exception traceback we've seen and
+    # well below an obvious-DoS threshold.
+    error_detail = result[len(_A2A_ERROR_PREFIX):].strip()[:4096] if is_error else ""
     await report_activity(
         "a2a_receive", workspace_id,
-        f"{peer_name} responded ({len(result)} chars)" if not is_error else f"{peer_name} failed",
+        f"{peer_name} responded ({len(result)} chars)" if not is_error else f"{peer_name} failed: {error_detail[:120]}",
         task_text=task, response_text=result,
         status="error" if is_error else "ok",
+        error_detail=error_detail,
     )
     if is_error:
         return (
diff --git a/workspace/builtin_tools/a2a_tools.py b/workspace/builtin_tools/a2a_tools.py
index 8c7be15f..df4f9d78 100644
--- a/workspace/builtin_tools/a2a_tools.py
+++ b/workspace/builtin_tools/a2a_tools.py
@@ -42,10 +42,15 @@ async def delegate_task(workspace_id: str, task: str) -> str:
         except Exception as e:
             return f"Error discovering workspace: {e}"
 
-        # Send A2A message
+        # Send A2A message. X-Workspace-ID identifies us as the source —
+        # without it the platform's a2a_receive logger writes
+        # source_id=NULL and the recipient's My Chat tab renders the
+        # delegation as if a human user typed it. Same hazard fixed
+        # in heartbeat.py / a2a_client.py / main.py initial+idle flows.
         try:
             a2a_resp = await client.post(
                 target_url,
+                headers={"X-Workspace-ID": WORKSPACE_ID},
                 json={
                     "jsonrpc": "2.0",
                     "id": str(uuid.uuid4()),
diff --git a/workspace/claude_sdk_executor.py b/workspace/claude_sdk_executor.py
index 893aafdb..a05823c8 100644
--- a/workspace/claude_sdk_executor.py
+++ b/workspace/claude_sdk_executor.py
@@ -29,7 +29,7 @@ import asyncio
 import logging
 import os
 import sys
-from collections.abc import AsyncIterator
+from collections.abc import AsyncIterator, Callable
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any
 
@@ -47,7 +47,9 @@ from executor_helpers import (
     WORKSPACE_MOUNT,
     auto_push_hook,
     brief_summary,
+    collect_outbound_files,
     commit_memory,
+    extract_attached_files,
     extract_message_text,
     get_a2a_instructions,
     get_hma_instructions,
@@ -85,6 +87,180 @@ _RETRYABLE_PATTERNS = (
     "try again",
 )
 
+# Module-level SDK-wedge flag. When claude_agent_sdk's `query.initialize()`
+# raises `Control request timeout: initialize`, the SDK's internal client-
+# process state is corrupted for the rest of the Python process — every
+# subsequent `_run_query()` call hits the same wedge and re-throws. The
+# executor itself can't auto-recover (the underlying CLI subprocess and
+# its read pipe are in an unrecoverable state); only a workspace restart
+# clears it.
+#
+# The heartbeat task reads these helpers and reports
+# `runtime_state="wedged"` to the platform, which flips the workspace to
+# `degraded` so the canvas surfaces a Restart hint instead of leaving
+# the user staring at a green dot while every chat hangs.
+#
+# Module scope (not instance scope) is deliberate: the wedge is a
+# property of the Python process, not the executor. A future per-org
+# multi-executor design could move this to a shared registry, but with
+# one executor per workspace process today the simplest lock-free
+# read+write fits.
+_sdk_wedged_reason: str | None = None
+
+
+def is_wedged() -> bool:
+    """True if the Claude SDK has hit a non-recoverable init wedge in
+    this process. Sticky until process restart."""
+    return _sdk_wedged_reason is not None
+
+
+def wedge_reason() -> str:
+    """Human-readable description of the wedge cause, or empty string
+    when not wedged. Surfaced to the canvas via heartbeat sample_error."""
+    return _sdk_wedged_reason or ""
+
+
+def _mark_sdk_wedged(reason: str) -> None:
+    """Internal — flag the SDK as wedged. Only the first call wins
+    (subsequent identical wedges shouldn't overwrite a more specific
+    reason). Tests use `_reset_sdk_wedge_for_test()` to clear."""
+    global _sdk_wedged_reason
+    if _sdk_wedged_reason is None:
+        _sdk_wedged_reason = reason
+        logger.error("SDK wedge detected: %s — workspace will report degraded until a successful query clears it", reason)
+
+
+def _clear_sdk_wedge_on_success() -> None:
+    """Auto-recovery — called from _run_query after a successful
+    completion. The original wedge could be transient (a single network
+    blip during the SDK's first-message handshake), and a sticky-only
+    flag would lock the workspace into degraded forever even after the
+    SDK started working again. Clearing on observed success means the
+    next heartbeat after a working query reports `runtime_state` empty
+    and the platform flips status back to online.
+
+    No-op when not wedged (the common case)."""
+    global _sdk_wedged_reason
+    if _sdk_wedged_reason is not None:
+        logger.info("SDK wedge cleared after successful query — workspace will recover to online on next heartbeat")
+        _sdk_wedged_reason = None
+
+
+def _reset_sdk_wedge_for_test() -> None:
+    """Test-only escape hatch. Production code clears the wedge via
+    `_clear_sdk_wedge_on_success` when a query succeeds; this helper
+    is for unit tests that need to reset between cases."""
+    global _sdk_wedged_reason
+    _sdk_wedged_reason = None
+
+
+# Per-tool-use summarizers. Reads the most-useful argument from each
+# tool's input dict so the canvas progress feed shows
+# `🛠 Read /tmp/foo` instead of the bare tool name. Anything not in the
+# table falls through to a generic "🛠 <tool>(…)" line. Order keys by
+# tool frequency so a future contributor can see the high-traffic
+# tools first.
+_TOOL_USE_SUMMARIZERS: dict[str, Callable[[dict], str]] = {
+    "Read":  lambda i: f"📄 Read {i.get('file_path', '?')}",
+    "Write": lambda i: f"✍️  Write {i.get('file_path', '?')}",
+    "Edit":  lambda i: f"✏️  Edit {i.get('file_path', '?')}",
+    "Bash":  lambda i: f"⚡ Bash: {(i.get('command') or '')[:80]}",
+    "Glob":  lambda i: f"🔍 Glob {i.get('pattern', '?')}",
+    "Grep":  lambda i: f"🔍 Grep {i.get('pattern', '?')}",
+    "WebFetch": lambda i: f"🌐 WebFetch {i.get('url', '?')}",
+    "WebSearch": lambda i: f"🌐 WebSearch {i.get('query', '?')}",
+    "Task":  lambda i: f"🤖 Task: {(i.get('description') or '')[:60]}",
+    "TodoWrite": lambda _i: "📝 TodoWrite",
+}
+
+
+def _summarize_tool_use(tool_name: str, tool_input: dict) -> str:
+    summarizer = _TOOL_USE_SUMMARIZERS.get(tool_name)
+    if summarizer:
+        try:
+            return summarizer(tool_input or {})[:200]
+        except Exception:
+            pass
+    # Generic fallback. Truncated so a tool with a giant input dict
+    # doesn't write a 10kB activity row per call.
+    return f"🛠 {tool_name}(…)"[:200]
+
+
+async def _report_tool_use(block: Any) -> None:
+    """Fire-and-forget agent_log activity row per tool the SDK invoked,
+    so the canvas's MyChat live-progress feed can render each step
+    Claude is doing instead of staring at a single spinner.
+
+    Posts directly to /workspaces/:id/activity rather than through
+    a2a_tools.report_activity — that helper also pushes a current_task
+    heartbeat which would duplicate as a TASK_UPDATED line in the
+    chat feed. The workspace card's current_task is already set
+    once per turn by the executor's set_current_task(brief_summary)
+    call, so the per-tool telemetry stays a chat-only signal.
+
+    Best-effort — any failure (network blip, platform unreachable, the
+    block didn't have the attrs we expected) is swallowed silently.
+    The tool will still execute regardless; only the progress
+    telemetry is lost. Deliberately does NOT raise — a malformed
+    block must not abort the message-stream iteration in
+    `_run_query`.
+    """
+    try:
+        # Lazy imports to keep this helper non-essential — the
+        # executor must still run when the workspace's network/auth
+        # plumbing isn't fully set up (e.g. unit tests).
+        import httpx
+        from a2a_client import PLATFORM_URL, WORKSPACE_ID
+        from platform_auth import auth_headers
+    except Exception:
+        return
+    try:
+        tool_name = getattr(block, "name", "") or ""
+        tool_input = getattr(block, "input", {}) or {}
+        if not tool_name:
+            return
+        summary = _summarize_tool_use(tool_name, tool_input)
+        # 5s budget — long enough to absorb a single platform GC
+        # pause, short enough that a wedged platform doesn't slow
+        # the tool-iteration cadence beyond noticeable.
+        async with httpx.AsyncClient(timeout=5.0) as client:
+            await client.post(
+                f"{PLATFORM_URL}/workspaces/{WORKSPACE_ID}/activity",
+                json={
+                    "activity_type": "agent_log",
+                    "source_id": WORKSPACE_ID,
+                    # target_id == source for self-actions. Matches the
+                    # convention other self-logged activity rows use
+                    # (a2a_receive when the workspace logs its own
+                    # outbound reply) so DB consumers joining on
+                    # target_id see a well-defined value.
+                    "target_id": WORKSPACE_ID,
+                    "summary": summary,
+                    "status": "ok",
+                    "method": tool_name,
+                },
+                headers=auth_headers(),
+            )
+    except Exception:
+        # Telemetry failures must not break the conversation.
+        return
+
+
+# Substring patterns that classify an exception as the specific
+# claude_agent_sdk init-timeout wedge (vs. a rate-limit, transient
+# subprocess crash, etc.). Match is case-insensitive on the formatted
+# error string. Adding a new pattern here MUST come with a test in
+# tests/test_claude_sdk_executor.py — false-positives lock the
+# workspace into degraded until the next successful query clears it.
+#
+# `:initialize` suffix-anchored — the SDK can theoretically time out
+# on later control messages (in-flight tool callbacks), but those
+# don't leave the SDK in the unrecoverable post-init state we're
+# trying to detect. Limit the pattern to the specific wedge.
+_WEDGE_ERROR_PATTERNS = (
+    "control request timeout: initialize",
+)
+
 
 _SWALLOWED_STDERR_MARKER = "Check stderr output for details"
 
@@ -344,6 +520,14 @@ class ClaudeSDKExecutor(AgentExecutor):
                     for block in message.content:
                         if isinstance(block, sdk.TextBlock):
                             assistant_chunks.append(block.text)
+                        else:
+                            # ToolUseBlock / ServerToolUseBlock are present
+                            # on the real SDK but not on the conftest stub —
+                            # check by class name to avoid an isinstance()
+                            # against a class the stub doesn't define.
+                            cls = type(block).__name__
+                            if cls in ("ToolUseBlock", "ServerToolUseBlock"):
+                                await _report_tool_use(block)
                 elif isinstance(message, sdk.ResultMessage):
                     sid = getattr(message, "session_id", None)
                     if sid:
@@ -352,6 +536,20 @@ class ClaudeSDKExecutor(AgentExecutor):
         finally:
             self._active_stream = None
         text = result_text if result_text is not None else "".join(assistant_chunks)
+        # Auto-recover the wedge flag — if a previous query() left this
+        # process in `_sdk_wedged` and THIS query just completed
+        # cleanly, the SDK clearly works again. Clear so the next
+        # heartbeat reports runtime_state empty and the platform flips
+        # status degraded → online without a manual restart.
+        #
+        # Gate on actual content from the stream so a degenerate
+        # "iterator returned without raising but emitted nothing"
+        # case (possible from a partial stream or a stub SDK) doesn't
+        # falsely advertise recovery. A real successful query yields
+        # at least a ResultMessage (sets result_text) or one
+        # AssistantMessage TextBlock (populates assistant_chunks).
+        if result_text is not None or assistant_chunks:
+            _clear_sdk_wedge_on_success()
         return QueryResult(text=text, session_id=session_id)
 
     # ------------------------------------------------------------------
@@ -365,6 +563,18 @@ class ClaudeSDKExecutor(AgentExecutor):
         workspace queue rather than racing on `_session_id` / `_active_stream`.
         """
         user_input = extract_message_text(context.message)
+        # Surface attached files to claude-code via a manifest in the prompt.
+        # Claude Code reads files through its own Read/Glob tools by path —
+        # as long as the prompt names the path, the CLI will open them on
+        # demand. Same contract every platform runtime uses so the UX is
+        # identical across hermes / langgraph / claude-code.
+        attached = extract_attached_files(context.message)
+        if attached:
+            manifest = "\n\nAttached files:\n" + "\n".join(
+                f"- {f['name']} ({f['mime_type'] or 'unknown type'}) at {f['path']}"
+                for f in attached
+            )
+            user_input = (user_input + manifest) if user_input else manifest.lstrip()
         if not user_input:
             await event_queue.enqueue_event(new_agent_text_message(_NO_TEXT_MSG))
             return
@@ -375,7 +585,26 @@ class ClaudeSDKExecutor(AgentExecutor):
         # Enqueue outside the lock so the next queued turn can start
         # preparing its prompt while this turn's response ships. Event
         # ordering is preserved per-queue by the A2A server, so no races.
-        await event_queue.enqueue_event(new_agent_text_message(response_text))
+        # If the response mentions /workspace/... files, stage each and
+        # emit FileParts alongside the text so the canvas can download.
+        outbound = collect_outbound_files(response_text)
+        if outbound:
+            from a2a.types import FilePart, FileWithUri, Message, Part, Role, TextPart
+            import uuid as _uuid
+            parts: list = [Part(root=TextPart(text=response_text))] if response_text else []
+            for f in outbound:
+                parts.append(Part(root=FilePart(file=FileWithUri(
+                    uri="workspace:" + f["path"],
+                    name=f["name"],
+                    mimeType=f["mime_type"],
+                ))))
+            await event_queue.enqueue_event(Message(
+                messageId=_uuid.uuid4().hex,
+                role=Role.agent,
+                parts=parts,
+            ))
+        else:
+            await event_queue.enqueue_event(new_agent_text_message(response_text))
 
     @staticmethod
     def _is_retryable(exc: BaseException) -> bool:
@@ -473,6 +702,19 @@ class ClaudeSDKExecutor(AgentExecutor):
                     # subprocess died.
                     logger.error("SDK agent error [claude-code]: %s", formatted)
                     logger.exception("SDK agent error [claude-code] — full traceback follows")
+                    # Detect the specific claude_agent_sdk init-wedge case
+                    # so the heartbeat task can flip the workspace to
+                    # `degraded`. Match on the lowercased formatted error;
+                    # `formatted` is whatever _format_process_error built,
+                    # which already includes both the message and the
+                    # exception class name.
+                    formatted_lc = formatted.lower()
+                    for pat in _WEDGE_ERROR_PATTERNS:
+                        if pat in formatted_lc:
+                            _mark_sdk_wedged(
+                                f"claude_agent_sdk wedge: {formatted[:200]} — restart workspace to recover"
+                            )
+                            break
                     response_text = sanitize_agent_error(exc)
                     break
         finally:
diff --git a/workspace/executor_helpers.py b/workspace/executor_helpers.py
index f40fa6b7..8c60c9e0 100644
--- a/workspace/executor_helpers.py
+++ b/workspace/executor_helpers.py
@@ -10,16 +10,22 @@ Provides:
 - Brief task summary extraction (markdown-aware)
 - Error message sanitization (exception classes and subprocess categories)
 - Shared workspace path constants and the MCP server path resolver
+- Attached-file extraction and outbound-file staging (platform-wide chat
+  attachments — every runtime routes through these helpers so the
+  drag-dropped image / returned report experience is identical)
 """
 
 from __future__ import annotations
 
 import asyncio
+import base64
 import json
 import logging
+import mimetypes
 import os
 import re
 import subprocess
+import uuid as _uuid
 from pathlib import Path
 from typing import TYPE_CHECKING, Any
 
@@ -582,3 +588,276 @@ async def auto_push_hook(cwd: str | None = None) -> None:
         await asyncio.to_thread(_auto_push_and_pr_sync, cwd)
     except Exception:
         logger.exception("auto_push_hook: failed (non-fatal)")
+
+
+# ========================================================================
+# Chat attachments — platform-level support for drag-drop uploads and
+# agent-returned files. Every runtime executor routes inbound file parts
+# through ``extract_attached_files`` + ``build_user_content_with_files``
+# and post-processes replies through ``collect_outbound_files`` so a file
+# attached in the canvas shows up correctly across hermes, claude-code,
+# langgraph, CLI runtimes, etc. Living here (not in any one executor)
+# keeps the attachment contract in one place — match canvas/ChatTab.tsx
+# and workspace-server/internal/handlers/chat_files.go, and every runtime
+# benefits at once.
+# ========================================================================
+
+# Matches CHAT_UPLOAD_DIR in workspace-server/internal/handlers/chat_files.go.
+# The canvas uploads files here; outbound files get staged here so the
+# download endpoint (which whitelists this directory) can serve them.
+CHAT_UPLOADS_DIR = f"{WORKSPACE_MOUNT}/.molecule/chat-uploads"
+
+
+def ensure_workspace_writable() -> None:
+    """Make /workspace (and the chat-uploads dir) writable by whoever the
+    agent will run as.
+
+    Docker's default for a new named volume is root-owned 755 — that
+    bricks the agent→user "write a file, hand it to the user" flow for
+    every template whose agent runs under a non-root user (hermes uses
+    `agent`, most others use some dedicated UID too). Each Dockerfile
+    solving this individually was the anti-pattern; this helper belongs
+    to the platform so every runtime picks up the fix by calling into
+    ``molecule_runtime`` during boot.
+
+    Runs best-effort: if molecule-runtime itself started as non-root
+    (rare, but possible in some CP configurations), the chmod silently
+    no-ops — the template's own start.sh is expected to have already
+    handled perms in that case. We prefer silent degradation to a hard
+    boot failure because misconfigured perms are recoverable (user gets
+    a clear "permission denied" from the agent) but an uncatchable
+    exception here would wedge the whole workspace in `provisioning`.
+    """
+    # 777 matches the intent: one container, one tenant, anyone in the
+    # container can read/write workspace files. Cross-tenant isolation
+    # happens at the Docker boundary, not inside the volume.
+    for path in (WORKSPACE_MOUNT, CHAT_UPLOADS_DIR):
+        try:
+            os.makedirs(path, exist_ok=True)
+            os.chmod(path, 0o777)
+        except PermissionError:
+            logger.info(
+                "ensure_workspace_writable: lacking root (non-fatal) for %s", path
+            )
+        except OSError as exc:
+            logger.warning(
+                "ensure_workspace_writable: %s for %s", exc, path
+            )
+
+# Cap image inlining so a 25MB PNG doesn't blow past provider context
+# limits. Images larger than this fall back to a path mention only —
+# the agent can still read them via file_read / bash tools.
+MAX_INLINE_ATTACHMENT_BYTES = 8 * 1024 * 1024
+
+# Absolute /workspace/... paths the agent may mention in its reply.
+# Leading boundary prevents matching the middle of URLs like
+# https://example.com/workspace/foo while allowing markdown emphasis
+# wrappers (**, *, _, `, (, [) so "**/workspace/x.pdf**" still matches.
+# Trailing '.' is stripped post-capture (see collect_outbound_files).
+_WORKSPACE_PATH_RE = re.compile(
+    r"(?:^|[\s`\"'*_(\[])(/workspace/[A-Za-z0-9_./\-]+)"
+)
+_UNSAFE_NAME_RE = re.compile(r"[^A-Za-z0-9._\-]")
+
+
+def resolve_attachment_uri(uri: str) -> str | None:
+    """Resolve a canvas-issued attachment URI to an in-container path.
+
+    Accepted shapes (matches canvas uploads.ts + chat_files.go):
+      - ``workspace:/workspace/.molecule/chat-uploads/<name>``  (canonical)
+      - ``file:///workspace/...``                               (legacy)
+      - ``/workspace/...``                                      (bare)
+
+    Anything resolving outside ``/workspace`` is refused. ``Path.resolve``
+    collapses ``..`` segments so a crafted ``workspace:/workspace/../etc/passwd``
+    returns None instead of leaking the real filesystem.
+    """
+    if not uri:
+        return None
+    path: str | None = None
+    if uri.startswith("workspace:"):
+        path = uri[len("workspace:"):]
+    elif uri.startswith("file://"):
+        path = uri[len("file://"):]
+    elif uri.startswith("/"):
+        path = uri
+    if not path:
+        return None
+    try:
+        resolved = str(Path(path).resolve())
+    except (OSError, RuntimeError):
+        return None
+    if not (resolved == WORKSPACE_MOUNT or resolved.startswith(WORKSPACE_MOUNT + "/")):
+        return None
+    return resolved
+
+
+def extract_attached_files(message: Any) -> list[dict[str, str]]:
+    """Pull ``{name, mime_type, path}`` dicts out of an A2A message.
+
+    Handles the discriminated-union shape ``part.root.file`` that a2a-sdk
+    produces via Pydantic RootModel, and the flatter ``part.file`` shape
+    hand-built callers sometimes emit. Non-file parts and files with
+    unresolvable URIs are skipped — the caller sees an empty list rather
+    than a mix of valid and broken entries.
+    """
+    if message is None:
+        return []
+    parts = getattr(message, "parts", None) or []
+    out: list[dict[str, str]] = []
+    for part in parts:
+        root = getattr(part, "root", part)
+        if getattr(root, "kind", None) != "file":
+            continue
+        f = getattr(root, "file", None)
+        if f is None:
+            continue
+        uri = getattr(f, "uri", "") or ""
+        name = getattr(f, "name", "") or ""
+        mime = getattr(f, "mimeType", None) or getattr(f, "mime_type", None) or ""
+        path = resolve_attachment_uri(uri)
+        if not path or not os.path.isfile(path):
+            logger.warning("skipping attached file with unresolvable uri=%r", uri)
+            continue
+        out.append({"name": name, "mime_type": mime, "path": path})
+    return out
+
+
+def _read_as_data_url(path: str, mime_type: str) -> str | None:
+    """Return ``data:<mime>;base64,<...>`` or None if too large / unreadable."""
+    try:
+        size = os.path.getsize(path)
+    except OSError:
+        return None
+    if size > MAX_INLINE_ATTACHMENT_BYTES:
+        logger.info(
+            "attachment %s too large to inline (%d bytes > cap)", path, size
+        )
+        return None
+    try:
+        with open(path, "rb") as fh:
+            b64 = base64.b64encode(fh.read()).decode("ascii")
+    except OSError as exc:
+        logger.warning("failed to read attachment %s: %s", path, exc)
+        return None
+    return f"data:{mime_type or 'application/octet-stream'};base64,{b64}"
+
+
+def build_user_content_with_files(
+    user_text: str, attached: list[dict[str, str]]
+) -> Any:
+    """Combine text + attachments into an OpenAI-compat ``content`` field.
+
+    - No attachments → plain string (preserves simple shape for non-vision
+      models).
+    - Any image attachment → list-of-parts with text + image_url entries
+      (multi-modal; vision-capable models see the image bytes). Skipped
+      when ``MOLECULE_DISABLE_IMAGE_INLINING`` is truthy — some provider/
+      model combos (e.g. MiniMax's hermes-agent adapter as of 2026-04)
+      claim vision support but hang indefinitely on image payloads, and
+      the caller may prefer manifest-only so the agent can still use its
+      file_read tool instead of stalling the whole request.
+    - Non-image attachments → manifest appended to the text so the agent
+      knows the filenames + absolute paths and can inspect via its
+      file_read / bash tools.
+
+    This is the platform's one-line fix for "agent didn't know I attached
+    a file": any executor that calls it gets attachment awareness for
+    free, regardless of which LLM provider is behind it.
+    """
+    if not attached:
+        return user_text
+
+    manifest_lines = [
+        f"- {f['name']} ({f['mime_type'] or 'unknown type'}) at {f['path']}"
+        for f in attached
+    ]
+    manifest = "Attached files:\n" + "\n".join(manifest_lines)
+    combined = f"{user_text}\n\n{manifest}" if user_text else manifest
+
+    disable_inline = os.environ.get("MOLECULE_DISABLE_IMAGE_INLINING", "").lower() in (
+        "1", "true", "yes", "on",
+    )
+    if disable_inline or not any(
+        (f["mime_type"] or "").startswith("image/") for f in attached
+    ):
+        return combined
+
+    content: list[dict[str, Any]] = [{"type": "text", "text": combined}]
+    for f in attached:
+        mt = f["mime_type"] or ""
+        if not mt.startswith("image/"):
+            continue
+        data_url = _read_as_data_url(f["path"], mt)
+        if data_url is not None:
+            content.append({"type": "image_url", "image_url": {"url": data_url}})
+    return content
+
+
+def _sanitize_attachment_name(name: str) -> str:
+    cleaned = _UNSAFE_NAME_RE.sub("_", name) or "file"
+    return cleaned[:100]
+
+
+def _guess_mime(path: str) -> str:
+    mt, _ = mimetypes.guess_type(path)
+    return mt or "application/octet-stream"
+
+
+def stage_outbound_file(src_path: str) -> dict[str, str] | None:
+    """Copy ``src_path`` into ``CHAT_UPLOADS_DIR`` (unless already there)
+    and return ``{name, mime_type, path}`` so the caller can attach it to
+    the A2A reply.
+
+    Files already in the chat-uploads directory are attached as-is;
+    anything elsewhere under /workspace gets a uuid-prefixed copy so
+    basenames can't collide with existing uploads and the original
+    workspace layout stays untouched. Returns None on I/O failure.
+    """
+    try:
+        os.makedirs(CHAT_UPLOADS_DIR, exist_ok=True)
+    except OSError as exc:
+        logger.warning("cannot ensure chat-uploads dir: %s", exc)
+        return None
+    name = os.path.basename(src_path)
+    mime = _guess_mime(src_path)
+    if os.path.dirname(src_path) == CHAT_UPLOADS_DIR:
+        return {"name": name, "mime_type": mime, "path": src_path}
+    try:
+        stored = f"{_uuid.uuid4().hex[:16]}-{_sanitize_attachment_name(name)}"
+        dst = os.path.join(CHAT_UPLOADS_DIR, stored)
+        with open(src_path, "rb") as fin, open(dst, "wb") as fout:
+            fout.write(fin.read())
+    except OSError as exc:
+        logger.warning("failed to stage %s → chat-uploads: %s", src_path, exc)
+        return None
+    return {"name": name, "mime_type": mime, "path": dst}
+
+
+def collect_outbound_files(reply_text: str) -> list[dict[str, str]]:
+    """Detect /workspace/... paths the agent mentioned in its reply and
+    stage each one so it can be returned to the canvas as a file part.
+
+    Each unique, readable file goes through ``stage_outbound_file`` — the
+    download endpoint only serves files from whitelisted directories, so
+    a reply referencing /workspace/private/secret.pem still can't be
+    exfiltrated via the chat download link unless we've explicitly
+    copied it under the chat-uploads dir.
+    """
+    if not reply_text:
+        return []
+    seen: set[str] = set()
+    out: list[dict[str, str]] = []
+    for match in _WORKSPACE_PATH_RE.finditer(reply_text):
+        # Trim trailing sentence punctuation that the character class
+        # greedily swallowed — "wrote /workspace/x.txt." would otherwise
+        # resolve to "x.txt." which doesn't exist.
+        raw = match.group(1).rstrip(".")
+        resolved = resolve_attachment_uri(raw)
+        if not resolved or resolved in seen or not os.path.isfile(resolved):
+            continue
+        seen.add(resolved)
+        staged = stage_outbound_file(resolved)
+        if staged is not None:
+            out.append(staged)
+    return out
diff --git a/workspace/heartbeat.py b/workspace/heartbeat.py
index 1eb5b4fd..c0fc2f1d 100644
--- a/workspace/heartbeat.py
+++ b/workspace/heartbeat.py
@@ -17,7 +17,31 @@ from pathlib import Path
 
 import httpx
 
-from platform_auth import auth_headers, refresh_cache
+from platform_auth import auth_headers, refresh_cache, self_source_headers
+
+
+def _runtime_state_payload() -> dict:
+    """Build the {runtime_state, sample_error} portion of the heartbeat
+    body when the Claude SDK has hit a wedge. Returns an empty dict
+    when the runtime is healthy so the heartbeat payload doesn't grow
+    fields the platform doesn't need.
+
+    Imported lazily so workspaces running non-Claude runtimes (where
+    `claude_sdk_executor` may not be importable at all) keep working —
+    a missing import means "no Claude wedge possible here, healthy."
+    """
+    try:
+        from claude_sdk_executor import is_wedged, wedge_reason
+    except Exception:
+        return {}
+    if not is_wedged():
+        return {}
+    return {
+        "runtime_state": "wedged",
+        # sample_error doubles as the human-readable banner text on the
+        # canvas's degraded card — keep it short and actionable.
+        "sample_error": wedge_reason(),
+    }
 
 logger = logging.getLogger(__name__)
 
@@ -85,16 +109,23 @@ class HeartbeatLoop:
                 while True:
                     # 1. Send heartbeat (Phase 30.1: include auth header if token known)
                     try:
+                        body = {
+                            "workspace_id": self.workspace_id,
+                            "error_rate": self.error_rate,
+                            "sample_error": self.sample_error,
+                            "active_tasks": self.active_tasks,
+                            "current_task": self.current_task,
+                            "uptime_seconds": int(time.time() - self.start_time),
+                        }
+                        # Layer the runtime-wedge fields on top so a
+                        # non-empty sample_error from the wedge wins
+                        # over the (typically empty) heartbeat
+                        # sample_error field. The platform reads
+                        # runtime_state to flip status → degraded.
+                        body.update(_runtime_state_payload())
                         await client.post(
                             f"{self.platform_url}/registry/heartbeat",
-                            json={
-                                "workspace_id": self.workspace_id,
-                                "error_rate": self.error_rate,
-                                "sample_error": self.sample_error,
-                                "active_tasks": self.active_tasks,
-                                "current_task": self.current_task,
-                                "uptime_seconds": int(time.time() - self.start_time),
-                            },
+                            json=body,
                             headers=auth_headers(),
                         )
                         self.error_count = 0
@@ -113,16 +144,18 @@ class HeartbeatLoop:
                             logger.warning("Heartbeat 401 for %s — refreshing token cache and retrying once", self.workspace_id)
                             refresh_cache()
                             try:
+                                retry_body = {
+                                    "workspace_id": self.workspace_id,
+                                    "error_rate": self.error_rate,
+                                    "sample_error": self.sample_error,
+                                    "active_tasks": self.active_tasks,
+                                    "current_task": self.current_task,
+                                    "uptime_seconds": int(time.time() - self.start_time),
+                                }
+                                retry_body.update(_runtime_state_payload())
                                 await client.post(
                                     f"{self.platform_url}/registry/heartbeat",
-                                    json={
-                                        "workspace_id": self.workspace_id,
-                                        "error_rate": self.error_rate,
-                                        "sample_error": self.sample_error,
-                                        "active_tasks": self.active_tasks,
-                                        "current_task": self.current_task,
-                                        "uptime_seconds": int(time.time() - self.start_time),
-                                    },
+                                    json=retry_body,
                                     headers=auth_headers(),
                                 )
                                 self._consecutive_failures = 0
@@ -284,6 +317,9 @@ class HeartbeatLoop:
                 else:
                     self._last_self_message_time = now
                     try:
+                        # self_source_headers() adds X-Workspace-ID so the
+                        # platform tags this row source=agent, not canvas
+                        # — see platform_auth.py for the full rationale.
                         await client.post(
                             f"{self.platform_url}/workspaces/{self.workspace_id}/a2a",
                             json={
@@ -295,7 +331,7 @@ class HeartbeatLoop:
                                     },
                                 },
                             },
-                            headers=auth_headers(),
+                            headers=self_source_headers(self.workspace_id),
                             timeout=120.0,
                         )
                         logger.info("Heartbeat: self-message sent to process delegation results")
diff --git a/workspace/main.py b/workspace/main.py
index 1cdd64aa..ab0b06f5 100644
--- a/workspace/main.py
+++ b/workspace/main.py
@@ -33,7 +33,7 @@ from initial_prompt import (
     mark_initial_prompt_attempted,
     resolve_initial_prompt_marker,
 )
-from platform_auth import auth_headers
+from platform_auth import auth_headers, self_source_headers
 
 
 def get_machine_ip() -> str:  # pragma: no cover
@@ -69,6 +69,15 @@ async def main():  # pragma: no cover
     # 0. Initialise OpenTelemetry (no-op if packages not installed)
     setup_telemetry(service_name=workspace_id)
 
+    # 0a. Fix /workspace perms before any agent code runs. Docker ships
+    # named volumes as root:root 755 — without this the non-root agent
+    # user can't write files the user asked it to produce, and the
+    # "agent → file → user downloads" flow dead-ends at a bash "permission
+    # denied". Best-effort: no-ops silently if molecule-runtime itself
+    # isn't root (template's own start.sh should have handled it there).
+    from executor_helpers import ensure_workspace_writable
+    ensure_workspace_writable()
+
     # 1. Load config
     config = load_config(config_path)
     port = config.a2a.port
@@ -430,7 +439,15 @@ async def main():  # pragma: no cover
                 # silently rejected once any workspace has a live token on
                 # file. Without this, initial_prompt 401s in multi-tenant
                 # mode exactly like /registry/register did in #215.
-                headers = {"Content-Type": "application/json", **auth_headers()}
+                # X-Workspace-ID via self_source_headers() so the platform
+                # tags the row source=agent — without it the canvas's
+                # My Chat tab renders the initial_prompt as if the user
+                # had typed it. See platform_auth.py for the full
+                # explanation.
+                headers = {
+                    "Content-Type": "application/json",
+                    **self_source_headers(workspace_id),
+                }
 
                 # Retry with backoff — the platform proxy may not be able to
                 # reach us yet (container networking takes a moment to settle).
@@ -522,7 +539,13 @@ async def main():  # pragma: no cover
                     # actual outcome instead of a bare "post failed" line.
                     # #220: include auth_headers() on every idle fire. Without
                     # this, the idle loop 401s in multi-tenant mode.
-                    headers = {"Content-Type": "application/json", **auth_headers()}
+                    # self_source_headers() adds X-Workspace-ID so the
+                    # platform classifies the idle fire as source=agent
+                    # rather than user-typed canvas input.
+                    headers = {
+                        "Content-Type": "application/json",
+                        **self_source_headers(workspace_id),
+                    }
                     try:
                         req = _urlreq.Request(
                             f"{platform_url}/workspaces/{workspace_id}/a2a",
diff --git a/workspace/platform_auth.py b/workspace/platform_auth.py
index 39a17075..f767381d 100644
--- a/workspace/platform_auth.py
+++ b/workspace/platform_auth.py
@@ -98,6 +98,26 @@ def auth_headers() -> dict[str, str]:
     return {"Authorization": f"Bearer {tok}"}
 
 
+def self_source_headers(workspace_id: str) -> dict[str, str]:
+    """Return auth headers PLUS X-Workspace-ID identifying this workspace
+    as the source of the request.
+
+    Use this for any POST the workspace's own runtime fires against the
+    platform's A2A endpoints — heartbeat self-messages, initial_prompt,
+    idle-loop fires, peer-to-peer A2A from runtime tools. Without the
+    X-Workspace-ID header the platform's a2a_receive logger writes
+    source_id=NULL, which the canvas's My Chat tab interprets as a
+    user-typed message and renders the internal prompt to the user.
+    See workspace-server/internal/handlers/a2a_proxy.go:184 for the
+    server-side classification rule.
+
+    Centralised here so adding a new system header (e.g. a per-fire
+    correlation ID) only touches one place — and so that any
+    workspace→A2A POST that doesn't use this helper stands out in
+    review as a probable bug."""
+    return {**auth_headers(), "X-Workspace-ID": workspace_id}
+
+
 def clear_cache() -> None:
     """Reset the in-memory cache. Used by tests that write fresh token
     files between cases."""
diff --git a/workspace/scripts/molecule-git-token-helper.sh b/workspace/scripts/molecule-git-token-helper.sh
index 0faab0fc..125d5109 100755
--- a/workspace/scripts/molecule-git-token-helper.sh
+++ b/workspace/scripts/molecule-git-token-helper.sh
@@ -110,18 +110,45 @@ _read_cache() {
 }
 
 # _write_cache — atomically persist token + expiry.
+#
+# Hardened per #1552:
+#  - umask 077 around the writes so .tmp files are 600 from creation,
+#    closing the TOCTOU window where a concurrent reader could read
+#    the token while it was still mode 644 (between the create-with-
+#    default-umask and the later chmod 600).
+#  - Don't swallow chmod errors with `|| true`. A chmod failure leaves
+#    tokens potentially world-readable; surface it as a WARN line so
+#    ops can grep `[molecule-git-token-helper] WARN` and see real
+#    permission failures instead of silent 644 files.
 _write_cache() {
     local token="$1"
     mkdir -p "${CACHE_DIR}"
-    chmod 700 "${CACHE_DIR}" 2>/dev/null || true
+    if ! chmod 700 "${CACHE_DIR}" 2>/dev/null; then
+        echo "[molecule-git-token-helper] WARN: failed to chmod 700 ${CACHE_DIR} — cache dir may be world-readable" >&2
+    fi
     now=$(_now_epoch)
     expiry=$((now + TOKEN_CACHE_TTL_SEC))
+
+    # Restrictive umask so the .tmp files are 600 from creation. Restored
+    # before return so callers' umask isn't perturbed.
+    local prev_umask
+    prev_umask=$(umask)
+    umask 077
+
     # Write atomically via tmp + mv to avoid partial reads.
     printf '%s' "${token}" > "${CACHE_TOKEN_FILE}.tmp"
     printf '%s' "${expiry}" > "${CACHE_EXPIRY_FILE}.tmp"
     mv -f "${CACHE_TOKEN_FILE}.tmp" "${CACHE_TOKEN_FILE}"
     mv -f "${CACHE_EXPIRY_FILE}.tmp" "${CACHE_EXPIRY_FILE}"
-    chmod 600 "${CACHE_TOKEN_FILE}" "${CACHE_EXPIRY_FILE}" 2>/dev/null || true
+
+    umask "${prev_umask}"
+
+    # Belt-and-suspenders chmod — umask 077 should make the files 600
+    # already, but a chmod that fails on the post-rename file is itself
+    # a real signal worth surfacing.
+    if ! chmod 600 "${CACHE_TOKEN_FILE}" "${CACHE_EXPIRY_FILE}" 2>/dev/null; then
+        echo "[molecule-git-token-helper] WARN: chmod 600 failed on cache files — token may be world-readable" >&2
+    fi
 }
 
 # _fetch_token_from_api — hit the platform endpoint.
@@ -230,10 +257,21 @@ case "${ACTION}" in
             echo "[molecule-git-token-helper] _refresh_gh: gh auth login failed (non-fatal)" >&2
         }
         # Also update GH_TOKEN file for scripts that source it.
+        # Same #1552 hardening as _write_cache — umask 077 around the
+        # write so the .tmp file is 600 from creation, and surface a
+        # WARN on chmod failure instead of swallowing it.
         gh_token_file="${HOME}/.gh_token"
+        # `local` is illegal here (top-level case branch, not a
+        # function); shadow with a uniquely-named global instead.
+        _gh_prev_umask=$(umask)
+        umask 077
         printf '%s' "${api_token}" > "${gh_token_file}.tmp"
         mv -f "${gh_token_file}.tmp" "${gh_token_file}"
-        chmod 600 "${gh_token_file}" 2>/dev/null || true
+        umask "${_gh_prev_umask}"
+        unset _gh_prev_umask
+        if ! chmod 600 "${gh_token_file}" 2>/dev/null; then
+            echo "[molecule-git-token-helper] WARN: chmod 600 failed on ${gh_token_file} — token may be world-readable" >&2
+        fi
         echo "[molecule-git-token-helper] _refresh_gh: token refreshed successfully" >&2
         ;;
     _invalidate_cache)
diff --git a/workspace/tests/test_a2a_client.py b/workspace/tests/test_a2a_client.py
index fbae0ea0..fd813f3e 100644
--- a/workspace/tests/test_a2a_client.py
+++ b/workspace/tests/test_a2a_client.py
@@ -199,10 +199,34 @@ class TestSendA2AMessage:
             result = await a2a_client.send_a2a_message("http://target/a2a", "task")
 
         assert result.startswith(a2a_client._A2A_ERROR_PREFIX)
-        assert "unknown" in result
+        # The error includes the JSON-RPC code so the operator can look it
+        # up; "no message" surfaces the missing-message condition explicitly
+        # instead of the previous opaque "unknown".
+        assert "code=-32600" in result
+        assert "no message" in result.lower()
+        # Target URL is included so chained delegations are traceable.
+        assert "target=http://target/a2a" in result
 
-    async def test_neither_result_nor_error_returns_str_of_data(self):
-        """Response with neither 'result' nor 'error' → str(data)."""
+    async def test_jsonrpc_error_with_code_zero_includes_code_in_detail(self):
+        """JSON-RPC error code=0 is technically not valid in the spec,
+        but a malformed peer can still send it — make sure the code is
+        preserved in the detail rather than collapsing into the
+        no-code path. Locks in the `code is not None` semantics over
+        the truthy-check shortcut."""
+        import a2a_client
+
+        resp = _make_response(200, {"error": {"code": 0, "message": "weird"}})
+        mock_client = _make_mock_client(post_resp=resp)
+
+        with patch("a2a_client.httpx.AsyncClient", return_value=mock_client):
+            result = await a2a_client.send_a2a_message("http://target/a2a", "task")
+
+        assert result.startswith(a2a_client._A2A_ERROR_PREFIX)
+        assert "code=0" in result
+        assert "weird" in result
+
+    async def test_neither_result_nor_error_returns_a2a_error_with_payload(self):
+        """Response with neither 'result' nor 'error' → A2A_ERROR + payload context."""
         import a2a_client
 
         payload = {"jsonrpc": "2.0", "id": "abc123"}
@@ -212,7 +236,14 @@ class TestSendA2AMessage:
         with patch("a2a_client.httpx.AsyncClient", return_value=mock_client):
             result = await a2a_client.send_a2a_message("http://target/a2a", "task")
 
-        assert result == str(payload)
+        # Pre-fix this returned bare str(payload) which the canvas
+        # rendered as a confusing "looks like a successful response"
+        # block. Now it's tagged so downstream UI / delegate_task
+        # routes it through the error path.
+        assert result.startswith(a2a_client._A2A_ERROR_PREFIX)
+        assert "unexpected response shape" in result
+        assert "abc123" in result  # snippet of payload included for context
+        assert "target=http://target/a2a" in result
 
     async def test_exception_returns_error_prefix_and_message(self):
         """Network exception → returns _A2A_ERROR_PREFIX + exception text."""
@@ -225,6 +256,39 @@ class TestSendA2AMessage:
 
         assert result.startswith(a2a_client._A2A_ERROR_PREFIX)
         assert "connection refused" in result
+        # Exception class name is prepended when the message doesn't
+        # already include it — gives the operator a typed handle to
+        # search for in container logs.
+        assert "ConnectionError" in result
+        assert "target=http://target/a2a" in result
+
+    async def test_empty_stringifying_exception_falls_back_to_class_name(self):
+        """The user's reported bug: httpx.RemoteProtocolError and similar
+        exceptions can stringify to "" — pre-fix the canvas rendered
+        "[A2A_ERROR] " with no detail. Verify the empty path now
+        produces an actionable message including the exception type
+        and the target URL."""
+        import a2a_client
+
+        # Subclass Exception with __str__ → "" to simulate the
+        # silent-exception variants without depending on a specific
+        # httpx version's behavior.
+        class _SilentRemoteProtocolError(Exception):
+            def __str__(self) -> str:
+                return ""
+
+        mock_client = _make_mock_client(post_exc=_SilentRemoteProtocolError())
+
+        with patch("a2a_client.httpx.AsyncClient", return_value=mock_client):
+            result = await a2a_client.send_a2a_message("http://target/a2a", "task")
+
+        # Must NOT be just the bare prefix — that's the regression.
+        assert result != a2a_client._A2A_ERROR_PREFIX.strip()
+        assert result != f"{a2a_client._A2A_ERROR_PREFIX}"
+        # Must include the class name + something explanatory.
+        assert "_SilentRemoteProtocolError" in result
+        assert "no message" in result.lower()
+        assert "target=http://target/a2a" in result
 
     async def test_result_text_part_missing_text_key_returns_empty(self):
         """Part dict without 'text' key → falls back to '' (empty string returned)."""
diff --git a/workspace/tests/test_a2a_tools_module.py b/workspace/tests/test_a2a_tools_module.py
index 3f70b631..1a058326 100644
--- a/workspace/tests/test_a2a_tools_module.py
+++ b/workspace/tests/test_a2a_tools_module.py
@@ -114,11 +114,11 @@ class TestDelegateTask:
             async def __aexit__(self, *a): pass
 
             async def get(self, url, headers=None):
-                calls.append(("get", url))
+                calls.append(("get", url, headers))
                 return _FakeResponse(200, {"url": "http://target.test/a2a"})
 
-            async def post(self, url, json=None):
-                calls.append(("post", url))
+            async def post(self, url, json=None, headers=None):
+                calls.append(("post", url, headers))
                 return _FakeResponse(200, {
                     "result": {
                         "parts": [{"kind": "text", "text": "Task done!"}]
@@ -130,7 +130,17 @@ class TestDelegateTask:
         result = await mod.delegate_task("ws-target", "do something")
         assert result == "Task done!"
         assert any(c[0] == "get" for c in calls)
-        assert any(c[0] == "post" for c in calls)
+        post_calls = [c for c in calls if c[0] == "post"]
+        assert post_calls, "delegate_task must POST to the target's /a2a endpoint"
+        # Regression: peer A2A POSTs MUST include X-Workspace-ID so
+        # the platform's a2a_receive logger writes source_id correctly
+        # — without it the recipient's My Chat tab would render the
+        # delegation as user-typed input. Same hazard fixed in
+        # heartbeat.py / a2a_client.py / main.py initial+idle flows.
+        post_headers = post_calls[0][2] or {}
+        assert post_headers.get("X-Workspace-ID"), (
+            f"delegate_task POST must include X-Workspace-ID; got headers={post_headers!r}"
+        )
 
     async def test_delegate_task_success_empty_parts(self, monkeypatch):
         """Result with empty parts list falls back to str(result)."""
@@ -144,7 +154,7 @@ class TestDelegateTask:
             async def get(self, url, headers=None):
                 return _FakeResponse(200, {"url": "http://target.test/a2a"})
 
-            async def post(self, url, json=None):
+            async def post(self, url, json=None, headers=None):
                 return _FakeResponse(200, {"result": {"parts": []}})
 
         monkeypatch.setattr(mod.httpx, "AsyncClient", FakeClient)
@@ -217,7 +227,7 @@ class TestDelegateTask:
             async def get(self, url, headers=None):
                 return _FakeResponse(200, {"url": "http://target.test/a2a"})
 
-            async def post(self, url, json=None):
+            async def post(self, url, json=None, headers=None):
                 return _FakeResponse(200, {
                     "error": {"code": -32603, "message": "Internal error"}
                 })
@@ -240,7 +250,7 @@ class TestDelegateTask:
             async def get(self, url, headers=None):
                 return _FakeResponse(200, {"url": "http://target.test/a2a"})
 
-            async def post(self, url, json=None):
+            async def post(self, url, json=None, headers=None):
                 return _FakeResponse(200, {"jsonrpc": "2.0", "id": "123"})
 
         monkeypatch.setattr(mod.httpx, "AsyncClient", FakeClient)
@@ -262,7 +272,7 @@ class TestDelegateTask:
             async def get(self, url, headers=None):
                 return _FakeResponse(200, {"url": "http://target.test/a2a"})
 
-            async def post(self, url, json=None):
+            async def post(self, url, json=None, headers=None):
                 call_count["n"] += 1
                 raise ConnectionError("target down")
 
diff --git a/workspace/tests/test_claude_sdk_executor.py b/workspace/tests/test_claude_sdk_executor.py
index ac3b0c3d..7122fe98 100644
--- a/workspace/tests/test_claude_sdk_executor.py
+++ b/workspace/tests/test_claude_sdk_executor.py
@@ -21,7 +21,25 @@ _FakeTextBlock = _sdk_stub.TextBlock
 _FakeAssistantMessage = _sdk_stub.AssistantMessage
 _FakeResultMessage = _sdk_stub.ResultMessage
 
-from claude_sdk_executor import ClaudeSDKExecutor, QueryResult  # noqa: E402
+from claude_sdk_executor import (  # noqa: E402
+    ClaudeSDKExecutor,
+    QueryResult,
+    _mark_sdk_wedged,
+    _reset_sdk_wedge_for_test,
+    is_wedged,
+    wedge_reason,
+)
+
+# Module alias used by the wedge tests below — they read
+# `_executor_mod.<helper>` to make the module-state vs function-state
+# distinction explicit at the call site, separate from the names
+# imported above. Hoisted to the top-of-file imports because the late
+# binding (originally at line ~1248) was invisible to @pytest.mark.asyncio
+# wrappers under coverage instrumentation (--cov, added by #1817):
+# sys.settrace + the asyncio wrapper combination caused a
+# `NameError: name '_executor_mod' is not defined` on every async wedge
+# test. Hoisting the alias fixes that scope-resolution issue.
+import claude_sdk_executor as _executor_mod  # noqa: E402
 
 
 # ---------- Helpers ----------
@@ -1221,3 +1239,170 @@ def test_load_config_dict_empty_file_returns_empty(tmp_path):
     e = ClaudeSDKExecutor(system_prompt=None, config_path=str(tmp_path), heartbeat=None)
     result = e._load_config_dict()
     assert result == {}
+
+
+# ==================== SDK wedge detector ====================
+#
+# Exercises the module-level _sdk_wedged_reason flag set when the
+# claude_agent_sdk init handshake times out. The flag is sticky — the
+# heartbeat task reads it via is_wedged() / wedge_reason() and reports
+# runtime_state="wedged" so the platform flips status → degraded.
+
+
+
+def test_wedge_helpers_default_clean():
+    """Fresh module: no wedge."""
+    _reset_sdk_wedge_for_test()
+    assert is_wedged() is False
+    assert wedge_reason() == ""
+
+
+def test_mark_sdk_wedged_sets_flag_and_reason():
+    """First mark wins and sets both is_wedged() and the reason text."""
+    _reset_sdk_wedge_for_test()
+    _mark_sdk_wedged("init timeout — restart")
+    try:
+        assert is_wedged() is True
+        assert "init timeout" in wedge_reason()
+    finally:
+        _reset_sdk_wedge_for_test()
+
+
+def test_mark_sdk_wedged_sticky_first_wins():
+    """A second wedge call with a different reason does NOT overwrite
+    the first. The first cause is the one the user needs to see; later
+    knock-on errors from the same wedge would otherwise mask it."""
+    _reset_sdk_wedge_for_test()
+    _mark_sdk_wedged("first cause — Control request timeout")
+    _mark_sdk_wedged("noise from a downstream symptom")
+    try:
+        assert wedge_reason() == "first cause — Control request timeout"
+    finally:
+        _reset_sdk_wedge_for_test()
+
+
+@pytest.mark.asyncio
+async def test_execute_marks_wedge_on_control_request_timeout():
+    """End-to-end: when _run_query raises an exception whose formatted
+    error contains 'Control request timeout' (case-insensitive), the
+    executor's catch block flags the SDK as wedged. Subsequent
+    is_wedged() reads return True until process restart (or the
+    test-only reset)."""
+    _executor_mod._reset_sdk_wedge_for_test()
+    e = _make_executor()
+    ctx = _make_context(["test prompt"])
+    eq = _make_event_queue()
+
+    async def boom(prompt, options):
+        # Match the literal exception claude_agent_sdk raises in the
+        # observed wedge path.
+        raise Exception("Control request timeout: initialize")
+        yield  # pragma: no cover — make this an async generator
+
+    with patch("claude_sdk_executor.recall_memories", new=AsyncMock(return_value="")), \
+         patch("claude_sdk_executor.read_delegation_results", return_value=""), \
+         patch("claude_sdk_executor.commit_memory", new=AsyncMock()), \
+         patch("claude_sdk_executor.set_current_task", new=AsyncMock()), \
+         patch("claude_agent_sdk.query", new=boom):
+        try:
+            await e.execute(ctx, eq)
+            assert _executor_mod.is_wedged() is True, "wedge flag must be set"
+            assert "Control request timeout" in _executor_mod.wedge_reason()
+        finally:
+            _executor_mod._reset_sdk_wedge_for_test()
+
+
+@pytest.mark.asyncio
+async def test_execute_does_not_mark_wedge_on_unrelated_error():
+    """Sanity: a generic non-wedge exception (e.g. ValueError) MUST
+    NOT trigger the wedge flag. False-positives lock the workspace
+    into degraded for the whole process lifetime."""
+    _executor_mod._reset_sdk_wedge_for_test()
+    e = _make_executor()
+    ctx = _make_context(["test prompt"])
+    eq = _make_event_queue()
+
+    async def boom(prompt, options):
+        raise ValueError("ordinary tool failure, not a wedge")
+        yield  # pragma: no cover
+
+    with patch("claude_sdk_executor.recall_memories", new=AsyncMock(return_value="")), \
+         patch("claude_sdk_executor.read_delegation_results", return_value=""), \
+         patch("claude_sdk_executor.commit_memory", new=AsyncMock()), \
+         patch("claude_sdk_executor.set_current_task", new=AsyncMock()), \
+         patch("claude_agent_sdk.query", new=boom):
+        try:
+            await e.execute(ctx, eq)
+            assert _executor_mod.is_wedged() is False, "non-wedge error must not flip the flag"
+        finally:
+            _executor_mod._reset_sdk_wedge_for_test()
+
+
+@pytest.mark.asyncio
+async def test_execute_clears_wedge_on_successful_query():
+    """Auto-recovery: a process that previously hit a wedge should be
+    able to recover when the SDK starts working again. _run_query
+    calls _clear_sdk_wedge_on_success at the end of a clean
+    completion; the flag flips back to None and the next heartbeat
+    reports runtime_state empty so the platform recovers status →
+    online without forcing the user to restart the workspace."""
+    # Pre-set the wedge as if a prior call had tripped it.
+    _executor_mod._reset_sdk_wedge_for_test()
+    _executor_mod._mark_sdk_wedged("transient: Control request timeout: initialize")
+    assert _executor_mod.is_wedged() is True
+
+    e = _make_executor()
+    ctx = _make_context(["test prompt"])
+    eq = _make_event_queue()
+
+    async def good_query(prompt, options):
+        # Working SDK — yield one normal assistant message + result.
+        yield _FakeAssistantMessage([_FakeTextBlock("hello back")])
+        yield _FakeResultMessage(session_id="recovered-sess")
+
+    with patch("claude_sdk_executor.recall_memories", new=AsyncMock(return_value="")), \
+         patch("claude_sdk_executor.read_delegation_results", return_value=""), \
+         patch("claude_sdk_executor.commit_memory", new=AsyncMock()), \
+         patch("claude_sdk_executor.set_current_task", new=AsyncMock()), \
+         patch("claude_agent_sdk.query", new=good_query):
+        try:
+            await e.execute(ctx, eq)
+            assert _executor_mod.is_wedged() is False, "wedge flag must clear after a successful query"
+            assert _executor_mod.wedge_reason() == ""
+        finally:
+            _executor_mod._reset_sdk_wedge_for_test()
+
+
+@pytest.mark.asyncio
+async def test_execute_does_not_clear_wedge_on_empty_stream():
+    """Regression for the gate added in 3c4eef49: a stream that
+    iterates without raising but emits NEITHER an AssistantMessage
+    NOR a ResultMessage (degenerate or stub-driven shape) must NOT
+    clear the wedge flag. A real successful query yields at least
+    one of those; treating an empty stream as "recovered" would
+    falsely flip the workspace back to online without any evidence
+    the SDK is actually working."""
+    _executor_mod._reset_sdk_wedge_for_test()
+    _executor_mod._mark_sdk_wedged("pre-existing wedge — must not clear on empty stream")
+    assert _executor_mod.is_wedged() is True
+
+    e = _make_executor()
+    ctx = _make_context(["test prompt"])
+    eq = _make_event_queue()
+
+    async def empty_query(prompt, options):
+        # Iterator returns without yielding — the degenerate case.
+        if False:
+            yield  # pragma: no cover
+
+    with patch("claude_sdk_executor.recall_memories", new=AsyncMock(return_value="")), \
+         patch("claude_sdk_executor.read_delegation_results", return_value=""), \
+         patch("claude_sdk_executor.commit_memory", new=AsyncMock()), \
+         patch("claude_sdk_executor.set_current_task", new=AsyncMock()), \
+         patch("claude_agent_sdk.query", new=empty_query):
+        try:
+            await e.execute(ctx, eq)
+            assert _executor_mod.is_wedged() is True, \
+                "wedge must persist when the stream emitted no content"
+        finally:
+            _executor_mod._reset_sdk_wedge_for_test()
diff --git a/workspace/tests/test_executor_helpers.py b/workspace/tests/test_executor_helpers.py
index 017aaf60..557da50c 100644
--- a/workspace/tests/test_executor_helpers.py
+++ b/workspace/tests/test_executor_helpers.py
@@ -654,3 +654,255 @@ def test_classify_subprocess_error_generic_fallback():
     assert classify_subprocess_error("generic unknown failure", None) == "subprocess_error"
     # exit_code=0 with no keyword match also lands here
     assert classify_subprocess_error("mysterious but zero exit", 0) == "subprocess_error"
+
+
+# ============================================================================
+# Chat attachment helpers (drag-drop file + agent-returned file)
+# ============================================================================
+
+
+def test_resolve_attachment_uri_all_schemes(tmp_path, monkeypatch):
+    """All three canvas-issued URI shapes resolve to the same container path.
+
+    The canvas mints ``workspace:`` but the download endpoint used to accept
+    ``file:///`` and bare ``/workspace/…`` for legacy agents — the helper has
+    to handle all three so agents don't have to normalize before calling us.
+    """
+    from executor_helpers import resolve_attachment_uri, WORKSPACE_MOUNT
+
+    # Use a real path that starts with WORKSPACE_MOUNT. resolve() enforces
+    # the containment check — anything outside /workspace/ must return None.
+    ws_path = f"{WORKSPACE_MOUNT}/foo.txt"
+    assert resolve_attachment_uri(f"workspace:{ws_path}") == ws_path
+    assert resolve_attachment_uri(f"file://{ws_path}") == ws_path
+    assert resolve_attachment_uri(ws_path) == ws_path
+
+    # Out-of-tree is refused even when the raw path shape looks right.
+    # CWE-22 regression: a crafted "workspace:/workspace/../etc/passwd"
+    # must NOT return "/etc/passwd" just because resolve() normalizes it.
+    assert resolve_attachment_uri("/etc/passwd") is None
+    assert resolve_attachment_uri("workspace:/workspace/../etc/passwd") is None
+    assert resolve_attachment_uri("") is None
+    assert resolve_attachment_uri("https://example.com/x") is None
+
+
+def test_extract_attached_files_skips_unresolvable():
+    """Files with URIs that don't resolve to an existing file are dropped.
+
+    A crafted A2A message can include any uri it wants; we must not hand
+    non-existent or out-of-tree paths to downstream code as if they were
+    real attachments.
+    """
+    from types import SimpleNamespace
+    from executor_helpers import extract_attached_files
+
+    msg = SimpleNamespace(parts=[
+        SimpleNamespace(kind="file", file=SimpleNamespace(
+            uri="workspace:/etc/passwd", name="x", mimeType="text/plain"
+        )),
+        SimpleNamespace(root=SimpleNamespace(kind="file", file=SimpleNamespace(
+            uri="/workspace/does-not-exist", name="y", mimeType="text/plain"
+        ))),
+        SimpleNamespace(kind="text", text="ignored"),
+    ])
+    assert extract_attached_files(msg) == []
+
+
+def test_extract_attached_files_accepts_both_shapes(tmp_path, monkeypatch):
+    """a2a-sdk emits ``part.root.file`` via RootModel; some callers still
+    build ``part.file`` directly. Both shapes have to yield the same
+    dict structure — runtimes can pick either without surprise."""
+    from types import SimpleNamespace
+    from executor_helpers import extract_attached_files
+
+    # Stage two real files under a fake /workspace for the resolver
+    real_a = tmp_path / "a.txt"
+    real_b = tmp_path / "b.txt"
+    real_a.write_text("A")
+    real_b.write_text("B")
+    # Point the helper's containment check at tmp_path instead of /workspace
+    monkeypatch.setattr("executor_helpers.WORKSPACE_MOUNT", str(tmp_path))
+
+    msg = SimpleNamespace(parts=[
+        SimpleNamespace(kind="file", file=SimpleNamespace(
+            uri=f"workspace:{real_a}", name="a.txt", mimeType="text/plain"
+        )),
+        SimpleNamespace(root=SimpleNamespace(kind="file", file=SimpleNamespace(
+            uri=f"workspace:{real_b}", name="b.txt", mimeType="text/plain"
+        ))),
+    ])
+    out = extract_attached_files(msg)
+    assert len(out) == 2
+    assert {f["name"] for f in out} == {"a.txt", "b.txt"}
+
+
+def test_build_user_content_with_files_no_attachments_is_string():
+    """Zero attachments → plain string so models without multi-modal
+    support (most non-vision LLMs) see the same payload shape they always
+    did. Regressing this would break every runtime that assumed
+    content is a string."""
+    from executor_helpers import build_user_content_with_files
+
+    out = build_user_content_with_files("hello", [])
+    assert out == "hello"
+
+
+def test_build_user_content_with_files_non_image_is_string_with_manifest():
+    """Non-image attachments append a manifest line so the agent knows the
+    filename and absolute path. Without this the agent had no signal that
+    anything was attached — see canvas/src/components/tabs/ChatTab.tsx
+    and the "I'm not sure what you're referring to" user report."""
+    from executor_helpers import build_user_content_with_files
+
+    content = build_user_content_with_files("read this", [
+        {"name": "app.log", "mime_type": "text/plain", "path": "/workspace/app.log"},
+    ])
+    assert isinstance(content, str)
+    assert "app.log" in content and "/workspace/app.log" in content
+    assert "read this" in content
+
+
+def test_build_user_content_with_files_image_is_multimodal(tmp_path):
+    """Image attachments yield the OpenAI-compat list-of-parts shape so
+    vision models see the bytes. Data URL check covers the common
+    regression where an empty/missing file silently drops the image part."""
+    from executor_helpers import build_user_content_with_files
+
+    # Minimal 1x1 PNG
+    png = tmp_path / "x.png"
+    png.write_bytes(bytes.fromhex(
+        "89504e470d0a1a0a0000000d49484452000000010000000108060000001f"
+        "15c4890000000a49444154789c6300010000000500010d0a2db40000000049454e44ae426082"
+    ))
+    content = build_user_content_with_files("describe", [
+        {"name": "x.png", "mime_type": "image/png", "path": str(png)},
+    ])
+    assert isinstance(content, list)
+    assert len(content) == 2
+    assert content[0]["type"] == "text"
+    assert content[1]["type"] == "image_url"
+    assert content[1]["image_url"]["url"].startswith("data:image/png;base64,")
+
+
+def test_build_user_content_with_files_large_image_skipped(tmp_path, monkeypatch):
+    """Images over the inline cap don't break the request — the manifest
+    still carries the path so the agent can read via its file_read tool
+    without blowing past provider context limits with a 50MB base64 blob."""
+    from executor_helpers import build_user_content_with_files
+    monkeypatch.setattr("executor_helpers.MAX_INLINE_ATTACHMENT_BYTES", 10)
+
+    big = tmp_path / "big.png"
+    big.write_bytes(b"x" * 100)
+    content = build_user_content_with_files("describe", [
+        {"name": "big.png", "mime_type": "image/png", "path": str(big)},
+    ])
+    # Image too large → no image_url entry, but the text manifest still mentions it
+    assert isinstance(content, list)
+    # Only the text part — the image_url was skipped
+    assert all(c["type"] == "text" for c in content)
+
+
+def test_collect_outbound_files_stages_workspace_paths(tmp_path, monkeypatch):
+    """Agent reply mentioning a /workspace/… path → each unique existing
+    file becomes an attachment, staged under chat-uploads. A crafted
+    reply referencing /etc/passwd must NOT escape."""
+    from pathlib import Path as _Path
+    from executor_helpers import collect_outbound_files
+
+    # Point the chat-uploads dir and the workspace root at a sandboxed tmp.
+    # resolve() normalizes macOS /var → /private/var so the helper's
+    # containment check (which also resolve()s) sees identical prefixes.
+    ws_root = _Path(str(tmp_path / "workspace"))
+    ws_root.mkdir()
+    ws_root = ws_root.resolve()
+    uploads = ws_root / ".molecule" / "chat-uploads"
+    uploads.mkdir(parents=True)
+    monkeypatch.setattr("executor_helpers.WORKSPACE_MOUNT", str(ws_root))
+    monkeypatch.setattr("executor_helpers.CHAT_UPLOADS_DIR", str(uploads))
+    # Rebuild the regex against the overridden mount (module caches it)
+    import re as _re
+    monkeypatch.setattr(
+        "executor_helpers._WORKSPACE_PATH_RE",
+        _re.compile(rf"(?:^|[\s`(\[])({ws_root}/[A-Za-z0-9_./\-]+)"),
+    )
+
+    # A real file inside the fake workspace
+    report = ws_root / "report.txt"
+    report.write_text("data")
+    # A decoy outside the workspace — must be ignored even if mentioned
+    (tmp_path / "secret.txt").write_text("leaked")
+
+    reply = f"Saved to {report} — also see {tmp_path}/secret.txt for extras."
+    out = collect_outbound_files(reply)
+    assert len(out) == 1
+    assert out[0]["name"] == "report.txt"
+    # Staged copy lives under chat-uploads (the download endpoint's whitelist)
+    assert out[0]["path"].startswith(str(uploads))
+
+
+def test_ensure_workspace_writable_chmods_777(tmp_path, monkeypatch):
+    """The platform-level hook opens /workspace + chat-uploads to 777 so
+    agents running as any non-root user can write files the user will
+    then download. This is the single point of fix for what used to need
+    a chmod in every template's Dockerfile."""
+    import stat
+    from executor_helpers import ensure_workspace_writable
+
+    ws = tmp_path / "workspace"
+    ws.mkdir(mode=0o755)
+    uploads = ws / ".molecule" / "chat-uploads"
+    # Don't pre-create uploads — the helper must makedirs it.
+    monkeypatch.setattr("executor_helpers.WORKSPACE_MOUNT", str(ws))
+    monkeypatch.setattr("executor_helpers.CHAT_UPLOADS_DIR", str(uploads))
+
+    ensure_workspace_writable()
+
+    assert uploads.is_dir(), "chat-uploads dir should be created"
+    assert stat.S_IMODE(ws.stat().st_mode) == 0o777
+    assert stat.S_IMODE(uploads.stat().st_mode) == 0o777
+
+
+def test_ensure_workspace_writable_tolerates_non_root(tmp_path, monkeypatch, caplog):
+    """When molecule-runtime isn't root (rare CP configurations), the
+    chmod silently no-ops rather than crashing boot — a misconfigured
+    perm is recoverable; a SystemExit here would wedge the workspace
+    in provisioning forever."""
+    import logging
+    from executor_helpers import ensure_workspace_writable
+
+    ws = tmp_path / "workspace"
+    ws.mkdir()
+    monkeypatch.setattr("executor_helpers.WORKSPACE_MOUNT", str(ws))
+    monkeypatch.setattr("executor_helpers.CHAT_UPLOADS_DIR", str(ws / "x"))
+
+    def _boom(*_a, **_kw):
+        raise PermissionError("Operation not permitted")
+
+    monkeypatch.setattr("executor_helpers.os.chmod", _boom)
+    with caplog.at_level(logging.INFO, logger="executor_helpers"):
+        ensure_workspace_writable()  # must not raise
+
+
+def test_collect_outbound_files_deduplicates(tmp_path, monkeypatch):
+    """Reply mentioning the same path twice should only attach once."""
+    from pathlib import Path as _Path
+    from executor_helpers import collect_outbound_files
+
+    ws_root = _Path(str(tmp_path / "workspace"))
+    ws_root.mkdir()
+    ws_root = ws_root.resolve()
+    uploads = ws_root / ".molecule" / "chat-uploads"
+    uploads.mkdir(parents=True)
+    monkeypatch.setattr("executor_helpers.WORKSPACE_MOUNT", str(ws_root))
+    monkeypatch.setattr("executor_helpers.CHAT_UPLOADS_DIR", str(uploads))
+    import re as _re
+    monkeypatch.setattr(
+        "executor_helpers._WORKSPACE_PATH_RE",
+        _re.compile(rf"(?:^|[\s`(\[])({ws_root}/[A-Za-z0-9_./\-]+)"),
+    )
+
+    report = ws_root / "report.txt"
+    report.write_text("data")
+    reply = f"Wrote {report}. Again at {report}."
+    out = collect_outbound_files(reply)
+    assert len(out) == 1
diff --git a/workspace/tests/test_heartbeat.py b/workspace/tests/test_heartbeat.py
index 09e971e8..b4b51b3c 100644
--- a/workspace/tests/test_heartbeat.py
+++ b/workspace/tests/test_heartbeat.py
@@ -269,6 +269,19 @@ async def test_check_delegations_sends_self_message(tmp_path):
     a2a_call = post_calls[0]
     assert "/a2a" in str(a2a_call)
 
+    # Regression: the self-message MUST include X-Workspace-ID set to
+    # the workspace's own id, so the platform's a2a_receive logger
+    # records source_id = workspace_id (not NULL). Without this header
+    # the canvas's My Chat tab (which filters source_id IS NULL) would
+    # render the internal "Delegation results are ready..." trigger
+    # as a user-typed message. Bug observed 2026-04-25 on UX A/B Lab
+    # Design Director chat.
+    a2a_headers = a2a_call.kwargs.get("headers") or {}
+    assert a2a_headers.get("X-Workspace-ID") == "ws-abc", (
+        f"self-message must self-identify via X-Workspace-ID header, "
+        f"got headers={a2a_headers!r}"
+    )
+
 
 @pytest.mark.asyncio
 async def test_check_delegations_cooldown():