merge: sync staging into refactor/remove-canvas-hermes-runtime-profile-2054 (pickup #2099+#2107 TLS fixes)
This commit is contained in:
commit
b8f24e93da
113
.github/workflows/auto-tag-runtime.yml
vendored
Normal file
113
.github/workflows/auto-tag-runtime.yml
vendored
Normal file
@ -0,0 +1,113 @@
|
||||
name: auto-tag-runtime
|
||||
|
||||
# Auto-tag runtime releases on every merge to main that touches workspace/.
|
||||
# This is the entry point of the runtime CD chain:
|
||||
#
|
||||
# merge PR → auto-tag-runtime (this) → publish-runtime → cascade → template
|
||||
# image rebuilds → repull on hosts.
|
||||
#
|
||||
# Default bump is patch. Override via PR label `release:minor` or
|
||||
# `release:major` BEFORE merging — the label is read off the merged PR
|
||||
# associated with the push commit.
|
||||
#
|
||||
# Skips when:
|
||||
# - The push isn't to main (other branches don't auto-release).
|
||||
# - The merge commit message contains `[skip-release]` (escape hatch
|
||||
# for cleanup PRs that touch workspace/ but shouldn't ship).
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [main]
|
||||
paths:
|
||||
- "workspace/**"
|
||||
- "scripts/build_runtime_package.py"
|
||||
- ".github/workflows/auto-tag-runtime.yml"
|
||||
- ".github/workflows/publish-runtime.yml"
|
||||
|
||||
permissions:
|
||||
contents: write # to push the new tag
|
||||
pull-requests: read # to read labels off the merged PR
|
||||
|
||||
concurrency:
|
||||
# Serialize tag bumps so two near-simultaneous merges can't both think
|
||||
# they're 0.1.6 and race to push the same tag.
|
||||
group: auto-tag-runtime
|
||||
cancel-in-progress: false
|
||||
|
||||
jobs:
|
||||
tag:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0 # need full tag history for `git describe` / sort
|
||||
|
||||
- name: Skip when commit asks
|
||||
id: skip
|
||||
run: |
|
||||
MSG=$(git log -1 --format=%B "${{ github.sha }}")
|
||||
if echo "$MSG" | grep -qiE '\[skip-release\]|\[no-release\]'; then
|
||||
echo "skip=true" >> "$GITHUB_OUTPUT"
|
||||
echo "Commit message contains [skip-release] — no tag will be created."
|
||||
else
|
||||
echo "skip=false" >> "$GITHUB_OUTPUT"
|
||||
fi
|
||||
|
||||
- name: Determine bump kind from PR label
|
||||
id: bump
|
||||
if: steps.skip.outputs.skip != 'true'
|
||||
env:
|
||||
GH_TOKEN: ${{ github.token }}
|
||||
run: |
|
||||
# The merged PR for this push commit. `gh pr list --search` finds
|
||||
# closed PRs whose merge commit matches; we take the first.
|
||||
PR=$(gh pr list --state merged --search "${{ github.sha }}" --json number,labels --jq '.[0]' 2>/dev/null || echo "")
|
||||
if [ -z "$PR" ] || [ "$PR" = "null" ]; then
|
||||
echo "No merged PR found for ${{ github.sha }} — defaulting to patch bump."
|
||||
echo "kind=patch" >> "$GITHUB_OUTPUT"
|
||||
exit 0
|
||||
fi
|
||||
LABELS=$(echo "$PR" | jq -r '.labels[].name')
|
||||
if echo "$LABELS" | grep -qx 'release:major'; then
|
||||
echo "kind=major" >> "$GITHUB_OUTPUT"
|
||||
elif echo "$LABELS" | grep -qx 'release:minor'; then
|
||||
echo "kind=minor" >> "$GITHUB_OUTPUT"
|
||||
else
|
||||
echo "kind=patch" >> "$GITHUB_OUTPUT"
|
||||
fi
|
||||
|
||||
- name: Compute next version from latest runtime-v* tag
|
||||
id: version
|
||||
if: steps.skip.outputs.skip != 'true'
|
||||
run: |
|
||||
# Find the highest runtime-vX.Y.Z tag. `sort -V` handles semver
|
||||
# ordering; `grep` filters to the right tag prefix.
|
||||
LATEST=$(git tag --list 'runtime-v*' | sort -V | tail -1)
|
||||
if [ -z "$LATEST" ]; then
|
||||
# No prior tag — start the runtime line at 0.1.0.
|
||||
CURRENT="0.0.0"
|
||||
else
|
||||
CURRENT="${LATEST#runtime-v}"
|
||||
fi
|
||||
MAJOR=$(echo "$CURRENT" | cut -d. -f1)
|
||||
MINOR=$(echo "$CURRENT" | cut -d. -f2)
|
||||
PATCH=$(echo "$CURRENT" | cut -d. -f3)
|
||||
case "${{ steps.bump.outputs.kind }}" in
|
||||
major) MAJOR=$((MAJOR+1)); MINOR=0; PATCH=0;;
|
||||
minor) MINOR=$((MINOR+1)); PATCH=0;;
|
||||
patch) PATCH=$((PATCH+1));;
|
||||
esac
|
||||
NEW="$MAJOR.$MINOR.$PATCH"
|
||||
echo "current=$CURRENT" >> "$GITHUB_OUTPUT"
|
||||
echo "new=$NEW" >> "$GITHUB_OUTPUT"
|
||||
echo "Bumping runtime $CURRENT → $NEW (${{ steps.bump.outputs.kind }})"
|
||||
|
||||
- name: Push new tag
|
||||
if: steps.skip.outputs.skip != 'true'
|
||||
run: |
|
||||
NEW_TAG="runtime-v${{ steps.version.outputs.new }}"
|
||||
git config user.name "github-actions[bot]"
|
||||
git config user.email "41898282+github-actions[bot]@users.noreply.github.com"
|
||||
git tag -a "$NEW_TAG" -m "runtime $NEW_TAG (auto-bump from ${{ steps.bump.outputs.kind }})"
|
||||
git push origin "$NEW_TAG"
|
||||
echo "Pushed $NEW_TAG — publish-runtime workflow will fire on the tag."
|
||||
161
.github/workflows/publish-runtime.yml
vendored
Normal file
161
.github/workflows/publish-runtime.yml
vendored
Normal file
@ -0,0 +1,161 @@
|
||||
name: publish-runtime
|
||||
|
||||
# Publishes molecule-ai-workspace-runtime to PyPI from monorepo workspace/.
|
||||
# Monorepo workspace/ is the only source-of-truth for runtime code; this
|
||||
# workflow is the bridge from monorepo edits to the PyPI artifact that
|
||||
# the 8 workspace-template-* repos depend on.
|
||||
#
|
||||
# Triggered by:
|
||||
# - Pushing a tag matching `runtime-vX.Y.Z` (the version is derived from
|
||||
# the tag — `runtime-v0.1.6` publishes `0.1.6`).
|
||||
# - Manual workflow_dispatch with an explicit `version` input (useful for
|
||||
# dev/test releases without tagging the repo).
|
||||
#
|
||||
# The workflow:
|
||||
# 1. Runs scripts/build_runtime_package.py to copy workspace/ →
|
||||
# build/molecule_runtime/ with imports rewritten (`a2a_client` →
|
||||
# `molecule_runtime.a2a_client`).
|
||||
# 2. Builds wheel + sdist with `python -m build`.
|
||||
# 3. Publishes to PyPI via twine + repo secret PYPI_TOKEN.
|
||||
#
|
||||
# After publish: the 8 template repos pick up the new version on their
|
||||
# next image rebuild (their requirements.txt pin
|
||||
# `molecule-ai-workspace-runtime>=0.1.0`, so any new release is eligible).
|
||||
# To force-pull immediately, bump the pin in each template repo's
|
||||
# requirements.txt and merge — that triggers their own publish-image.yml.
|
||||
|
||||
on:
|
||||
push:
|
||||
tags:
|
||||
- "runtime-v*"
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
version:
|
||||
description: "Version to publish (e.g. 0.1.6). Required for manual dispatch."
|
||||
required: true
|
||||
type: string
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
publish:
|
||||
runs-on: ubuntu-latest
|
||||
environment: pypi-publish
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.11"
|
||||
cache: pip
|
||||
|
||||
- name: Derive version from tag or input
|
||||
id: version
|
||||
run: |
|
||||
if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
|
||||
VERSION="${{ inputs.version }}"
|
||||
else
|
||||
# Tag is `runtime-vX.Y.Z` — strip the prefix.
|
||||
VERSION="${GITHUB_REF_NAME#runtime-v}"
|
||||
fi
|
||||
if ! echo "$VERSION" | grep -qE '^[0-9]+\.[0-9]+\.[0-9]+(\.dev[0-9]+|rc[0-9]+|a[0-9]+|b[0-9]+|\.post[0-9]+)?$'; then
|
||||
echo "::error::version $VERSION does not match PEP 440"
|
||||
exit 1
|
||||
fi
|
||||
echo "version=$VERSION" >> "$GITHUB_OUTPUT"
|
||||
echo "Publishing molecule-ai-workspace-runtime $VERSION"
|
||||
|
||||
- name: Install build tooling
|
||||
run: pip install build twine
|
||||
|
||||
- name: Build package from workspace/
|
||||
run: |
|
||||
python scripts/build_runtime_package.py \
|
||||
--version "${{ steps.version.outputs.version }}" \
|
||||
--out "${{ runner.temp }}/runtime-build"
|
||||
|
||||
- name: Build wheel + sdist
|
||||
working-directory: ${{ runner.temp }}/runtime-build
|
||||
run: python -m build
|
||||
|
||||
- name: Verify package contents (sanity)
|
||||
working-directory: ${{ runner.temp }}/runtime-build
|
||||
run: |
|
||||
python -m twine check dist/*
|
||||
# Smoke-import the built wheel to catch import-rewrite mistakes
|
||||
# before they hit PyPI. The package depends on a2a-sdk + httpx
|
||||
# via pyproject; install those so the smoke import resolves.
|
||||
python -m venv /tmp/smoke
|
||||
/tmp/smoke/bin/pip install --quiet dist/*.whl
|
||||
WORKSPACE_ID=00000000-0000-0000-0000-000000000000 \
|
||||
PLATFORM_URL=http://localhost:8080 \
|
||||
/tmp/smoke/bin/python -c "
|
||||
from molecule_runtime import a2a_client, a2a_tools
|
||||
from molecule_runtime.builtin_tools import memory
|
||||
from molecule_runtime.adapters import get_adapter, BaseAdapter, AdapterConfig
|
||||
assert a2a_client._A2A_QUEUED_PREFIX, 'queued prefix missing — chat-leak fix not in build'
|
||||
print('✓ smoke import passed')
|
||||
"
|
||||
|
||||
- name: Publish to PyPI
|
||||
working-directory: ${{ runner.temp }}/runtime-build
|
||||
env:
|
||||
TWINE_USERNAME: __token__
|
||||
TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
|
||||
run: python -m twine upload dist/*
|
||||
|
||||
cascade:
|
||||
# After PyPI accepts the upload, fan out a repository_dispatch to each
|
||||
# template repo so they rebuild their image against the new runtime.
|
||||
# Each template's `runtime-published.yml` receiver picks up the event,
|
||||
# pulls the new PyPI version (their requirements.txt pin is `>=`), and
|
||||
# republishes ghcr.io/molecule-ai/workspace-template-<runtime>:latest.
|
||||
#
|
||||
# Soft-fail per repo: if one template's dispatch fails (perms missing,
|
||||
# repo archived, etc.) we still try the others and surface the failures
|
||||
# in the workflow summary instead of aborting the whole cascade.
|
||||
needs: publish
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Fan out repository_dispatch
|
||||
env:
|
||||
# Fine-grained PAT with `actions:write` on the 8 template repos.
|
||||
# GITHUB_TOKEN can't fire dispatches across repos — needs an explicit
|
||||
# token. Stored as a repo secret; rotate per the standard schedule.
|
||||
DISPATCH_TOKEN: ${{ secrets.TEMPLATE_DISPATCH_TOKEN }}
|
||||
RUNTIME_VERSION: ${{ needs.publish.outputs.version || steps.version.outputs.version }}
|
||||
run: |
|
||||
set +e # don't abort on a single repo failure — collect them all
|
||||
if [ -z "$DISPATCH_TOKEN" ]; then
|
||||
echo "::warning::TEMPLATE_DISPATCH_TOKEN secret not set — skipping cascade. PyPI was published; templates will pick up the new version on their own next rebuild."
|
||||
exit 0
|
||||
fi
|
||||
# Re-derive version from the tag here too (in case publish job
|
||||
# didn't expose an output the previous step's reference reads).
|
||||
VERSION="${GITHUB_REF_NAME#runtime-v}"
|
||||
if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
|
||||
VERSION="${{ inputs.version }}"
|
||||
fi
|
||||
TEMPLATES="claude-code langgraph crewai autogen deepagents hermes gemini-cli openclaw"
|
||||
FAILED=""
|
||||
for tpl in $TEMPLATES; do
|
||||
REPO="Molecule-AI/molecule-ai-workspace-template-$tpl"
|
||||
STATUS=$(curl -sS -o /tmp/dispatch.out -w "%{http_code}" \
|
||||
-X POST "https://api.github.com/repos/$REPO/dispatches" \
|
||||
-H "Authorization: Bearer $DISPATCH_TOKEN" \
|
||||
-H "Accept: application/vnd.github+json" \
|
||||
-H "X-GitHub-Api-Version: 2022-11-28" \
|
||||
-d "{\"event_type\":\"runtime-published\",\"client_payload\":{\"runtime_version\":\"$VERSION\"}}")
|
||||
if [ "$STATUS" = "204" ]; then
|
||||
echo "✓ dispatched $tpl ($VERSION)"
|
||||
else
|
||||
echo "::warning::✗ failed to dispatch $tpl: HTTP $STATUS — $(cat /tmp/dispatch.out)"
|
||||
FAILED="$FAILED $tpl"
|
||||
fi
|
||||
done
|
||||
if [ -n "$FAILED" ]; then
|
||||
echo "::warning::Cascade incomplete. Failed templates:$FAILED"
|
||||
# Don't fail the whole job — PyPI publish already succeeded;
|
||||
# operators can retry the failed templates manually.
|
||||
fi
|
||||
28
.github/workflows/sweep-cf-orphans.yml
vendored
28
.github/workflows/sweep-cf-orphans.yml
vendored
@ -40,10 +40,14 @@ on:
|
||||
description: "Override safety gate (default 50, set higher only for major cleanup)"
|
||||
required: false
|
||||
default: "50"
|
||||
# Required-check support: scheduled-only today, but include merge_group
|
||||
# so a future branch-protection wire-in doesn't need a workflow edit.
|
||||
merge_group:
|
||||
types: [checks_requested]
|
||||
# No `merge_group:` trigger on purpose. This is a janitor — it doesn't
|
||||
# need to gate merges, and including it as written before #2088 fired
|
||||
# the full sweep job (or its secret-check) on every PR going through
|
||||
# the merge queue, generating one red CI run per merge-queue eval. If
|
||||
# this workflow is ever wired up as a required check, re-add
|
||||
# merge_group: { types: [checks_requested] }
|
||||
# AND gate the sweep step with `if: github.event_name != 'merge_group'`
|
||||
# so merge-queue evals report success without actually running.
|
||||
|
||||
# Don't let two sweeps race the same zone. workflow_dispatch during a
|
||||
# scheduled run would otherwise issue duplicate DELETE calls.
|
||||
@ -77,9 +81,12 @@ jobs:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Verify required secrets present
|
||||
# Fail fast and loud if a secret is unset — sweep-cf-orphans.sh
|
||||
# also checks via `need`, but we want a single distinct error
|
||||
# in the workflow log instead of script-level multi-line noise.
|
||||
id: verify
|
||||
# Soft skip when secrets aren't configured. The 6 secrets have
|
||||
# to be set on the repo manually before this workflow can do
|
||||
# real work; until they are, the schedule is a no-op rather
|
||||
# than a recurring red CI run. workflow_dispatch surfaces a
|
||||
# warning so an operator running it ad-hoc sees the gap.
|
||||
run: |
|
||||
missing=()
|
||||
for var in CF_API_TOKEN CF_ZONE_ID CP_PROD_ADMIN_TOKEN CP_STAGING_ADMIN_TOKEN AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY; do
|
||||
@ -88,12 +95,15 @@ jobs:
|
||||
fi
|
||||
done
|
||||
if [ ${#missing[@]} -gt 0 ]; then
|
||||
echo "::error::missing required secret(s): ${missing[*]}"
|
||||
exit 2
|
||||
echo "::warning::skipping sweep — secrets not yet configured: ${missing[*]}"
|
||||
echo "skip=true" >> "$GITHUB_OUTPUT"
|
||||
exit 0
|
||||
fi
|
||||
echo "All required secrets present ✓"
|
||||
echo "skip=false" >> "$GITHUB_OUTPUT"
|
||||
|
||||
- name: Run sweep
|
||||
if: steps.verify.outputs.skip != 'true'
|
||||
# Schedule-vs-dispatch dry-run asymmetry (intentional):
|
||||
# - Scheduled runs: github.event.inputs.dry_run is empty →
|
||||
# defaults to "false" below → script runs with --execute
|
||||
|
||||
@ -46,7 +46,17 @@ const TENANT_DOMAIN = process.env.STAGING_TENANT_DOMAIN || "staging.moleculesai.
|
||||
// were blocking staging→main syncs on 2026-04-24.
|
||||
const PROVISION_TIMEOUT_MS = 20 * 60 * 1000;
|
||||
const WORKSPACE_ONLINE_TIMEOUT_MS = 20 * 60 * 1000;
|
||||
const TLS_TIMEOUT_MS = 3 * 60 * 1000;
|
||||
|
||||
// TLS readiness depends on (1) Cloudflare DNS propagation through the
|
||||
// edge, (2) the tenant's CF Tunnel registering the new hostname, (3)
|
||||
// CF's edge ACME cert provisioning + cache. Each of these layers can
|
||||
// add 1-3 min on its own under heavy staging load. Bumped 10→15 min
|
||||
// after a burst of canary failures correlated with CP changes (#2090).
|
||||
// Stays below the 20-min PROVISION_TIMEOUT envelope so a genuinely-
|
||||
// stuck tenant fails-loud at the provision step rather than
|
||||
// masquerading as a TLS issue. Kept aligned with
|
||||
// tests/e2e/test_staging_full_saas.sh.
|
||||
const TLS_TIMEOUT_MS = 15 * 60 * 1000;
|
||||
|
||||
async function jsonFetch(
|
||||
url: string,
|
||||
|
||||
@ -1,7 +1,100 @@
|
||||
import type { NextConfig } from "next";
|
||||
import { existsSync, readFileSync } from "node:fs";
|
||||
import { dirname, join } from "node:path";
|
||||
|
||||
// Load NEXT_PUBLIC_* vars from the monorepo root .env so a fresh
|
||||
// `pnpm dev` works without a per-developer canvas/.env.local. Next.js
|
||||
// only auto-loads .env from the project root by default — but our
|
||||
// canonical config (NEXT_PUBLIC_PLATFORM_URL, NEXT_PUBLIC_WS_URL,
|
||||
// MOLECULE_ENV, etc.) lives at the monorepo root, gitignored, shared
|
||||
// by the Go platform binary. Without this, the canvas falls back to
|
||||
// `window.location` (`ws://localhost:3000/ws`) and the WS pill stays
|
||||
// "Reconnecting" forever because Next.js dev doesn't serve /ws.
|
||||
//
|
||||
// Mirrors workspace-server/cmd/server/dotenv.go's monorepo-rooted .env
|
||||
// loader. Both processes look for the SAME marker (`workspace-server/
|
||||
// go.mod`) so a developer renaming or relocating the repo only has to
|
||||
// update one heuristic. Production is unaffected: `output: "standalone"`
|
||||
// bakes resolved env into the build, and the marker file isn't shipped.
|
||||
loadMonorepoEnv();
|
||||
|
||||
const nextConfig: NextConfig = {
|
||||
output: "standalone",
|
||||
};
|
||||
|
||||
export default nextConfig;
|
||||
|
||||
function loadMonorepoEnv() {
|
||||
const root = findMonorepoRoot(__dirname);
|
||||
if (!root) return;
|
||||
const envPath = join(root, ".env");
|
||||
if (!existsSync(envPath)) return;
|
||||
const body = readFileSync(envPath, "utf8");
|
||||
let loaded = 0;
|
||||
let skipped = 0;
|
||||
for (const line of body.split(/\r?\n/)) {
|
||||
const kv = parseLine(line);
|
||||
if (!kv) continue;
|
||||
const [k, v] = kv;
|
||||
// Existing env wins. NOTE: an explicitly-set empty string
|
||||
// (`KEY=` exported from a parent shell, where Node represents it
|
||||
// as `""` not `undefined`) counts as "set" — we keep the empty
|
||||
// value rather than backfilling from the file. Matches Go's
|
||||
// os.LookupEnv check in workspace-server/cmd/server/dotenv.go so
|
||||
// both processes treat the same input identically. Operators who
|
||||
// want the file value to win must `unset KEY` in the launching
|
||||
// shell.
|
||||
if (process.env[k] !== undefined) {
|
||||
skipped++;
|
||||
continue;
|
||||
}
|
||||
process.env[k] = v;
|
||||
loaded++;
|
||||
}
|
||||
// eslint-disable-next-line no-console
|
||||
console.log(
|
||||
`[next.config] loaded ${loaded} vars from ${envPath} (${skipped} already set in env)`,
|
||||
);
|
||||
}
|
||||
|
||||
function findMonorepoRoot(start: string): string | null {
|
||||
let dir = start;
|
||||
for (let i = 0; i < 6; i++) {
|
||||
if (existsSync(join(dir, "workspace-server", "go.mod"))) return dir;
|
||||
const parent = dirname(dir);
|
||||
if (parent === dir) break;
|
||||
dir = parent;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
// Mirror of workspace-server/cmd/server/dotenv.go's parseDotEnvLine
|
||||
// — same rules so the two loaders agree on every line in the shared
|
||||
// .env. If you change one parser, change the other.
|
||||
function parseLine(raw: string): [string, string] | null {
|
||||
let line = raw.replace(/^/, "").trim();
|
||||
if (line === "" || line.startsWith("#")) return null;
|
||||
// `export ` prefix uses a literal space — `export\tFOO=bar` with a
|
||||
// tab is intentionally rejected, matching the Go mirror in
|
||||
// workspace-server/cmd/server/dotenv.go. Shells emit the prefix
|
||||
// with a space; tabs would only appear in hand-mangled files.
|
||||
if (line.startsWith("export ")) line = line.slice("export ".length).trimStart();
|
||||
const eq = line.indexOf("=");
|
||||
if (eq <= 0) return null;
|
||||
const k = line.slice(0, eq).trim();
|
||||
let v = line.slice(eq + 1).replace(/^[ \t]+/, "");
|
||||
if (v.length >= 2 && (v[0] === '"' || v[0] === "'")) {
|
||||
const quote = v[0];
|
||||
const end = v.indexOf(quote, 1);
|
||||
if (end >= 0) return [k, v.slice(1, end)];
|
||||
// unterminated — fall through to bare-value handling
|
||||
}
|
||||
for (let i = 0; i < v.length; i++) {
|
||||
if (v[i] !== "#") continue;
|
||||
if (i === 0 || v[i - 1] === " " || v[i - 1] === "\t") {
|
||||
v = v.slice(0, i);
|
||||
break;
|
||||
}
|
||||
}
|
||||
return [k, v.trim()];
|
||||
}
|
||||
|
||||
@ -1,5 +1,9 @@
|
||||
@import "xterm/css/xterm.css";
|
||||
/* Theme tokens MUST load before any feature stylesheet that
|
||||
references them so custom properties are in scope. */
|
||||
@import "../styles/theme-tokens.css";
|
||||
@import "../styles/settings-panel.css";
|
||||
@import "../styles/org-deploy.css";
|
||||
|
||||
@tailwind base;
|
||||
@tailwind components;
|
||||
@ -38,7 +42,20 @@ body {
|
||||
}
|
||||
|
||||
.react-flow__node {
|
||||
transition: box-shadow 0.2s ease;
|
||||
/* Transform transition drives the "spawn from parent" motion —
|
||||
org-deploy sets the node's initial position to the parent's
|
||||
absolute coords, then repositions to the real slot, and this
|
||||
transition interpolates the translate() in between.
|
||||
Non-deploy workspace moves (drag, nest) get the same smoothing
|
||||
for free. */
|
||||
transition:
|
||||
box-shadow var(--mol-duration-fast) ease,
|
||||
transform var(--mol-duration-spawn) var(--mol-easing-bounce-out);
|
||||
}
|
||||
/* Drag events must feel instant — React Flow adds this class
|
||||
for the lifetime of the gesture. */
|
||||
.react-flow__node.dragging {
|
||||
transition: box-shadow var(--mol-duration-fast) ease;
|
||||
}
|
||||
|
||||
/* Scrollbar styling */
|
||||
|
||||
@ -7,13 +7,19 @@ import { CommunicationOverlay } from "@/components/CommunicationOverlay";
|
||||
import { Spinner } from "@/components/Spinner";
|
||||
import { connectSocket, disconnectSocket } from "@/store/socket";
|
||||
import { useCanvasStore } from "@/store/canvas";
|
||||
import { api } from "@/lib/api";
|
||||
import { api, PlatformUnavailableError } from "@/lib/api";
|
||||
import type { WorkspaceData } from "@/store/socket";
|
||||
|
||||
export default function Home() {
|
||||
const hydrationError = useCanvasStore((s) => s.hydrationError);
|
||||
const setHydrationError = useCanvasStore((s) => s.setHydrationError);
|
||||
const [hydrating, setHydrating] = useState(true);
|
||||
// Distinct from hydrationError: platform-down is its own UX path
|
||||
// (different copy, different action — the user's next step is to
|
||||
// check local services, not to retry the API call). Tracked
|
||||
// separately rather than encoded into hydrationError so the
|
||||
// generic-error branch can stay simple.
|
||||
const [platformDown, setPlatformDown] = useState(false);
|
||||
|
||||
useEffect(() => {
|
||||
connectSocket();
|
||||
@ -28,8 +34,11 @@ export default function Home() {
|
||||
useCanvasStore.getState().setViewport(viewport);
|
||||
}
|
||||
}).catch((err) => {
|
||||
// Initial hydration failed — show error banner to user
|
||||
console.error("Canvas: initial hydration failed", err);
|
||||
if (err instanceof PlatformUnavailableError) {
|
||||
setPlatformDown(true);
|
||||
return;
|
||||
}
|
||||
useCanvasStore.getState().setHydrationError(
|
||||
err instanceof Error && err.message ? err.message : "Failed to load canvas"
|
||||
);
|
||||
@ -53,6 +62,10 @@ export default function Home() {
|
||||
);
|
||||
}
|
||||
|
||||
if (platformDown) {
|
||||
return <PlatformDownDiagnostic />;
|
||||
}
|
||||
|
||||
return (
|
||||
<>
|
||||
<Canvas />
|
||||
@ -83,3 +96,43 @@ export default function Home() {
|
||||
</>
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Dedicated diagnostic for the case where the platform reported its
|
||||
* datastore (Postgres / Redis) is unreachable. Distinct from the
|
||||
* generic API-error overlay: the user's next action is to check
|
||||
* local services, not to retry the API call. Includes the exact
|
||||
* commands for the common dev-host setup.
|
||||
*/
|
||||
function PlatformDownDiagnostic() {
|
||||
return (
|
||||
<div
|
||||
role="alert"
|
||||
className="fixed inset-0 flex flex-col items-center justify-center bg-zinc-950 text-zinc-300 gap-5 z-[9999] px-6"
|
||||
>
|
||||
<div className="text-amber-400 text-sm font-semibold uppercase tracking-wider">
|
||||
Platform infrastructure unreachable
|
||||
</div>
|
||||
<p className="text-zinc-400 text-sm max-w-lg text-center leading-relaxed">
|
||||
The platform server returned <code className="font-mono text-amber-300">503 platform_unavailable</code>.
|
||||
That means it can't reach Postgres or Redis to validate your session.
|
||||
Most common cause on a dev host: one of those services stopped.
|
||||
</p>
|
||||
<div className="bg-zinc-900/80 border border-zinc-700/50 rounded-lg px-4 py-3 max-w-lg w-full">
|
||||
<div className="text-[10px] uppercase tracking-wider text-zinc-500 mb-2">Try first</div>
|
||||
<pre className="text-[12px] text-zinc-300 font-mono whitespace-pre-wrap leading-relaxed">{`brew services start postgresql@14
|
||||
brew services start redis`}</pre>
|
||||
</div>
|
||||
<p className="text-[11px] text-zinc-500 max-w-lg text-center">
|
||||
If both are running, check <code className="font-mono">/tmp/molecule-server.log</code> for
|
||||
the underlying error. If you're on hosted SaaS, this is a platform incident — try again in a moment.
|
||||
</p>
|
||||
<button
|
||||
onClick={() => window.location.reload()}
|
||||
className="px-4 py-2 bg-blue-600 hover:bg-blue-500 text-white rounded-md text-sm mt-2"
|
||||
>
|
||||
Reload
|
||||
</button>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
@ -74,7 +74,11 @@ export function buildA2AEdges(
|
||||
});
|
||||
}
|
||||
|
||||
// 3. Build React Flow Edge objects
|
||||
// 3. Build React Flow Edge objects. We tag every overlay edge with
|
||||
// type: "a2a" so React Flow renders it via our custom A2AEdge
|
||||
// component (canvas/A2AEdge.tsx). The custom component portals
|
||||
// its label out of the SVG layer so it (a) doesn't get hidden
|
||||
// behind workspace cards and (b) is clickable.
|
||||
return Array.from(map.values()).map(({ source, target, count, lastAt }) => {
|
||||
const isHot = now - lastAt < A2A_HOT_MS;
|
||||
const stroke = isHot ? "#8b5cf6" : "#3b82f6"; // violet-500 : blue-500
|
||||
@ -84,6 +88,7 @@ export function buildA2AEdges(
|
||||
|
||||
return {
|
||||
id: `a2a-${source}-${target}`,
|
||||
type: "a2a",
|
||||
source,
|
||||
target,
|
||||
animated: isHot,
|
||||
@ -96,22 +101,22 @@ export function buildA2AEdges(
|
||||
style: {
|
||||
stroke,
|
||||
strokeWidth: 2,
|
||||
// Non-blocking: label overlay never intercepts pointer events
|
||||
// Path itself stays non-interactive so node drags through
|
||||
// the line still work. The clickable target is the label
|
||||
// pill, which sets pointerEvents: all on its own div.
|
||||
pointerEvents: "none" as React.CSSProperties["pointerEvents"],
|
||||
},
|
||||
// `label` keeps the same string for back-compat with any test
|
||||
// that asserts on it (e.g. buildA2AEdges output shape). Custom
|
||||
// edge reads the rich data from `data` so the label visual is
|
||||
// not constrained to a string anymore.
|
||||
label,
|
||||
labelStyle: {
|
||||
fill: "#a1a1aa", // zinc-400
|
||||
fontSize: 10,
|
||||
pointerEvents: "none" as React.CSSProperties["pointerEvents"],
|
||||
data: {
|
||||
count,
|
||||
lastAt,
|
||||
isHot,
|
||||
label,
|
||||
},
|
||||
labelBgStyle: {
|
||||
fill: "#18181b", // zinc-900
|
||||
fillOpacity: 0.9,
|
||||
pointerEvents: "none" as React.CSSProperties["pointerEvents"],
|
||||
},
|
||||
labelBgPadding: [4, 6] as [number, number],
|
||||
labelBgBorderRadius: 4,
|
||||
};
|
||||
});
|
||||
}
|
||||
|
||||
@ -36,11 +36,22 @@ import { DropTargetBadge } from "./canvas/DropTargetBadge";
|
||||
import { useDragHandlers } from "./canvas/useDragHandlers";
|
||||
import { useKeyboardShortcuts } from "./canvas/useKeyboardShortcuts";
|
||||
import { useCanvasViewport } from "./canvas/useCanvasViewport";
|
||||
import { A2AEdge } from "./canvas/A2AEdge";
|
||||
|
||||
const nodeTypes = {
|
||||
workspaceNode: WorkspaceNode,
|
||||
};
|
||||
|
||||
// Custom edge types. The default React Flow edge renders its label
|
||||
// inside the SVG group (always under nodes) with pointerEvents: none
|
||||
// inherited from the path. A2AEdge portals the label to a sibling
|
||||
// DOM layer so it renders above nodes and accepts clicks. Keep the
|
||||
// reference stable (module-scope const) so React Flow doesn't see a
|
||||
// new edgeTypes object on every render and warn about prop churn.
|
||||
const edgeTypes = {
|
||||
a2a: A2AEdge,
|
||||
};
|
||||
|
||||
const defaultEdgeOptions: Partial<Edge> = {
|
||||
animated: true,
|
||||
style: {
|
||||
@ -58,14 +69,95 @@ export function Canvas() {
|
||||
}
|
||||
|
||||
function CanvasInner() {
|
||||
const nodes = useCanvasStore((s) => s.nodes);
|
||||
const rawNodes = useCanvasStore((s) => s.nodes);
|
||||
const edges = useCanvasStore((s) => s.edges);
|
||||
const a2aEdges = useCanvasStore((s) => s.a2aEdges);
|
||||
const showA2AEdges = useCanvasStore((s) => s.showA2AEdges);
|
||||
const deletingIds = useCanvasStore((s) => s.deletingIds);
|
||||
const allEdges = useMemo(
|
||||
() => (showA2AEdges ? [...edges, ...a2aEdges] : edges),
|
||||
[edges, a2aEdges, showA2AEdges],
|
||||
);
|
||||
// Drag-lock during a system-owned operation (deploy OR delete).
|
||||
// React Flow respects Node.draggable, which stops the gesture
|
||||
// before it starts — preventDefault() on the drag-start callback
|
||||
// isn't authoritative in v12. We project `draggable: false` onto
|
||||
// each locked node before handing the array to ReactFlow; the
|
||||
// drag-start handler in useDragHandlers remains as a belt-and-
|
||||
// braces check.
|
||||
//
|
||||
// Perf: short-circuit when nothing is provisioning so the memo
|
||||
// passes rawNodes through unchanged (identity-stable → RF
|
||||
// reconciles nothing). When a deploy IS active, build an O(n)
|
||||
// root index once and re-use it. Critically, do NOT spread every
|
||||
// node — only mutate the locked ones — so unmodified nodes keep
|
||||
// their object identity and RF's per-node memo short-circuits.
|
||||
const nodes = useMemo(() => {
  // Derive the render-ready node list from the raw store nodes, forcing
  // `draggable: false` on nodes owned by an in-flight operation:
  //   - deploy-lock: non-root nodes of a tree with provisioning nodes;
  //   - delete-lock: any node whose id is in `deletingIds`.
  // Returns `rawNodes` itself (same reference) whenever nothing changed,
  // so downstream identity-based memoization still short-circuits.
  const anyProvisioning = rawNodes.some((n) => n.data.status === "provisioning");
  const anyDeleting = deletingIds.size > 0;
  // Fast path: no in-flight operation anywhere — keep the original array.
  if (!anyProvisioning && !anyDeleting) return rawNodes;

  const byId = new Map<string, typeof rawNodes[number]>();
  for (const n of rawNodes) byId.set(n.id, n);
  // Memo of node id → root id, shared across resolveRoot calls below.
  const rootOf = new Map<string, string>();
  const resolveRoot = (id: string): string => {
    // Iterative walk guards against a pathological cycle (hostile
    // data) — recursion would hit the stack limit on a deep tree.
    const visited = new Set<string>();
    let cursor: string | null = id;
    while (cursor) {
      if (visited.has(cursor)) break;
      visited.add(cursor);
      const cached = rootOf.get(cursor);
      if (cached) {
        // Path compression: every node seen on this walk shares the root.
        for (const seenId of visited) rootOf.set(seenId, cached);
        return cached;
      }
      const n = byId.get(cursor);
      if (!n) break;
      if (!n.data.parentId) {
        for (const seenId of visited) rootOf.set(seenId, cursor);
        return cursor;
      }
      cursor = n.data.parentId;
    }
    // Cycle or dangling parent reference: treat the node as its own root.
    return id;
  };

  // Count provisioning nodes per root so each node can check its tree's
  // deploy state with a single map lookup in the pass below.
  const provisioningByRoot = new Map<string, number>();
  for (const n of rawNodes) {
    if (n.data.status !== "provisioning") continue;
    const rootId = resolveRoot(n.id);
    provisioningByRoot.set(rootId, (provisioningByRoot.get(rootId) ?? 0) + 1);
  }

  let touched = false;
  const next = rawNodes.map((n) => {
    const rootId = resolveRoot(n.id);
    // Deploy-locked: a non-root node whose tree has provisioning nodes.
    const deployLocked = n.id !== rootId && (provisioningByRoot.get(rootId) ?? 0) > 0;
    // Delete-locked: nothing in a subtree whose DELETE is in
    // flight should be draggable, INCLUDING the root of that
    // subtree (unlike deploy, there's no cancel — the delete
    // is irrevocable at this point).
    const deleteLocked = deletingIds.has(n.id);
    const shouldLock = deployLocked || deleteLocked;
    if (shouldLock && n.draggable !== false) {
      touched = true;
      return { ...n, draggable: false };
    }
    if (!shouldLock && n.draggable === false) {
      // Node was locked in a prior render; deploy cancelled /
      // completed, or delete failed and was reverted. Restore
      // default dragability by dropping the `draggable` key entirely.
      touched = true;
      const { draggable: _d, ...rest } = n;
      void _d;
      return rest as typeof n;
    }
    return n; // identity-preserved
  });
  return touched ? next : rawNodes;
}, [rawNodes, deletingIds]);
|
||||
const onNodesChange = useCanvasStore((s) => s.onNodesChange);
|
||||
const selectNode = useCanvasStore((s) => s.selectNode);
|
||||
const selectedNodeId = useCanvasStore((s) => s.selectedNodeId);
|
||||
@ -91,18 +183,45 @@ function CanvasInner() {
|
||||
// outside-click handler.
|
||||
const pendingDelete = useCanvasStore((s) => s.pendingDelete);
|
||||
const setPendingDelete = useCanvasStore((s) => s.setPendingDelete);
|
||||
const removeNode = useCanvasStore((s) => s.removeNode);
|
||||
const removeSubtree = useCanvasStore((s) => s.removeSubtree);
|
||||
const confirmDelete = useCallback(async () => {
|
||||
if (!pendingDelete) return;
|
||||
const { id } = pendingDelete;
|
||||
setPendingDelete(null);
|
||||
// Compute the full subtree and mark it as "deleting" so every
|
||||
// node in the chain renders dim + non-draggable during the
|
||||
// network round-trip + the server-side cascade. Matches the
|
||||
// deploy-lock UX: once a system-initiated operation owns this
|
||||
// subtree, the user shouldn't be able to move its pieces
|
||||
// around until it resolves.
|
||||
const state = useCanvasStore.getState();
|
||||
const subtree = new Set<string>();
|
||||
const stack = [id];
|
||||
while (stack.length) {
|
||||
const nid = stack.pop()!;
|
||||
subtree.add(nid);
|
||||
for (const n of state.nodes) {
|
||||
if (n.data.parentId === nid) stack.push(n.id);
|
||||
}
|
||||
}
|
||||
state.beginDelete(subtree);
|
||||
try {
|
||||
await api.del(`/workspaces/${id}?confirm=true`);
|
||||
removeNode(id);
|
||||
// Mirror the server-side cascade locally — drop the parent AND
|
||||
// every descendant in one atomic update. The per-descendant
|
||||
// WORKSPACE_REMOVED WS events still arrive (and are no-ops
|
||||
// because the nodes are already gone), but we no longer depend
|
||||
// on them: a wedged WS used to leave orphan child cards on the
|
||||
// canvas until the user refreshed the page.
|
||||
removeSubtree(id);
|
||||
state.endDelete(subtree);
|
||||
} catch (e) {
|
||||
// Network or server error — restore the subtree to normal
|
||||
// interaction and surface the error.
|
||||
state.endDelete(subtree);
|
||||
showToast(e instanceof Error ? e.message : "Delete failed", "error");
|
||||
}
|
||||
}, [pendingDelete, setPendingDelete, removeNode]);
|
||||
}, [pendingDelete, setPendingDelete, removeSubtree]);
|
||||
|
||||
const onPaneClick = useCallback(() => {
|
||||
selectNode(null);
|
||||
@ -141,6 +260,7 @@ function CanvasInner() {
|
||||
onPaneClick={onPaneClick}
|
||||
onMoveEnd={onMoveEnd}
|
||||
nodeTypes={nodeTypes}
|
||||
edgeTypes={edgeTypes}
|
||||
defaultEdgeOptions={defaultEdgeOptions}
|
||||
defaultViewport={defaultViewport}
|
||||
fitView={viewport.x === 0 && viewport.y === 0 && viewport.zoom === 1}
|
||||
|
||||
@ -1,27 +1,19 @@
|
||||
"use client";
|
||||
|
||||
import { useState, useEffect } from "react";
|
||||
import { useState, useEffect, useCallback } from "react";
|
||||
import { api } from "@/lib/api";
|
||||
import { useCanvasStore } from "@/store/canvas";
|
||||
import { OrgTemplatesSection } from "./TemplatePalette";
|
||||
import { type Template } from "@/lib/deploy-preflight";
|
||||
import { useTemplateDeploy } from "@/hooks/useTemplateDeploy";
|
||||
import { Spinner } from "./Spinner";
|
||||
import { TIER_CONFIG } from "@/lib/design-tokens";
|
||||
|
||||
interface Template {
|
||||
id: string;
|
||||
name: string;
|
||||
description: string;
|
||||
tier: number;
|
||||
model: string;
|
||||
skills: string[];
|
||||
skill_count: number;
|
||||
}
|
||||
|
||||
export function EmptyState() {
|
||||
const [templates, setTemplates] = useState<Template[]>([]);
|
||||
const [loading, setLoading] = useState(true);
|
||||
const [deploying, setDeploying] = useState<string | null>(null);
|
||||
const [error, setError] = useState<string | null>(null);
|
||||
const [blankCreating, setBlankCreating] = useState(false);
|
||||
const [blankError, setBlankError] = useState<string | null>(null);
|
||||
|
||||
useEffect(() => {
|
||||
api
|
||||
@ -31,48 +23,56 @@ export function EmptyState() {
|
||||
.finally(() => setLoading(false));
|
||||
}, []);
|
||||
|
||||
const deploy = async (template: Template) => {
|
||||
setDeploying(template.id);
|
||||
setError(null);
|
||||
try {
|
||||
const ws = await api.post<{ id: string }>("/workspaces", {
|
||||
name: template.name,
|
||||
template: template.id,
|
||||
tier: template.tier,
|
||||
canvas: { x: 200, y: 150 },
|
||||
});
|
||||
// Auto-select the new workspace and open chat
|
||||
setTimeout(() => {
|
||||
useCanvasStore.getState().selectNode(ws.id);
|
||||
useCanvasStore.getState().setPanelTab("chat");
|
||||
}, 500);
|
||||
} catch (e) {
|
||||
setError(e instanceof Error ? e.message : "Deploy failed");
|
||||
} finally {
|
||||
setDeploying(null);
|
||||
}
|
||||
};
|
||||
// Canvas fills in a visible "center-ish" spot on a fresh tenant so
|
||||
// the user doesn't have to pan to find their new workspace. Fixed
|
||||
// (200, 150) instead of the sidebar's random placement because the
|
||||
// canvas is guaranteed empty when this component mounts.
|
||||
const firstDeployCoords = useCallback(() => ({ x: 200, y: 150 }), []);
|
||||
|
||||
// After the POST succeeds, auto-select the new workspace and flip
|
||||
// the panel to Chat. This is a UX flourish that only makes sense
|
||||
// on first deploy (the canvas is empty so the selection can't
|
||||
// surprise anyone); the sidebar intentionally skips this step.
|
||||
// 500 ms delay so React Flow has a frame to render the new node
|
||||
// before it receives focus.
|
||||
const handleDeployed = useCallback((workspaceId: string) => {
|
||||
setTimeout(() => {
|
||||
useCanvasStore.getState().selectNode(workspaceId);
|
||||
useCanvasStore.getState().setPanelTab("chat");
|
||||
}, 500);
|
||||
}, []);
|
||||
|
||||
const { deploy, deploying, error, modal } = useTemplateDeploy({
|
||||
canvasCoords: firstDeployCoords,
|
||||
onDeployed: handleDeployed,
|
||||
});
|
||||
|
||||
// "Create blank" bypasses templates entirely — no preflight, no
|
||||
// modal, just POST /workspaces with a default name and tier.
|
||||
// Deliberately NOT routed through useTemplateDeploy because it
|
||||
// has no `template.id` to deploy against.
|
||||
const createBlank = async () => {
|
||||
setDeploying("blank");
|
||||
setError(null);
|
||||
setBlankCreating(true);
|
||||
setBlankError(null);
|
||||
try {
|
||||
const ws = await api.post<{ id: string }>("/workspaces", {
|
||||
name: "My First Agent",
|
||||
tier: 2,
|
||||
canvas: { x: 200, y: 150 },
|
||||
canvas: firstDeployCoords(),
|
||||
});
|
||||
setTimeout(() => {
|
||||
useCanvasStore.getState().selectNode(ws.id);
|
||||
useCanvasStore.getState().setPanelTab("chat");
|
||||
}, 500);
|
||||
handleDeployed(ws.id);
|
||||
} catch (e) {
|
||||
setError(e instanceof Error ? e.message : "Create failed");
|
||||
setBlankError(e instanceof Error ? e.message : "Create failed");
|
||||
} finally {
|
||||
setDeploying(null);
|
||||
setBlankCreating(false);
|
||||
}
|
||||
};
|
||||
|
||||
// Any active gesture locks every button so the user can't fire a
|
||||
// second POST while the first is still in flight.
|
||||
const anyDeploying = !!deploying || blankCreating;
|
||||
const displayError = error ?? blankError;
|
||||
|
||||
return (
|
||||
<div className="absolute inset-0 flex items-start justify-center pointer-events-none z-[1] overflow-y-auto py-8">
|
||||
<div className="relative max-w-2xl w-full rounded-3xl border border-zinc-800/70 bg-zinc-950/80 backdrop-blur-xl px-8 py-8 text-center shadow-2xl shadow-black/40 pointer-events-auto mx-4">
|
||||
@ -112,8 +112,8 @@ export function EmptyState() {
|
||||
<button
|
||||
type="button"
|
||||
key={t.id}
|
||||
onClick={() => deploy(t)}
|
||||
disabled={!!deploying}
|
||||
onClick={() => void deploy(t)}
|
||||
disabled={anyDeploying}
|
||||
className="group rounded-xl border border-zinc-800/60 bg-zinc-900/50 px-3.5 py-3 hover:border-blue-500/40 hover:bg-zinc-900/80 transition-all disabled:opacity-50 disabled:cursor-not-allowed disabled:hover:border-zinc-800/60 disabled:hover:bg-zinc-900/50 text-left focus:outline-none focus-visible:ring-2 focus-visible:ring-blue-500/70"
|
||||
>
|
||||
<div className="flex items-center gap-2 mb-1">
|
||||
@ -143,10 +143,10 @@ export function EmptyState() {
|
||||
<button
|
||||
type="button"
|
||||
onClick={createBlank}
|
||||
disabled={!!deploying}
|
||||
disabled={anyDeploying}
|
||||
className="w-full rounded-xl border border-dashed border-zinc-700/60 bg-zinc-900/30 px-4 py-3 text-sm text-zinc-400 hover:text-zinc-200 hover:border-zinc-600 hover:bg-zinc-900/50 transition-all disabled:opacity-50 disabled:cursor-not-allowed disabled:hover:text-zinc-400 disabled:hover:border-zinc-700/60 focus:outline-none focus-visible:ring-2 focus-visible:ring-blue-500/70"
|
||||
>
|
||||
{deploying === "blank" ? "Creating..." : "+ Create blank workspace"}
|
||||
{blankCreating ? "Creating..." : "+ Create blank workspace"}
|
||||
</button>
|
||||
|
||||
{/* Org templates — instantiate a whole team in one click */}
|
||||
@ -154,12 +154,17 @@ export function EmptyState() {
|
||||
<OrgTemplatesSection />
|
||||
</div>
|
||||
|
||||
{error && (
|
||||
{displayError && (
|
||||
<div role="alert" className="mt-3 px-3 py-2 bg-red-950/40 border border-red-800/50 rounded-lg text-xs text-red-400">
|
||||
{error}
|
||||
{displayError}
|
||||
</div>
|
||||
)}
|
||||
|
||||
{/* Missing-keys preflight modal — owned by useTemplateDeploy,
|
||||
shared with TemplatePalette. Rendered inline here so it
|
||||
overlays this card naturally. */}
|
||||
{modal}
|
||||
|
||||
{/* Tips */}
|
||||
<div className="mt-5 pt-4 border-t border-zinc-800/50">
|
||||
<div className="flex items-center justify-center gap-6 text-[10px] text-zinc-400">
|
||||
|
||||
@ -1,19 +1,92 @@
|
||||
"use client";
|
||||
|
||||
import { useEffect, useState } from "react";
|
||||
import { STATUS_CONFIG } from "@/lib/design-tokens";
|
||||
import { useCanvasStore } from "@/store/canvas";
|
||||
|
||||
const LEGEND_STATUSES = ["online", "provisioning", "degraded", "failed", "paused", "offline"] as const;
|
||||
|
||||
// Persist the user's choice across sessions. Default is "open" so
|
||||
// first-time users still see the symbol key; once dismissed we
|
||||
// respect that until they explicitly reopen via the floating pill.
|
||||
const STORAGE_KEY = "molecule.legend.open";
|
||||
|
||||
function readStoredOpen(): boolean {
|
||||
if (typeof window === "undefined") return true;
|
||||
try {
|
||||
const v = window.localStorage.getItem(STORAGE_KEY);
|
||||
if (v === null) return true;
|
||||
return v === "1";
|
||||
} catch {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
function writeStoredOpen(open: boolean) {
|
||||
if (typeof window === "undefined") return;
|
||||
try {
|
||||
window.localStorage.setItem(STORAGE_KEY, open ? "1" : "0");
|
||||
} catch {
|
||||
// localStorage can throw in private mode / quota / disabled
|
||||
// contexts. Silent fallback — the in-memory state still works
|
||||
// for the current session.
|
||||
}
|
||||
}
|
||||
|
||||
export function Legend() {
|
||||
// TemplatePalette (when open) is fixed top-0 left-0 w-[280px] — the
|
||||
// default bottom-6 left-4 position of this legend would sit under it.
|
||||
// Shift past the 280 px palette + a 16 px gap when the palette is open.
|
||||
const paletteOpen = useCanvasStore((s) => s.templatePaletteOpen);
|
||||
const leftClass = paletteOpen ? "left-[296px]" : "left-4";
|
||||
|
||||
// SSR-safe pattern: mount with the default (true) so first paint
|
||||
// matches the server output, then hydrate the persisted value
|
||||
// after mount. Avoids a hydration mismatch warning when the user
|
||||
// had previously closed the legend.
|
||||
const [open, setOpen] = useState(true);
|
||||
useEffect(() => {
|
||||
setOpen(readStoredOpen());
|
||||
}, []);
|
||||
|
||||
const closeLegend = () => {
|
||||
setOpen(false);
|
||||
writeStoredOpen(false);
|
||||
};
|
||||
const openLegend = () => {
|
||||
setOpen(true);
|
||||
writeStoredOpen(true);
|
||||
};
|
||||
|
||||
if (!open) {
|
||||
return (
|
||||
<button
|
||||
type="button"
|
||||
onClick={openLegend}
|
||||
aria-label="Show legend"
|
||||
title="Show legend"
|
||||
className={`fixed bottom-6 ${leftClass} z-30 flex items-center gap-1.5 rounded-full bg-zinc-900/95 border border-zinc-700/50 px-3 py-1.5 text-[11px] font-semibold text-zinc-400 uppercase tracking-wider shadow-xl shadow-black/30 backdrop-blur-sm hover:text-zinc-200 hover:border-zinc-600 transition-[left,colors] duration-200`}
|
||||
>
|
||||
<span aria-hidden="true" className="text-[10px]">ⓘ</span>
|
||||
Legend
|
||||
</button>
|
||||
);
|
||||
}
|
||||
|
||||
return (
|
||||
<div className={`fixed bottom-6 ${leftClass} z-30 bg-zinc-900/95 border border-zinc-700/50 rounded-xl px-4 py-3 shadow-xl shadow-black/30 backdrop-blur-sm max-w-[280px] transition-[left] duration-200`}>
|
||||
<div className="text-[11px] font-semibold text-zinc-400 uppercase tracking-wider mb-2">Legend</div>
|
||||
<div className="flex items-start justify-between mb-2">
|
||||
<div className="text-[11px] font-semibold text-zinc-400 uppercase tracking-wider">Legend</div>
|
||||
<button
|
||||
type="button"
|
||||
onClick={closeLegend}
|
||||
aria-label="Hide legend"
|
||||
title="Hide legend"
|
||||
className="-mt-0.5 -mr-1 px-1.5 text-[14px] leading-none text-zinc-500 hover:text-zinc-200 transition-colors"
|
||||
>
|
||||
×
|
||||
</button>
|
||||
</div>
|
||||
|
||||
{/* Status */}
|
||||
<div className="mb-2">
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
"use client";
|
||||
|
||||
import { useState, useEffect, useCallback, useRef, useMemo } from "react";
|
||||
import { createPortal } from "react-dom";
|
||||
import { api } from "@/lib/api";
|
||||
import { getKeyLabel, type ProviderChoice } from "@/lib/deploy-preflight";
|
||||
|
||||
@ -196,6 +197,12 @@ function ProviderPickerModal({
|
||||
);
|
||||
|
||||
if (!open) return null;
|
||||
// Portal to document.body for the same reason as
|
||||
// OrgImportPreflightModal — several callers (TemplatePalette,
|
||||
// EmptyState) render the modal inside their own fixed+filtered
|
||||
// containers, which re-anchor the "fixed" positioning to the
|
||||
// wrapper's bounds instead of the viewport.
|
||||
if (typeof document === "undefined") return null;
|
||||
|
||||
const allSaved = entries.length > 0 && entries.every((e) => e.saved);
|
||||
const anySaving = entries.some((e) => e.saving);
|
||||
@ -203,8 +210,14 @@ function ProviderPickerModal({
|
||||
.replace(/[-_]/g, " ")
|
||||
.replace(/\b\w/g, (c) => c.toUpperCase());
|
||||
|
||||
return (
|
||||
<div className="fixed inset-0 z-50 flex items-center justify-center">
|
||||
return createPortal(
|
||||
// z-[60] so this stacks ABOVE OrgImportPreflightModal (z-50).
|
||||
// Both can be on screen at once during an org import: the org-
|
||||
// preflight is open while the user clicks a per-workspace deploy
|
||||
// that triggers MissingKeys. Without the explicit z-order the
|
||||
// backdrop click might dismiss the wrong modal depending on
|
||||
// React's commit ordering.
|
||||
<div className="fixed inset-0 z-[60] flex items-center justify-center">
|
||||
<div
|
||||
aria-hidden="true"
|
||||
className="absolute inset-0 bg-black/70 backdrop-blur-sm"
|
||||
@ -215,7 +228,7 @@ function ProviderPickerModal({
|
||||
role="dialog"
|
||||
aria-modal="true"
|
||||
aria-labelledby="missing-keys-title"
|
||||
className="relative bg-zinc-900 border border-zinc-700 rounded-xl shadow-2xl shadow-black/50 max-w-[480px] w-full mx-4 overflow-hidden"
|
||||
className="relative bg-zinc-900 border border-zinc-700 rounded-xl shadow-2xl shadow-black/50 max-w-[480px] w-full mx-4 max-h-[80vh] overflow-auto"
|
||||
>
|
||||
<div className="px-5 py-4 border-b border-zinc-800">
|
||||
<div className="flex items-center gap-2 mb-1">
|
||||
@ -360,7 +373,8 @@ function ProviderPickerModal({
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>,
|
||||
document.body,
|
||||
);
|
||||
}
|
||||
|
||||
@ -474,6 +488,7 @@ function AllKeysModal({
|
||||
}, [open]);
|
||||
|
||||
if (!open) return null;
|
||||
if (typeof document === "undefined") return null;
|
||||
|
||||
const allSaved = entries.length > 0 && entries.every((e) => e.saved);
|
||||
const anySaving = entries.some((e) => e.saving);
|
||||
@ -481,8 +496,14 @@ function AllKeysModal({
|
||||
.replace(/[-_]/g, " ")
|
||||
.replace(/\b\w/g, (c) => c.toUpperCase());
|
||||
|
||||
return (
|
||||
<div className="fixed inset-0 z-50 flex items-center justify-center">
|
||||
return createPortal(
|
||||
// z-[60] so this stacks ABOVE OrgImportPreflightModal (z-50).
|
||||
// Both can be on screen at once during an org import: the org-
|
||||
// preflight is open while the user clicks a per-workspace deploy
|
||||
// that triggers MissingKeys. Without the explicit z-order the
|
||||
// backdrop click might dismiss the wrong modal depending on
|
||||
// React's commit ordering.
|
||||
<div className="fixed inset-0 z-[60] flex items-center justify-center">
|
||||
<div
|
||||
className="absolute inset-0 bg-black/70 backdrop-blur-sm"
|
||||
aria-hidden="true"
|
||||
@ -493,7 +514,7 @@ function AllKeysModal({
|
||||
role="dialog"
|
||||
aria-modal="true"
|
||||
aria-labelledby="missing-keys-title"
|
||||
className="relative bg-zinc-900 border border-zinc-700 rounded-xl shadow-2xl shadow-black/50 max-w-[440px] w-full mx-4 overflow-hidden"
|
||||
className="relative bg-zinc-900 border border-zinc-700 rounded-xl shadow-2xl shadow-black/50 max-w-[440px] w-full mx-4 max-h-[80vh] overflow-auto"
|
||||
>
|
||||
<div className="px-5 py-4 border-b border-zinc-800">
|
||||
<div className="flex items-center gap-2 mb-1">
|
||||
@ -608,6 +629,7 @@ function AllKeysModal({
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>,
|
||||
document.body,
|
||||
);
|
||||
}
|
||||
|
||||
540
canvas/src/components/OrgImportPreflightModal.tsx
Normal file
540
canvas/src/components/OrgImportPreflightModal.tsx
Normal file
@ -0,0 +1,540 @@
|
||||
"use client";
|
||||
|
||||
import { useCallback, useEffect, useMemo, useRef, useState } from "react";
|
||||
import { createPortal } from "react-dom";
|
||||
import { createSecret } from "@/lib/api/secrets";
|
||||
|
||||
/**
|
||||
* One entry from the server's preflight `required_env` / `recommended_env`.
|
||||
*
|
||||
* - A plain string is a STRICT requirement: that exact env var must be
|
||||
* configured.
|
||||
* - A `{any_of: [...]}` object is an OR group: at least one member
|
||||
* must be configured to satisfy it. Lets a template say "either
|
||||
* ANTHROPIC_API_KEY or CLAUDE_CODE_OAUTH_TOKEN" without forcing
|
||||
* both.
|
||||
*
|
||||
* Matches the Go `EnvRequirement` type's JSON shape (MarshalJSON in
|
||||
* workspace-server/internal/handlers/org.go). The union is written so
|
||||
* that a narrow check — `typeof e === "string"` — distinguishes cleanly.
|
||||
*/
|
||||
export type EnvRequirement = string | { any_of: string[] };
|
||||
|
||||
/** Flat member list for a requirement. */
|
||||
export function envReqMembers(r: EnvRequirement): string[] {
|
||||
return typeof r === "string" ? [r] : r.any_of;
|
||||
}
|
||||
|
||||
/** True if any member is present in `configured`. */
|
||||
export function envReqSatisfied(r: EnvRequirement, configured: Set<string>): boolean {
|
||||
if (typeof r === "string") return configured.has(r);
|
||||
return r.any_of.some((m) => configured.has(m));
|
||||
}
|
||||
|
||||
/** Stable react-key / dedup key for a requirement. Sorted for groups so
|
||||
* reordered-member variants still collapse to one entry. */
|
||||
export function envReqKey(r: EnvRequirement): string {
|
||||
if (typeof r === "string") return r;
|
||||
return [...r.any_of].sort().join("|");
|
||||
}
|
||||
|
||||
interface Props {
|
||||
open: boolean;
|
||||
/** Display name of the org template — headline only. */
|
||||
orgName: string;
|
||||
/** Total workspace count so the header can read "12 workspaces". */
|
||||
workspaceCount: number;
|
||||
/** Env vars the server has declared MUST be set as global secrets.
|
||||
* Import is disabled until every entry here is configured. Entries
|
||||
* are either a single key name or an any-of group. */
|
||||
requiredEnv: EnvRequirement[];
|
||||
/** Env vars the server suggests — import can proceed without them,
|
||||
* but the user sees them listed so they can decide. Same union
|
||||
* shape as `requiredEnv`. */
|
||||
recommendedEnv: EnvRequirement[];
|
||||
/** Names of env vars already configured globally. Used to strike
|
||||
* through entries the user has already set up in another
|
||||
* session. Passed in rather than queried inside the modal so the
|
||||
* parent can refresh after each save without prop-driven effects. */
|
||||
configuredKeys: Set<string>;
|
||||
/** Called after a successful secret save so the parent can refresh
|
||||
* `configuredKeys`. */
|
||||
onSecretSaved: () => void;
|
||||
/** User clicked Import with all required envs satisfied. */
|
||||
onProceed: () => void;
|
||||
/** User dismissed the modal. Import is NOT fired. */
|
||||
onCancel: () => void;
|
||||
}
|
||||
|
||||
interface DraftEntry {
|
||||
key: string;
|
||||
value: string;
|
||||
saving: boolean;
|
||||
error: string | null;
|
||||
}
|
||||
|
||||
/**
|
||||
* OrgImportPreflightModal
|
||||
* -----------------------
|
||||
* Two-tier env preflight before POST /org/import:
|
||||
*
|
||||
* - REQUIRED section (red, blocking) — every entry MUST be configured
|
||||
* globally before the Import button enables. Matches the server-
|
||||
* side preflight that would 412 the import anyway.
|
||||
*
|
||||
* - RECOMMENDED section (yellow, non-blocking) — listed so the user
|
||||
* can add them if they want the full experience, but the Import
|
||||
* button stays enabled regardless.
|
||||
*
|
||||
* Saving goes to the GLOBAL secrets endpoint (PUT /settings/secrets)
|
||||
* because org-level templates deploy shared resources. Per-workspace
|
||||
* overrides still work via the Config tab on an individual node
|
||||
* after import. The modal does NOT enable Import the moment a key is
|
||||
* typed — only after it saves successfully (so a half-entered token
|
||||
* can't proceed and then fail at container-start time instead).
|
||||
*/
|
||||
export function OrgImportPreflightModal({
  open,
  orgName,
  workspaceCount,
  requiredEnv,
  recommendedEnv,
  configuredKeys,
  onSecretSaved,
  onProceed,
  onCancel,
}: Props) {
  // Per-env-var input state, keyed by env var name. Seeded lazily when
  // the modal opens (see effect below).
  const [drafts, setDrafts] = useState<Record<string, DraftEntry>>({});

  // Flatten the union-shaped requirement lists to the set of every key
  // that could ever appear as an input row. Used purely to seed the
  // drafts map — satisfaction semantics still read from the grouped
  // EnvRequirement entries (a group can be satisfied by any one
  // member).
  const allMemberKeys = useMemo(() => {
    const keys: string[] = [];
    for (const r of requiredEnv) keys.push(...envReqMembers(r));
    for (const r of recommendedEnv) keys.push(...envReqMembers(r));
    return keys;
  }, [requiredEnv, recommendedEnv]);

  // Seed a draft entry per declared key the first time the modal
  // opens. Entries persist across `configuredKeys` changes so a mid-
  // save recheck doesn't wipe what the user typed.
  //
  // Dep: derive a STABLE string from the env-name lists rather than
  // the array refs themselves. The parent computes
  // `preflight.org.required_env ?? []`, which produces a fresh []
  // identity on every re-render (e.g. when refreshConfiguredKeys
  // bumps state); depending on the array refs would re-fire the
  // effect on every parent render and mask any future edit that
  // drops the `if (!next[k])` guard as a silent input-reset bug.
  const envKeysSignature = useMemo(
    () => [...allMemberKeys].sort().join("|"),
    [allMemberKeys],
  );
  useEffect(() => {
    if (!open) return;
    setDrafts((prev) => {
      const next = { ...prev };
      for (const k of allMemberKeys) {
        if (!next[k]) {
          next[k] = { key: k, value: "", saving: false, error: null };
        }
      }
      return next;
    });
    // eslint-disable-next-line react-hooks/exhaustive-deps
  }, [open, envKeysSignature]);

  // Requirements not yet covered by the globally configured keys.
  const missingRequired = useMemo(
    () => requiredEnv.filter((r) => !envReqSatisfied(r, configuredKeys)),
    [requiredEnv, configuredKeys],
  );
  const missingRecommended = useMemo(
    () => recommendedEnv.filter((r) => !envReqSatisfied(r, configuredKeys)),
    [recommendedEnv, configuredKeys],
  );
  // Import is gated only on the required tier; recommended never blocks.
  const canProceed = missingRequired.length === 0;

  // Synchronous in-flight gate. A ref (not state) so two clicks
  // dispatched in the SAME microtask both see the gate flip — state
  // commits don't help here because setState is async. The previous
  // closure-based `current.saving` gate worked under React Testing
  // Library's act() flushing but failed for true microtask-level
  // double-fires (programmatic clicks, dblclick events, Enter-spam
  // before React commits). Set is keyed by env var name so different
  // rows can save concurrently.
  const inFlightRef = useRef<Set<string>>(new Set());

  // Latest-drafts ref so saveOne can read the current input value
  // without taking `drafts` as a useCallback dep — that dep would
  // re-create saveOne on every keystroke and re-bind every Save
  // button's onClick handler, churn that scales with row count.
  const draftsRef = useRef(drafts);
  useEffect(() => {
    draftsRef.current = drafts;
  }, [drafts]);

  // Save one env var as a GLOBAL secret, then notify the parent so it
  // can refresh `configuredKeys`. No-ops on an empty/whitespace value
  // or when a save for the same key is already in flight.
  const saveOne = useCallback(
    async (key: string) => {
      // Microtask-safe gate: claim the slot synchronously BEFORE any
      // await so a second click in the same tick bounces immediately.
      if (inFlightRef.current.has(key)) return;
      const current = draftsRef.current[key];
      if (!current || !current.value.trim()) return;
      inFlightRef.current.add(key);

      const startValue = current.value;
      setDrafts((d) => ({
        ...d,
        [key]: { ...d[key], saving: true, error: null },
      }));
      try {
        await createSecret("global", key, startValue);
        // Clear the input on success so the secret isn't left visible.
        setDrafts((d) => ({
          ...d,
          [key]: { ...d[key], value: "", saving: false, error: null },
        }));
        // Let the parent refresh configuredKeys so the strike-through
        // updates and canProceed recomputes.
        onSecretSaved();
      } catch (e) {
        setDrafts((d) => ({
          ...d,
          [key]: {
            ...d[key],
            saving: false,
            error: e instanceof Error ? e.message : "Save failed",
          },
        }));
      } finally {
        inFlightRef.current.delete(key);
      }
    },
    [onSecretSaved],
  );

  if (!open) return null;

  // Portal the dialog to document.body so it escapes any ancestor
  // containing block. TemplatePalette renders this modal inside a
  // sidebar whose `fixed` container plus backdrop-filter together
  // re-anchor descendants' `position: fixed` to the sidebar's own
  // bounds instead of the viewport — the modal ends up glued to the
  // sidebar's scrollable region and only becomes visible after the
  // user scrolls the sidebar. Portal dodges that class of issue
  // once and for all, regardless of what future wrappers do.
  //
  // SSR-safe guard: `document` is undefined on the server. Since
  // the modal is gated by `if (!open) return null` above, this
  // effectively only runs after open flips true on the client.
  if (typeof document === "undefined") return null;

  return createPortal(
    <div
      role="dialog"
      aria-modal="true"
      aria-labelledby="org-preflight-title"
      className="fixed inset-0 z-50 flex items-center justify-center bg-black/70"
      onClick={onCancel}
    >
      {/* Inner card stops propagation so only backdrop clicks cancel. */}
      <div
        className="w-[560px] max-h-[80vh] overflow-auto rounded-xl bg-zinc-900 border border-zinc-700 shadow-2xl"
        onClick={(e) => e.stopPropagation()}
      >
        <header className="px-5 py-4 border-b border-zinc-800">
          <h2 id="org-preflight-title" className="text-sm font-semibold text-zinc-100">
            Deploy {orgName}
          </h2>
          <p className="mt-0.5 text-[11px] text-zinc-500">
            {workspaceCount} workspace{workspaceCount === 1 ? "" : "s"}.
            Review the credentials needed before import.
          </p>
        </header>

        <section className="p-5 space-y-5">
          {requiredEnv.length > 0 && (
            <EnvList
              tone="required"
              title="Required"
              subtitle="Import is blocked until every key below is saved globally."
              entries={requiredEnv}
              configuredKeys={configuredKeys}
              drafts={drafts}
              onChange={(key, value) =>
                setDrafts((d) => ({ ...d, [key]: { ...d[key], value } }))
              }
              onSave={saveOne}
            />
          )}
          {recommendedEnv.length > 0 && (
            <EnvList
              tone="recommended"
              title="Recommended"
              subtitle="Not required, but some features degrade without them. Add them now for the best experience."
              entries={recommendedEnv}
              configuredKeys={configuredKeys}
              drafts={drafts}
              onChange={(key, value) =>
                setDrafts((d) => ({ ...d, [key]: { ...d[key], value } }))
              }
              onSave={saveOne}
            />
          )}
          {requiredEnv.length === 0 && recommendedEnv.length === 0 && (
            <p className="text-[12px] text-zinc-400">
              No additional credentials required for this template.
            </p>
          )}
        </section>

        <footer className="px-5 py-3 border-t border-zinc-800 flex items-center justify-between">
          <button
            type="button"
            onClick={onCancel}
            className="px-3 py-1.5 text-[11px] rounded bg-zinc-800 hover:bg-zinc-700 text-zinc-300"
          >
            Cancel
          </button>
          <div className="flex items-center gap-2">
            {/* Gentle nudge, shown only once import is unblocked. */}
            {missingRecommended.length > 0 && canProceed && (
              <span className="text-[10px] text-amber-400/90">
                {missingRecommended.length} recommended key
                {missingRecommended.length === 1 ? "" : "s"} still unset
              </span>
            )}
            <button
              type="button"
              onClick={onProceed}
              disabled={!canProceed}
              className="px-4 py-1.5 text-[11px] font-semibold rounded bg-blue-600 hover:bg-blue-500 text-white disabled:bg-zinc-700 disabled:text-zinc-500 disabled:cursor-not-allowed"
            >
              Import
            </button>
          </div>
        </footer>
      </div>
    </div>,
    document.body,
  );
}
|
||||
|
||||
interface EnvListProps {
|
||||
tone: "required" | "recommended";
|
||||
title: string;
|
||||
subtitle: string;
|
||||
entries: EnvRequirement[];
|
||||
configuredKeys: Set<string>;
|
||||
drafts: Record<string, DraftEntry>;
|
||||
onChange: (key: string, value: string) => void;
|
||||
onSave: (key: string) => void;
|
||||
}
|
||||
|
||||
/**
 * Card listing one tier of env-var requirements. Strict (string)
 * entries render as a single StrictEnvRow; `{any_of}` entries render
 * as an AnyOfEnvGroup where configuring any one member satisfies the
 * requirement. Accent colors follow `tone`.
 */
function EnvList({
  tone,
  title,
  subtitle,
  entries,
  configuredKeys,
  drafts,
  onChange,
  onSave,
}: EnvListProps) {
  const isRequired = tone === "required";
  const accent = isRequired
    ? "border-red-800/60 bg-red-950/20"
    : "border-amber-800/50 bg-amber-950/15";
  const headerColor = isRequired ? "text-red-300" : "text-amber-300";

  // One row per requirement — dispatch on the entry's shape.
  const renderEntry = (entry: EnvRequirement) => {
    if (typeof entry === "string") {
      return (
        <StrictEnvRow
          key={envReqKey(entry)}
          envKey={entry}
          configured={configuredKeys.has(entry)}
          draft={drafts[entry]}
          onChange={onChange}
          onSave={onSave}
        />
      );
    }
    return (
      <AnyOfEnvGroup
        key={envReqKey(entry)}
        members={entry.any_of}
        configuredKeys={configuredKeys}
        drafts={drafts}
        onChange={onChange}
        onSave={onSave}
      />
    );
  };

  return (
    <div className={`rounded-lg border ${accent} p-3`}>
      <h3 className={`text-[11px] font-semibold uppercase tracking-wide ${headerColor}`}>
        {title}
      </h3>
      <p className="mt-0.5 mb-2 text-[10px] text-zinc-400">{subtitle}</p>
      <ul className="space-y-2">{entries.map(renderEntry)}</ul>
    </div>
  );
}
|
||||
|
||||
/** Props for StrictEnvRow — a single AND-required env key row. */
interface StrictEnvRowProps {
  /** The env var name rendered in the row. */
  envKey: string;
  /** True when the key already exists as a configured secret. */
  configured: boolean;
  /** Draft input state for this key; undefined until the user types. */
  draft: DraftEntry | undefined;
  /** Keystroke handler for the value input. */
  onChange: (key: string, value: string) => void;
  /** Persist the drafted value for this key. */
  onSave: (key: string) => void;
}
|
||||
|
||||
function StrictEnvRow({
|
||||
envKey,
|
||||
configured,
|
||||
draft: d,
|
||||
onChange,
|
||||
onSave,
|
||||
}: StrictEnvRowProps) {
|
||||
return (
|
||||
<li className="flex items-center gap-2 rounded bg-zinc-900/70 border border-zinc-800 px-2 py-1.5">
|
||||
<code
|
||||
className={`text-[11px] font-mono flex-1 ${
|
||||
configured ? "text-zinc-500 line-through" : "text-zinc-200"
|
||||
}`}
|
||||
>
|
||||
{envKey}
|
||||
</code>
|
||||
{configured ? (
|
||||
<span className="text-[10px] text-emerald-400">✓ set</span>
|
||||
) : (
|
||||
<>
|
||||
<input
|
||||
type="password"
|
||||
aria-label={`Value for ${envKey}`}
|
||||
placeholder="paste value"
|
||||
value={d?.value ?? ""}
|
||||
onChange={(e) => onChange(envKey, e.target.value)}
|
||||
onKeyDown={(e) => {
|
||||
if (e.key === "Enter") {
|
||||
e.preventDefault();
|
||||
onSave(envKey);
|
||||
}
|
||||
}}
|
||||
disabled={d?.saving}
|
||||
className="flex-1 px-2 py-1 rounded bg-zinc-800 border border-zinc-700 text-[11px] text-zinc-200 focus:outline-none focus:border-blue-500 disabled:opacity-50"
|
||||
/>
|
||||
<button
|
||||
type="button"
|
||||
onClick={() => onSave(envKey)}
|
||||
disabled={d?.saving || !d?.value.trim()}
|
||||
className="px-2 py-1 text-[10px] rounded bg-blue-600 hover:bg-blue-500 text-white disabled:opacity-40 disabled:cursor-not-allowed"
|
||||
>
|
||||
{d?.saving ? "…" : "Save"}
|
||||
</button>
|
||||
</>
|
||||
)}
|
||||
{d?.error && (
|
||||
<span className="text-[9px] text-red-400 basis-full pl-1">
|
||||
{d.error}
|
||||
</span>
|
||||
)}
|
||||
</li>
|
||||
);
|
||||
}
|
||||
|
||||
/** Props for AnyOfEnvGroup — an OR group of env keys. */
interface AnyOfEnvGroupProps {
  /** Member key names; configuring any ONE satisfies the group. */
  members: string[];
  /** Keys already saved as secrets. */
  configuredKeys: Set<string>;
  /** Per-key draft input state, shared with the parent list. */
  drafts: Record<string, DraftEntry>;
  /** Keystroke handler for a member's value input. */
  onChange: (key: string, value: string) => void;
  /** Persist a member's drafted value. */
  onSave: (key: string) => void;
}
|
||||
|
||||
/**
|
||||
* Renders an OR group: the user only needs to configure ONE of the
|
||||
* members to satisfy the requirement. Once any member is configured
|
||||
* the group shows a green banner identifying the satisfying key; the
|
||||
* other inputs remain visible but muted so the user can still switch
|
||||
* providers if they want (uncommon but cheap to support).
|
||||
*/
|
||||
function AnyOfEnvGroup({
|
||||
members,
|
||||
configuredKeys,
|
||||
drafts,
|
||||
onChange,
|
||||
onSave,
|
||||
}: AnyOfEnvGroupProps) {
|
||||
const satisfiedBy = members.find((m) => configuredKeys.has(m));
|
||||
return (
|
||||
<li className="rounded border border-zinc-800 bg-zinc-900/50 px-2.5 py-2">
|
||||
<div className="flex items-center justify-between mb-1.5">
|
||||
<span className="text-[10px] uppercase tracking-wide text-zinc-400">
|
||||
Configure any one
|
||||
</span>
|
||||
{satisfiedBy && (
|
||||
<span className="text-[10px] text-emerald-400">
|
||||
✓ using <code className="font-mono">{satisfiedBy}</code>
|
||||
</span>
|
||||
)}
|
||||
</div>
|
||||
<ul className="space-y-1.5">
|
||||
{members.map((m) => {
|
||||
const isConfigured = configuredKeys.has(m);
|
||||
const d = drafts[m];
|
||||
const dimmed = !!satisfiedBy && !isConfigured;
|
||||
return (
|
||||
<li
|
||||
key={m}
|
||||
className={`flex items-center gap-2 rounded bg-zinc-900/70 border border-zinc-800 px-2 py-1 ${
|
||||
dimmed ? "opacity-50" : ""
|
||||
}`}
|
||||
>
|
||||
<code
|
||||
className={`text-[11px] font-mono flex-1 ${
|
||||
isConfigured ? "text-zinc-500 line-through" : "text-zinc-200"
|
||||
}`}
|
||||
>
|
||||
{m}
|
||||
</code>
|
||||
{isConfigured ? (
|
||||
<span className="text-[10px] text-emerald-400">✓ set</span>
|
||||
) : (
|
||||
<>
|
||||
<input
|
||||
type="password"
|
||||
aria-label={`Value for ${m}`}
|
||||
placeholder="paste value"
|
||||
value={d?.value ?? ""}
|
||||
onChange={(e) => onChange(m, e.target.value)}
|
||||
onKeyDown={(e) => {
|
||||
if (e.key === "Enter") {
|
||||
e.preventDefault();
|
||||
onSave(m);
|
||||
}
|
||||
}}
|
||||
disabled={d?.saving}
|
||||
className="flex-1 px-2 py-1 rounded bg-zinc-800 border border-zinc-700 text-[11px] text-zinc-200 focus:outline-none focus:border-blue-500 disabled:opacity-50"
|
||||
/>
|
||||
<button
|
||||
type="button"
|
||||
onClick={() => onSave(m)}
|
||||
disabled={d?.saving || !d?.value.trim()}
|
||||
className="px-2 py-1 text-[10px] rounded bg-blue-600 hover:bg-blue-500 text-white disabled:opacity-40 disabled:cursor-not-allowed"
|
||||
>
|
||||
{d?.saving ? "…" : "Save"}
|
||||
</button>
|
||||
</>
|
||||
)}
|
||||
{d?.error && (
|
||||
<span className="text-[9px] text-red-400 basis-full pl-1">
|
||||
{d.error}
|
||||
</span>
|
||||
)}
|
||||
</li>
|
||||
);
|
||||
})}
|
||||
</ul>
|
||||
</li>
|
||||
);
|
||||
}
|
||||
@ -65,6 +65,12 @@ export function ProvisioningTimeout({
|
||||
// banner even if they stay in provisioning. Cleared when the
|
||||
// workspace leaves provisioning (status changes).
|
||||
const [dismissed, setDismissed] = useState<Set<string>>(new Set());
|
||||
// Watch the live WS health. While it's not "connected", local node
|
||||
// status reflects the last event we received before the drop —
|
||||
// workspaces may have actually transitioned to online minutes ago.
|
||||
// Suppress the banner until WS recovers + rehydrate confirms each
|
||||
// workspace is genuinely still provisioning.
|
||||
const wsStatus = useCanvasStore((s) => s.wsStatus);
|
||||
|
||||
// Subscribe to provisioning nodes — use shallow compare to avoid infinite re-render
|
||||
// (filter+map creates new array reference on every store update).
|
||||
@ -273,8 +279,11 @@ export function ProvisioningTimeout({
|
||||
}, []);
|
||||
|
||||
const visibleTimedOut = useMemo(
|
||||
() => timedOut.filter((e) => !dismissed.has(e.workspaceId)),
|
||||
[timedOut, dismissed],
|
||||
() =>
|
||||
wsStatus === "connected"
|
||||
? timedOut.filter((e) => !dismissed.has(e.workspaceId))
|
||||
: [],
|
||||
[timedOut, dismissed, wsStatus],
|
||||
);
|
||||
|
||||
if (visibleTimedOut.length === 0) return null;
|
||||
|
||||
@ -29,7 +29,7 @@ const TABS: { id: PanelTab; label: string; icon: string }[] = [
|
||||
{ id: "chat", label: "Chat", icon: "◈" },
|
||||
{ id: "activity", label: "Activity", icon: "⊙" },
|
||||
{ id: "details", label: "Details", icon: "◉" },
|
||||
{ id: "skills", label: "Skills", icon: "✦" },
|
||||
{ id: "skills", label: "Plugins", icon: "✦" },
|
||||
{ id: "terminal", label: "Terminal", icon: "▸" },
|
||||
{ id: "config", label: "Config", icon: "⚙" },
|
||||
{ id: "schedule", label: "Schedule", icon: "⏲" },
|
||||
@ -280,7 +280,7 @@ export function SidePanel() {
|
||||
className="flex-1 overflow-y-auto focus:outline-none"
|
||||
>
|
||||
{panelTab === "details" && <DetailsTab key={selectedNodeId} workspaceId={selectedNodeId} data={node.data} />}
|
||||
{panelTab === "skills" && <SkillsTab key={selectedNodeId} data={node.data} />}
|
||||
{panelTab === "skills" && <SkillsTab key={selectedNodeId} workspaceId={selectedNodeId} data={node.data} />}
|
||||
{panelTab === "activity" && <ActivityTab key={selectedNodeId} workspaceId={selectedNodeId} />}
|
||||
{panelTab === "chat" && <ChatTab key={selectedNodeId} workspaceId={selectedNodeId} data={node.data} />}
|
||||
{panelTab === "terminal" && <TerminalTab key={selectedNodeId} workspaceId={selectedNodeId} />}
|
||||
|
||||
@ -1,35 +1,48 @@
|
||||
"use client";
|
||||
|
||||
import { useState, useEffect, useCallback, useRef } from "react";
|
||||
import { flushSync } from "react-dom";
|
||||
import { api } from "@/lib/api";
|
||||
import { useCanvasStore } from "@/store/canvas";
|
||||
import type { WorkspaceData } from "@/store/socket";
|
||||
import { checkDeploySecrets, type PreflightResult, type ModelSpec } from "@/lib/deploy-preflight";
|
||||
import { MissingKeysModal } from "./MissingKeysModal";
|
||||
import { type Template } from "@/lib/deploy-preflight";
|
||||
import { useTemplateDeploy } from "@/hooks/useTemplateDeploy";
|
||||
import {
|
||||
OrgImportPreflightModal,
|
||||
type EnvRequirement,
|
||||
} from "./OrgImportPreflightModal";
|
||||
import { ConfirmDialog } from "./ConfirmDialog";
|
||||
import { Spinner } from "./Spinner";
|
||||
import { showToast } from "./Toaster";
|
||||
import { TIER_CONFIG } from "@/lib/design-tokens";
|
||||
import { listSecrets } from "@/lib/api/secrets";
|
||||
|
||||
interface Template {
|
||||
id: string;
|
||||
name: string;
|
||||
description: string;
|
||||
tier: number;
|
||||
runtime?: string;
|
||||
model: string;
|
||||
models?: ModelSpec[];
|
||||
/** AND-required env vars declared at runtime_config.required_env. */
|
||||
required_env?: string[];
|
||||
skills: string[];
|
||||
skill_count: number;
|
||||
}
|
||||
|
||||
// `Template` type and `resolveRuntime` helper now live in
|
||||
// `@/lib/deploy-preflight` so EmptyState can import the same ones. Was
|
||||
// redeclared here + a narrower redeclaration in EmptyState; the
|
||||
// narrower one dropped `runtime`, `models`, `required_env`, which is
|
||||
// exactly the data the preflight needs. See reviewer's "runtime
|
||||
// fallback drift" note — single source of truth closes the drift.
|
||||
export interface OrgTemplate {
|
||||
dir: string;
|
||||
name: string;
|
||||
description: string;
|
||||
workspaces: number;
|
||||
/** Env vars that MUST be set as global secrets before the org can
|
||||
* import. Server refuses the import with 412 if any are missing;
|
||||
* the canvas preflights against /secrets/list to avoid the round
|
||||
* trip. Aggregated from org-level + every workspace in the tree.
|
||||
*
|
||||
* Each entry is either a key name (strict) or an `{any_of: [...]}`
|
||||
* group (any one of the listed members satisfies the requirement —
|
||||
* e.g. `ANTHROPIC_API_KEY` OR `CLAUDE_CODE_OAUTH_TOKEN`). */
|
||||
required_env?: EnvRequirement[];
|
||||
/** "Nice-to-have" tier. Import proceeds without them but features
|
||||
* may degrade — a channel's webhook posts get dropped, a fallback
|
||||
* LLM isn't available, etc. Surfaced to the user as a non-blocking
|
||||
* warning with an "add now" affordance. Same union shape as
|
||||
* `required_env`. */
|
||||
recommended_env?: EnvRequirement[];
|
||||
}
|
||||
|
||||
/** Fetch the list of org templates from the platform. Returns [] on error
|
||||
@ -91,6 +104,14 @@ export function OrgTemplatesSection() {
|
||||
const [loading, setLoading] = useState(false);
|
||||
const [importing, setImporting] = useState<string | null>(null);
|
||||
const [error, setError] = useState<string | null>(null);
|
||||
// Preflight modal state. `preflight` is non-null when the user
|
||||
// clicked Import on an org with declared required/recommended envs
|
||||
// and we're waiting for them to confirm; null otherwise (direct
|
||||
// import path for orgs with zero env requirements).
|
||||
const [preflight, setPreflight] = useState<{
|
||||
org: OrgTemplate;
|
||||
configuredKeys: Set<string>;
|
||||
} | null>(null);
|
||||
// Collapsed by default — org templates are multi-workspace imports
|
||||
// that most new users don't reach for first. Keeping them
|
||||
// expand-on-demand frees ~400 px of vertical space for the
|
||||
@ -109,21 +130,55 @@ export function OrgTemplatesSection() {
|
||||
loadOrgs();
|
||||
}, [loadOrgs]);
|
||||
|
||||
const handleImport = async (org: OrgTemplate) => {
|
||||
/** Fetch the set of global secret KEYS that are already configured.
|
||||
* Used to strike through already-set entries in the preflight modal
|
||||
* and to decide whether the import needs the modal at all. */
|
||||
const loadConfiguredKeys = useCallback(async (): Promise<Set<string>> => {
|
||||
try {
|
||||
const secrets = await listSecrets("global");
|
||||
return new Set(secrets.map((s) => s.name));
|
||||
} catch {
|
||||
// Secrets endpoint unreachable → assume nothing configured.
|
||||
// The server will refuse the import with 412 and the user
|
||||
// retries; safer than letting the import fly blind.
|
||||
return new Set();
|
||||
}
|
||||
}, []);
|
||||
|
||||
/** Actually run the import. Split out so both the "no preflight
|
||||
* needed" fast path and the "preflight modal approved" path can
|
||||
* share the fetch + hydrate + toast sequence. */
|
||||
const doImport = useCallback(async (org: OrgTemplate) => {
|
||||
setImporting(org.dir);
|
||||
setError(null);
|
||||
try {
|
||||
await importOrgTemplate(org.dir);
|
||||
// Refresh canvas inline — the WebSocket may be offline, in which case
|
||||
// WORKSPACE_PROVISIONING broadcasts never arrive and the user sees
|
||||
// no change from clicking "Import org". A direct fetch guarantees
|
||||
// the new workspaces land on canvas regardless of WS state.
|
||||
try {
|
||||
const workspaces = await api.get<WorkspaceData[]>("/workspaces");
|
||||
useCanvasStore.getState().hydrate(workspaces);
|
||||
} catch {
|
||||
// Rehydrate failure is non-fatal; WS (if alive) or the next
|
||||
// health-check cycle will eventually pick the new workspaces up.
|
||||
// Hydrate is the safety net for the "WS is offline" case —
|
||||
// without live events the canvas stays empty. But calling it
|
||||
// immediately wipes the org-deploy animation (hydrate rebuilds
|
||||
// the node array from scratch, dropping the spawn / shimmer
|
||||
// classes and position tweens). So:
|
||||
// 1. If the number of nodes on the canvas already matches
|
||||
// (or exceeds) the template's workspace count, WS
|
||||
// delivered everything — skip hydrate.
|
||||
// 2. Otherwise, wait a short window to let any in-flight WS
|
||||
// events land, then hydrate only if still behind.
|
||||
const expectedCount = org.workspaces;
|
||||
// Nodes transition through WORKSPACE_REMOVED which physically
|
||||
// drops them from the store — there is no "removed" status in
|
||||
// WorkspaceNodeData — so a simple length check is enough here.
|
||||
const hasAll = () => useCanvasStore.getState().nodes.length >= expectedCount;
|
||||
if (!hasAll()) {
|
||||
await new Promise((r) => setTimeout(r, 1500));
|
||||
}
|
||||
if (!hasAll()) {
|
||||
try {
|
||||
const workspaces = await api.get<WorkspaceData[]>("/workspaces");
|
||||
useCanvasStore.getState().hydrate(workspaces);
|
||||
} catch {
|
||||
// WS (if alive) or the next health-check cycle will
|
||||
// eventually pick the new workspaces up.
|
||||
}
|
||||
}
|
||||
showToast(`Imported "${org.name || org.dir}" (${org.workspaces} workspaces)`, "success");
|
||||
} catch (e) {
|
||||
@ -133,7 +188,45 @@ export function OrgTemplatesSection() {
|
||||
} finally {
|
||||
setImporting(null);
|
||||
}
|
||||
};
|
||||
}, []);
|
||||
|
||||
/** Entry point for the Import button. Two paths:
|
||||
*
|
||||
* 1. No env declared by the template (required_env + recommended_env
|
||||
* both empty) → fire doImport directly. Matches the pre-preflight
|
||||
* behaviour for existing templates.
|
||||
*
|
||||
* 2. Any env declared → load the configured-keys set and open the
|
||||
* preflight modal. doImport runs only when the user clicks
|
||||
* Import inside the modal, which is gated to "required envs all
|
||||
* configured" by the modal itself. */
|
||||
const handleImport = useCallback(async (org: OrgTemplate) => {
|
||||
const hasEnvDeclarations =
|
||||
(org.required_env && org.required_env.length > 0) ||
|
||||
(org.recommended_env && org.recommended_env.length > 0);
|
||||
if (!hasEnvDeclarations) {
|
||||
void doImport(org);
|
||||
return;
|
||||
}
|
||||
// Flip the button to its "Importing…" state while the secrets
|
||||
// lookup runs — on a tenant with 500+ global secrets the round
|
||||
// trip can be > 200 ms and the user otherwise gets zero visual
|
||||
// feedback after clicking. Cleared on modal close / error.
|
||||
setImporting(org.dir);
|
||||
try {
|
||||
const configuredKeys = await loadConfiguredKeys();
|
||||
setPreflight({ org, configuredKeys });
|
||||
} finally {
|
||||
setImporting(null);
|
||||
}
|
||||
}, [doImport, loadConfiguredKeys]);
|
||||
|
||||
/** Called by the preflight modal after a successful key save so the
|
||||
* strike-through re-renders and canProceed recomputes. */
|
||||
const refreshConfiguredKeys = useCallback(async () => {
|
||||
const keys = await loadConfiguredKeys();
|
||||
setPreflight((prev) => (prev ? { ...prev, configuredKeys: keys } : prev));
|
||||
}, [loadConfiguredKeys]);
|
||||
|
||||
return (
|
||||
<div className="space-y-2" data-testid="org-templates-section">
|
||||
@ -222,6 +315,35 @@ export function OrgTemplatesSection() {
|
||||
})}
|
||||
</div>
|
||||
)}
|
||||
|
||||
{preflight && (
|
||||
<OrgImportPreflightModal
|
||||
open
|
||||
orgName={preflight.org.name || preflight.org.dir}
|
||||
workspaceCount={preflight.org.workspaces}
|
||||
requiredEnv={preflight.org.required_env ?? []}
|
||||
recommendedEnv={preflight.org.recommended_env ?? []}
|
||||
configuredKeys={preflight.configuredKeys}
|
||||
onSecretSaved={refreshConfiguredKeys}
|
||||
onProceed={() => {
|
||||
const org = preflight.org;
|
||||
// flushSync guarantees the modal unmounts BEFORE we kick
|
||||
// off the import network call. Without it, React batches
|
||||
// setPreflight(null) with the setImporting(...) from
|
||||
// doImport's synchronous prefix, both commit at the end
|
||||
// of this handler, AND the await import() POST may yield
|
||||
// a microtask before React schedules the paint. Net
|
||||
// effect: the modal backdrop sat over the canvas during
|
||||
// the first wave of WORKSPACE_PROVISIONING WS events,
|
||||
// hiding the spawn animation. Force the close to land
|
||||
// first so the user sees the canvas reveal + agents
|
||||
// popping into place.
|
||||
flushSync(() => setPreflight(null));
|
||||
void doImport(org);
|
||||
}}
|
||||
onCancel={() => setPreflight(null)}
|
||||
/>
|
||||
)}
|
||||
</div>
|
||||
);
|
||||
}
|
||||
@ -319,14 +441,6 @@ export function TemplatePalette() {
|
||||
|
||||
const [templates, setTemplates] = useState<Template[]>([]);
|
||||
const [loading, setLoading] = useState(false);
|
||||
const [creating, setCreating] = useState<string | null>(null);
|
||||
const [error, setError] = useState<string | null>(null);
|
||||
|
||||
// Missing keys modal state
|
||||
const [missingKeysInfo, setMissingKeysInfo] = useState<{
|
||||
template: Template;
|
||||
preflight: PreflightResult;
|
||||
} | null>(null);
|
||||
|
||||
const loadTemplates = useCallback(async () => {
|
||||
setLoading(true);
|
||||
@ -344,65 +458,15 @@ export function TemplatePalette() {
|
||||
if (open) loadTemplates();
|
||||
}, [open, loadTemplates]);
|
||||
|
||||
/** Resolve runtime from template ID (e.g., "langgraph", "claude-code-default" → "claude-code") */
|
||||
const resolveRuntime = (templateId: string): string => {
|
||||
const runtimeMap: Record<string, string> = {
|
||||
langgraph: "langgraph",
|
||||
"claude-code-default": "claude-code",
|
||||
openclaw: "openclaw",
|
||||
deepagents: "deepagents",
|
||||
crewai: "crewai",
|
||||
autogen: "autogen",
|
||||
};
|
||||
return runtimeMap[templateId] ?? templateId.replace(/-default$/, "");
|
||||
};
|
||||
|
||||
/** Actually execute the deploy API call */
|
||||
const executeDeploy = useCallback(async (template: Template) => {
|
||||
setCreating(template.id);
|
||||
setError(null);
|
||||
try {
|
||||
await api.post("/workspaces", {
|
||||
name: template.name,
|
||||
template: template.id,
|
||||
tier: template.tier,
|
||||
canvas: {
|
||||
x: Math.random() * 400 + 100,
|
||||
y: Math.random() * 300 + 100,
|
||||
},
|
||||
});
|
||||
setCreating(null);
|
||||
} catch (e) {
|
||||
setError(e instanceof Error ? e.message : "Failed to deploy");
|
||||
setCreating(null);
|
||||
}
|
||||
}, []);
|
||||
|
||||
/** Pre-deploy check: validate secrets before deploying */
|
||||
const handleDeploy = async (template: Template) => {
|
||||
setCreating(template.id);
|
||||
setError(null);
|
||||
|
||||
// Prefer the runtime the Go /templates endpoint returned verbatim —
|
||||
// resolveRuntime() is a legacy id→runtime fallback for installs whose
|
||||
// template summary predates the `runtime` field.
|
||||
const runtime = template.runtime ?? resolveRuntime(template.id);
|
||||
const preflight = await checkDeploySecrets({
|
||||
runtime,
|
||||
models: template.models,
|
||||
required_env: template.required_env,
|
||||
});
|
||||
|
||||
if (!preflight.ok) {
|
||||
// Missing keys — show the modal instead of deploying
|
||||
setMissingKeysInfo({ template, preflight });
|
||||
setCreating(null);
|
||||
return;
|
||||
}
|
||||
|
||||
// All keys present — deploy directly
|
||||
await executeDeploy(template);
|
||||
};
|
||||
// Preflight + POST + modal wiring moved into useTemplateDeploy so
|
||||
// this component and EmptyState use one implementation. The sidebar
|
||||
// uses the hook's default random canvas placement (no override) —
|
||||
// an already-populated canvas shouldn't have new deploys stacking on
|
||||
// a single fixed point. No post-deploy side effect either: the
|
||||
// palette is operator-triggered, so auto-selecting would yank
|
||||
// focus off whatever the user was already looking at.
|
||||
const { deploy: handleDeploy, deploying: creating, error, modal } =
|
||||
useTemplateDeploy();
|
||||
|
||||
return (
|
||||
<>
|
||||
@ -426,21 +490,9 @@ export function TemplatePalette() {
|
||||
</svg>
|
||||
</button>
|
||||
|
||||
{/* Missing Keys Modal */}
|
||||
<MissingKeysModal
|
||||
open={!!missingKeysInfo}
|
||||
missingKeys={missingKeysInfo?.preflight.missingKeys ?? []}
|
||||
providers={missingKeysInfo?.preflight.providers ?? []}
|
||||
runtime={missingKeysInfo?.preflight.runtime ?? ""}
|
||||
onKeysAdded={() => {
|
||||
if (missingKeysInfo) {
|
||||
const template = missingKeysInfo.template;
|
||||
setMissingKeysInfo(null);
|
||||
executeDeploy(template);
|
||||
}
|
||||
}}
|
||||
onCancel={() => setMissingKeysInfo(null)}
|
||||
/>
|
||||
{/* Missing-keys modal — rendered by the shared hook. Same
|
||||
instance shape used by EmptyState. */}
|
||||
{modal}
|
||||
|
||||
{/* Sidebar */}
|
||||
{open && (
|
||||
@ -483,7 +535,7 @@ export function TemplatePalette() {
|
||||
<button
|
||||
type="button"
|
||||
key={t.id}
|
||||
onClick={() => handleDeploy(t)}
|
||||
onClick={() => void handleDeploy(t)}
|
||||
disabled={isDeploying}
|
||||
className="w-full text-left bg-zinc-800/40 hover:bg-zinc-800/70 border border-zinc-700/40 hover:border-zinc-600/50 rounded-xl p-3 transition-all disabled:opacity-50 disabled:cursor-not-allowed disabled:hover:bg-zinc-800/40 disabled:hover:border-zinc-700/40 group focus:outline-none focus-visible:ring-2 focus-visible:ring-blue-500/70"
|
||||
>
|
||||
|
||||
@ -6,6 +6,8 @@ import { useCanvasStore, type WorkspaceNodeData } from "@/store/canvas";
|
||||
import { showToast } from "@/components/Toaster";
|
||||
import { Tooltip } from "@/components/Tooltip";
|
||||
import { STATUS_CONFIG, TIER_CONFIG } from "@/lib/design-tokens";
|
||||
import { useOrgDeployState } from "@/components/canvas/useOrgDeployState";
|
||||
import { OrgCancelButton } from "@/components/canvas/OrgCancelButton";
|
||||
|
||||
/** Descendant count for the "N sub" badge — children are first-class nodes
|
||||
* rendered as full cards inside this one via React Flow's native parentId,
|
||||
@ -35,6 +37,10 @@ function EjectIcon(props: React.SVGProps<SVGSVGElement>) {
|
||||
export function WorkspaceNode({ id, data }: NodeProps<Node<WorkspaceNodeData>>) {
|
||||
const statusCfg = STATUS_CONFIG[data.status] || STATUS_CONFIG.offline;
|
||||
const tierCfg = TIER_CONFIG[data.tier] || { label: `T${data.tier}`, color: "text-zinc-500 bg-zinc-800" };
|
||||
// Org-deploy context — four derived flags off one store subscription.
|
||||
// Drives the shimmer while provisioning, the dimmed/non-draggable
|
||||
// treatment on locked descendants, and the Cancel pill on the root.
|
||||
const deploy = useOrgDeployState(id);
|
||||
const selectedNodeId = useCanvasStore((s) => s.selectedNodeId);
|
||||
const selectNode = useCanvasStore((s) => s.selectNode);
|
||||
const openContextMenu = useCanvasStore((s) => s.openContextMenu);
|
||||
@ -138,8 +144,21 @@ export function WorkspaceNode({ id, data }: NodeProps<Node<WorkspaceNodeData>>)
|
||||
}
|
||||
backdrop-blur-sm
|
||||
focus:outline-none focus-visible:ring-2 focus-visible:ring-blue-500/70 focus-visible:ring-offset-1 focus-visible:ring-offset-zinc-950
|
||||
${deploy.isActivelyProvisioning ? "mol-deploy-shimmer" : ""}
|
||||
${deploy.isLockedChild ? "mol-deploy-locked" : ""}
|
||||
`}
|
||||
>
|
||||
{/* Cancel-deployment pill — rendered on the root of a deploying
|
||||
org only. Positioned absolute inside the card so it moves
|
||||
with drag; class="nodrag" on the button stops React Flow
|
||||
from treating clicks as a drag start. */}
|
||||
{deploy.isDeployingRoot && (
|
||||
<OrgCancelButton
|
||||
rootId={id}
|
||||
rootName={data.name}
|
||||
workspaceCount={deploy.descendantProvisioningCount}
|
||||
/>
|
||||
)}
|
||||
{/* Status gradient bar at top */}
|
||||
<div className={`absolute inset-x-0 top-0 h-8 bg-gradient-to-b ${statusCfg.bar} pointer-events-none`} />
|
||||
|
||||
|
||||
@ -175,9 +175,28 @@ describe("buildA2AEdges — edge properties", () => {
|
||||
expect((edge.style as React.CSSProperties).pointerEvents).toBe("none");
|
||||
});
|
||||
|
||||
it("sets pointerEvents: 'none' on labelStyle", () => {
|
||||
it("tags the edge as type=a2a so React Flow renders the custom A2AEdge component", () => {
|
||||
// The custom edge portals labels above the node layer and makes
|
||||
// them clickable. Without type=a2a, RF falls back to the default
|
||||
// edge whose label sits in the SVG group (hidden under nodes,
|
||||
// pointerEvents:none). Regression guard for the hidden-label /
|
||||
// unclickable-label bug observed 2026-04-25.
|
||||
const [edge] = buildA2AEdges([makeRow()], NOW);
|
||||
expect((edge.labelStyle as React.CSSProperties).pointerEvents).toBe("none");
|
||||
expect(edge.type).toBe("a2a");
|
||||
});
|
||||
|
||||
it("populates edge.data with the fields the custom edge component reads", () => {
|
||||
// A2AEdge reads count, lastAt, isHot, label from edge.data so the
|
||||
// shape upstream must keep emitting them. A future buildA2AEdges
|
||||
// refactor that drops any of these silently breaks the rendered
|
||||
// pill (label disappears, hot/warm color swap fails, click handler
|
||||
// can still fire but the label text vanishes).
|
||||
const [edge] = buildA2AEdges([makeRow()], NOW);
|
||||
const data = edge.data as Record<string, unknown>;
|
||||
expect(data.count).toBe(1);
|
||||
expect(typeof data.lastAt).toBe("number");
|
||||
expect(typeof data.isHot).toBe("boolean");
|
||||
expect(data.label).toMatch(/^1 call ·/);
|
||||
});
|
||||
|
||||
it("label uses singular 'call' for count === 1", () => {
|
||||
|
||||
@ -72,6 +72,7 @@ const mockStoreState = {
|
||||
selectedNodeIds: new Set<string>(),
|
||||
clearSelection: vi.fn(),
|
||||
toggleNodeSelection: vi.fn(),
|
||||
deletingIds: new Set<string>(),
|
||||
};
|
||||
|
||||
vi.mock("@/store/canvas", () => ({
|
||||
|
||||
@ -16,7 +16,9 @@ afterEach(() => {
|
||||
// ── Shared fitView spy — must be set up before vi.mock hoisting ──────────────
|
||||
const mockFitView = vi.fn();
|
||||
const mockFitBounds = vi.fn();
|
||||
const mockGetIntersectingNodes = vi.fn(() => []);
|
||||
const mockGetIntersectingNodes = vi.fn(
|
||||
(): Array<{ id: string; position: { x: number; y: number } }> => [],
|
||||
);
|
||||
|
||||
vi.mock("@xyflow/react", () => {
|
||||
const ReactFlow = ({
|
||||
@ -83,6 +85,12 @@ const mockStoreState = {
|
||||
selectedNodeIds: new Set<string>(),
|
||||
clearSelection: vi.fn(),
|
||||
toggleNodeSelection: vi.fn(),
|
||||
// Cascade-delete / deploy animation state (added in the multilevel-
|
||||
// layout-UX bundle). Canvas.tsx reads deletingIds.size to decide
|
||||
// whether to apply the "locked during delete" class on each node;
|
||||
// an empty Set mirrors the idle canvas and doesn't interact with
|
||||
// any pan/fit behaviour under test here.
|
||||
deletingIds: new Set<string>(),
|
||||
};
|
||||
|
||||
vi.mock("@/store/canvas", () => ({
|
||||
|
||||
225
canvas/src/components/__tests__/OrgImportPreflightModal.test.tsx
Normal file
225
canvas/src/components/__tests__/OrgImportPreflightModal.test.tsx
Normal file
@ -0,0 +1,225 @@
|
||||
// @vitest-environment jsdom
|
||||
import { describe, it, expect, vi, beforeEach, afterEach } from "vitest";
|
||||
import { render, screen, fireEvent, cleanup, waitFor } from "@testing-library/react";
|
||||
|
||||
// Regression tests for the OrgImportPreflightModal's save path and
|
||||
// any-of group rendering. Guards two specific bugs caught in the
|
||||
// UX A/B Lab rollout (2026-04-24):
|
||||
//
|
||||
// 1. saveOne early-returned because it tried to read a local
|
||||
// `startValue` reassigned inside a functional setDrafts
|
||||
// updater. React did not always evaluate the updater
|
||||
// synchronously, so the gate read "" and bailed while
|
||||
// `saving:true` committed at next render, wedging the
|
||||
// button on "…" without ever calling createSecret.
|
||||
//
|
||||
// 2. Double-click / Enter-spam could race past the disabled-
|
||||
// button UI gate, firing createSecret twice. The production
|
||||
// endpoint is idempotent so no data hazard, but the extra
|
||||
// PUT is wasteful and harder to reason about.
|
||||
|
||||
// Spy standing in for the real createSecret API call. The resolved
// value is irrelevant — the modal only awaits completion.
const createSecretMock = vi.fn().mockResolvedValue(undefined);

// vi.mock is hoisted above the component import below, so the modal's
// save path resolves to this spy instead of the network module. The
// indirection through a wrapper arrow keeps the mock factory free of
// a direct reference to the (hoisting-sensitive) outer const.
vi.mock("@/lib/api/secrets", () => ({
  createSecret: (...args: unknown[]) => createSecretMock(...args),
}));

import { OrgImportPreflightModal } from "../OrgImportPreflightModal";

// Reset BOTH call history and the resolved implementation per test —
// a test that installed mockImplementationOnce (e.g. the paused-save
// race test) must not leak its pending promise into the next test.
beforeEach(() => {
  createSecretMock.mockClear();
  createSecretMock.mockResolvedValue(undefined);
});

// Unmount everything rendered by the test to avoid cross-test DOM
// bleed (jsdom persists between tests in the same file).
afterEach(() => {
  cleanup();
});
|
||||
|
||||
describe("OrgImportPreflightModal — saveOne", () => {
  it("calls createSecret exactly once when Save is clicked on an any-of member", async () => {
    render(
      <OrgImportPreflightModal
        open
        orgName="UX A/B Lab"
        workspaceCount={7}
        requiredEnv={[{ any_of: ["ANTHROPIC_API_KEY", "CLAUDE_CODE_OAUTH_TOKEN"] }]}
        recommendedEnv={[]}
        configuredKeys={new Set()}
        onSecretSaved={() => {}}
        onProceed={() => {}}
        onCancel={() => {}}
      />,
    );

    // Both any-of members render their own input + Save.
    const input = screen.getByLabelText(/Value for ANTHROPIC_API_KEY/i);
    fireEvent.change(input, { target: { value: "test-secret-value" } });

    // The Save button adjacent to the changed input.
    const saveButtons = screen
      .getAllByRole("button")
      .filter((b) => b.textContent === "Save");
    // Two saves on screen (one per any-of member). First is ANTHROPIC.
    fireEvent.click(saveButtons[0]);

    // The save path is async relative to the click, so wait for the
    // call to land rather than asserting synchronously.
    await waitFor(() => {
      expect(createSecretMock).toHaveBeenCalledTimes(1);
    });
    // Pin the exact wire call: scope, key, and the typed value.
    expect(createSecretMock).toHaveBeenCalledWith(
      "global",
      "ANTHROPIC_API_KEY",
      "test-secret-value",
    );
  });

  it("synchronous double-click on Save fires createSecret exactly once", async () => {
    // Pause the first save so we can fire a second click while the
    // first is still mid-await. The two clicks happen in the SAME
    // tick — fireEvent runs synchronously through React's event
    // system — so any guard that depends on a committed setState
    // (e.g. `disabled={drafts[key].saving}` or a closure read of
    // `drafts[key].saving`) loses the race: the second click sees
    // saving=false because React hasn't committed yet. The fix is
    // a useRef-based gate that flips synchronously before any await.
    let resolveCreate!: () => void;
    createSecretMock.mockImplementationOnce(
      () => new Promise<void>((resolve) => {
        resolveCreate = resolve;
      }),
    );

    render(
      <OrgImportPreflightModal
        open
        orgName="UX A/B Lab"
        workspaceCount={7}
        requiredEnv={[{ any_of: ["ANTHROPIC_API_KEY", "CLAUDE_CODE_OAUTH_TOKEN"] }]}
        recommendedEnv={[]}
        configuredKeys={new Set()}
        onSecretSaved={() => {}}
        onProceed={() => {}}
        onCancel={() => {}}
      />,
    );

    const input = screen.getByLabelText(/Value for ANTHROPIC_API_KEY/i);
    fireEvent.change(input, { target: { value: "test-secret-value" } });

    const saveButtons = screen
      .getAllByRole("button")
      .filter((b) => b.textContent === "Save");
    // Use the raw DOM click (not fireEvent) so both invocations run
    // back-to-back before React reconciles between events. Without
    // this, RTL flushes act() between fireEvent calls and the second
    // click sees the post-commit state — defeating the race we are
    // trying to reproduce.
    const saveBtn = saveButtons[0] as HTMLButtonElement;
    saveBtn.click();
    saveBtn.click();

    // Give React a tick to process any queued state updates.
    await waitFor(() => {
      expect(createSecretMock).toHaveBeenCalledTimes(1);
    });

    // Release the in-flight save and confirm no second call arrives
    // once the first one settles.
    resolveCreate();
    await waitFor(() => {
      // Post-save count must remain at exactly one.
      expect(createSecretMock).toHaveBeenCalledTimes(1);
    });
  });

  it("does not call createSecret when value is empty", async () => {
    render(
      <OrgImportPreflightModal
        open
        orgName="UX A/B Lab"
        workspaceCount={7}
        requiredEnv={[{ any_of: ["ANTHROPIC_API_KEY", "CLAUDE_CODE_OAUTH_TOKEN"] }]}
        recommendedEnv={[]}
        configuredKeys={new Set()}
        onSecretSaved={() => {}}
        onProceed={() => {}}
        onCancel={() => {}}
      />,
    );

    // Button is disabled when value is empty — clicking a disabled
    // button still dispatches onClick in RTL (since fireEvent
    // bypasses the disabled attribute), so this asserts the code-
    // level gate catches it, not just the UI.
    const saveButtons = screen
      .getAllByRole("button")
      .filter((b) => b.textContent === "Save");
    fireEvent.click(saveButtons[0]);

    // Small async wait to let any state updates settle.
    await new Promise((r) => setTimeout(r, 50));
    expect(createSecretMock).not.toHaveBeenCalled();
  });
});
|
||||
|
||||
describe("OrgImportPreflightModal — any-of rendering", () => {
  it("renders each any-of member as a separate input row", () => {
    render(
      <OrgImportPreflightModal
        open
        orgName="UX A/B Lab"
        workspaceCount={7}
        requiredEnv={[{ any_of: ["ANTHROPIC_API_KEY", "CLAUDE_CODE_OAUTH_TOKEN"] }]}
        recommendedEnv={[]}
        configuredKeys={new Set()}
        onSecretSaved={() => {}}
        onProceed={() => {}}
        onCancel={() => {}}
      />,
    );

    // Group header plus one labelled input per any-of member.
    expect(screen.getByText("Configure any one")).toBeTruthy();
    expect(screen.getByLabelText(/Value for ANTHROPIC_API_KEY/i)).toBeTruthy();
    expect(screen.getByLabelText(/Value for CLAUDE_CODE_OAUTH_TOKEN/i)).toBeTruthy();
  });

  it("shows satisfied indicator when any member is configured, and enables Import", () => {
    render(
      <OrgImportPreflightModal
        open
        orgName="UX A/B Lab"
        workspaceCount={7}
        requiredEnv={[{ any_of: ["ANTHROPIC_API_KEY", "CLAUDE_CODE_OAUTH_TOKEN"] }]}
        recommendedEnv={[]}
        configuredKeys={new Set(["CLAUDE_CODE_OAUTH_TOKEN"])}
        onSecretSaved={() => {}}
        onProceed={() => {}}
        onCancel={() => {}}
      />,
    );

    // "✓ using CLAUDE_CODE_OAUTH_TOKEN" banner renders. Name appears
    // twice (banner + member row) so use getAllByText.
    expect(screen.getByText(/using/i)).toBeTruthy();
    expect(screen.getAllByText("CLAUDE_CODE_OAUTH_TOKEN").length).toBeGreaterThanOrEqual(1);

    // A satisfied any-of group must unblock the primary action.
    const importBtn = screen.getByRole("button", { name: /^Import$/ });
    expect(importBtn.hasAttribute("disabled")).toBe(false);
  });

  it("keeps Import disabled when no any-of member is configured", () => {
    render(
      <OrgImportPreflightModal
        open
        orgName="UX A/B Lab"
        workspaceCount={7}
        requiredEnv={[{ any_of: ["ANTHROPIC_API_KEY", "CLAUDE_CODE_OAUTH_TOKEN"] }]}
        recommendedEnv={[]}
        configuredKeys={new Set()}
        onSecretSaved={() => {}}
        onProceed={() => {}}
        onCancel={() => {}}
      />,
    );

    // Empty configuredKeys → the required group is unsatisfied, so
    // the primary action stays gated.
    const importBtn = screen.getByRole("button", { name: /^Import$/ });
    expect(importBtn.hasAttribute("disabled")).toBe(true);
  });
});
|
||||
143
canvas/src/components/__tests__/SkillsTab.install.test.tsx
Normal file
143
canvas/src/components/__tests__/SkillsTab.install.test.tsx
Normal file
@ -0,0 +1,143 @@
|
||||
// @vitest-environment jsdom
|
||||
//
|
||||
// Behavioral coverage for the install flow. Two regressions to pin
|
||||
// down:
|
||||
//
|
||||
// 1. The install POST URL has to include the workspace id. A pre-fix
|
||||
// bug routed it to /workspaces/undefined/plugins because the
|
||||
// component read `data.id`, but `WorkspaceNodeData` has no `id`
|
||||
// field — its `extends Record<string, unknown>` index signature
|
||||
// hid the bad access from TS. The component now takes
|
||||
// `workspaceId` as an explicit prop; this test asserts the URL.
|
||||
//
|
||||
// 2. The optimistic install update has to flip the registry row to
|
||||
// "Installed" without waiting for the 15s reload timer (the
|
||||
// PLUGIN_RELOAD_DELAY_MS gap). This test asserts the row's "Install"
|
||||
// button is replaced by the green "Installed" tag synchronously
|
||||
// after the POST resolves.
|
||||
import { describe, it, expect, vi, beforeEach, afterEach } from "vitest";
|
||||
import { render, screen, fireEvent, waitFor, cleanup } from "@testing-library/react";
|
||||
|
||||
// Module-level spies for the api client. Declared ahead of the
// vi.mock factories; the lazy arrows below defer the reads until
// call time (vitest hoists vi.mock factories above these consts).
const mockApiGet = vi.fn();
const mockApiPost = vi.fn();
vi.mock("@/lib/api", () => ({
  api: {
    get: (...args: unknown[]) => mockApiGet(...args),
    post: (...args: unknown[]) => mockApiPost(...args),
    // Verbs this suite never asserts on still get resolving stubs so
    // unrelated component effects don't reject during render.
    put: vi.fn().mockResolvedValue({}),
    del: vi.fn().mockResolvedValue({}),
    patch: vi.fn().mockResolvedValue({}),
  },
}));

// Minimal canvas-store double: the hook form applies the selector to
// a stub state object, and the static getState mirrors the same
// shape for non-hook call sites.
vi.mock("@/store/canvas", () => ({
  useCanvasStore: Object.assign(
    vi.fn((selector: (s: Record<string, unknown>) => unknown) =>
      selector({ setPanelTab: vi.fn() } as Record<string, unknown>),
    ),
    { getState: () => ({ setPanelTab: vi.fn() }) },
  ),
  summarizeWorkspaceCapabilities: vi.fn(() => ({ skills: [], tools: [] })),
}));

// Toasts are fire-and-forget side effects here; stub them out.
vi.mock("../Toaster", () => ({ showToast: vi.fn() }));
|
||||
|
||||
import { SkillsTab } from "../tabs/SkillsTab";
|
||||
|
||||
function makeData() {
|
||||
return {
|
||||
name: "Test WS",
|
||||
status: "online",
|
||||
tier: 1,
|
||||
agentCard: null,
|
||||
activeTasks: 0,
|
||||
collapsed: false,
|
||||
role: "agent",
|
||||
lastErrorRate: 0,
|
||||
lastSampleError: "",
|
||||
url: "http://localhost:9000",
|
||||
parentId: null,
|
||||
currentTask: "",
|
||||
runtime: "langgraph",
|
||||
needsRestart: false,
|
||||
budgetLimit: null,
|
||||
};
|
||||
}
|
||||
|
||||
// Single-entry registry fixture; the one plugin is enough to drive
// both the URL-construction and optimistic-update assertions below.
const REGISTRY = [
  {
    name: "browser-automation",
    version: "1.1.0",
    description: "Browser automation + testing",
    author: "molecule",
    tags: ["browser", "playwright"],
    skills: [],
    runtimes: ["claude-code"],
  },
];

beforeEach(() => {
  // Route-by-path matches the component's loadInstalled /
  // loadRegistry / loadSourceSchemes calls. The schemes endpoint
  // resolves with two schemes so the Install-from-source input has
  // something to render without blowing up.
  mockApiGet.mockReset();
  mockApiPost.mockReset();
  mockApiGet.mockImplementation((path: string) => {
    if (path.endsWith("/plugins") && path.startsWith("/workspaces/")) {
      return Promise.resolve([]); // installed
    }
    if (path === "/plugins") {
      return Promise.resolve(REGISTRY); // registry
    }
    if (path === "/plugins/sources") {
      return Promise.resolve({ schemes: ["github://", "local://"] });
    }
    return Promise.resolve(null);
  });
  // Default install response — the shape the optimistic-update path
  // expects from a successful POST.
  mockApiPost.mockResolvedValue({ status: "installed", plugin: "browser-automation" });
});

afterEach(() => {
  cleanup();
  vi.clearAllMocks();
});
|
||||
|
||||
// Returns the registry row's Install button. The custom-source input
|
||||
// also renders an "Install" button, so `findByRole({name: /install/})`
|
||||
// throws on multiple matches; scope by the row's plugin-name text.
|
||||
async function findRowInstallButton() {
|
||||
const nameNode = await screen.findByText("browser-automation");
|
||||
const row = nameNode.closest("div.flex.items-center.justify-between") as HTMLElement;
|
||||
if (!row) throw new Error("could not locate row container for browser-automation");
|
||||
const buttons = row.querySelectorAll("button");
|
||||
const install = Array.from(buttons).find((b) => b.textContent?.trim() === "Install");
|
||||
if (!install) throw new Error("row has no Install button (already installed?)");
|
||||
return install;
|
||||
}
|
||||
|
||||
describe("SkillsTab install flow", () => {
  it("POSTs to /workspaces/<workspaceId>/plugins (no `undefined` in URL)", async () => {
    // workspaceId is the explicit prop introduced by the fix — the
    // old code read a nonexistent `data.id` and built a URL with
    // the literal string "undefined" in it.
    render(<SkillsTab workspaceId="ws-abc-123" data={makeData() as never} />);

    fireEvent.click(await findRowInstallButton());

    await waitFor(() => expect(mockApiPost).toHaveBeenCalled());
    // Pin both the URL (contains the real workspace id) and the
    // source payload derived from the registry entry.
    expect(mockApiPost).toHaveBeenCalledWith(
      "/workspaces/ws-abc-123/plugins",
      { source: "local://browser-automation" },
    );
  });

  it("flips the registry row to 'Installed' synchronously after POST resolves (no 15s wait)", async () => {
    render(<SkillsTab workspaceId="ws-abc-123" data={makeData() as never} />);

    fireEvent.click(await findRowInstallButton());

    // The "Installed" green tag must appear without advancing the
    // reload timer — the optimistic update is the entire point of
    // this fix. If this test ever regresses to needing fake timers
    // + advanceTimersByTime, the optimistic path is broken.
    const installedTag = await screen.findByText(/^Installed$/i);
    expect(installedTag).toBeDefined();
  });
});
|
||||
@ -123,7 +123,7 @@ describe("SkillsTab — aria-label on bare source input (WCAG 1.3.1)", () => {
|
||||
});
|
||||
|
||||
it('install source input has aria-label="Install from source URL"', async () => {
|
||||
render(<SkillsTab data={makeSkillsData() as never} />);
|
||||
render(<SkillsTab workspaceId="ws-test-id" data={makeSkillsData() as never} />);
|
||||
|
||||
// The source input is inside the registry section (showRegistry=false initially).
|
||||
// Click the "+ Install Plugin" button to reveal it.
|
||||
@ -138,7 +138,7 @@ describe("SkillsTab — aria-label on bare source input (WCAG 1.3.1)", () => {
|
||||
});
|
||||
|
||||
it("install source input is a text input (not hidden)", async () => {
|
||||
render(<SkillsTab data={makeSkillsData() as never} />);
|
||||
render(<SkillsTab workspaceId="ws-test-id" data={makeSkillsData() as never} />);
|
||||
|
||||
const installBtn = screen.getByRole("button", { name: /install plugin/i });
|
||||
fireEvent.click(installBtn);
|
||||
|
||||
133
canvas/src/components/canvas/A2AEdge.tsx
Normal file
133
canvas/src/components/canvas/A2AEdge.tsx
Normal file
@ -0,0 +1,133 @@
|
||||
"use client";
|
||||
|
||||
import { memo } from "react";
|
||||
import {
|
||||
BaseEdge,
|
||||
EdgeLabelRenderer,
|
||||
getBezierPath,
|
||||
type EdgeProps,
|
||||
} from "@xyflow/react";
|
||||
import { useCanvasStore } from "@/store/canvas";
|
||||
|
||||
/**
|
||||
* Custom edge for the A2A topology overlay. Solves two problems with the
|
||||
* default React Flow edge label rendering:
|
||||
*
|
||||
* 1. **Z-order.** The default `label` prop renders inside the edge's
|
||||
* SVG group, which always sits below node DOM in React Flow. When
|
||||
* a label happened to land underneath a workspace card, it was
|
||||
* hidden. EdgeLabelRenderer mounts label content in a separate
|
||||
* portal layer that we can pin above nodes via z-index.
|
||||
*
|
||||
* 2. **Clickability.** Default labels inherit `pointerEvents: none`
|
||||
* from the SVG path so the user can drag through them. The
|
||||
* portaled label is a regular HTML element with its own pointer
|
||||
* events — we set `pointerEvents: all` only on the label pill so
|
||||
* drags on the edge line still pass through to the canvas.
|
||||
*
|
||||
* On click: selects the source workspace and switches its side panel
|
||||
* to Activity, where the user can inspect the underlying delegations.
|
||||
*/
|
||||
interface A2AEdgeData {
|
||||
count: number;
|
||||
lastAt: number;
|
||||
isHot: boolean;
|
||||
/** Pre-formatted "5 calls · 2m ago" — built upstream by buildA2AEdges
|
||||
* so the same string renders here and in any future tooltip layer. */
|
||||
label: string;
|
||||
}
|
||||
|
||||
function A2AEdgeImpl({
|
||||
id,
|
||||
source,
|
||||
sourceX,
|
||||
sourceY,
|
||||
targetX,
|
||||
targetY,
|
||||
sourcePosition,
|
||||
targetPosition,
|
||||
data,
|
||||
style = {},
|
||||
}: EdgeProps) {
|
||||
const [edgePath, labelX, labelY] = getBezierPath({
|
||||
sourceX,
|
||||
sourceY,
|
||||
sourcePosition,
|
||||
targetX,
|
||||
targetY,
|
||||
targetPosition,
|
||||
});
|
||||
|
||||
const selectNode = useCanvasStore((s) => s.selectNode);
|
||||
const setPanelTab = useCanvasStore((s) => s.setPanelTab);
|
||||
|
||||
const edgeData = (data ?? {}) as Partial<A2AEdgeData>;
|
||||
const labelText = edgeData.label ?? "";
|
||||
const isHot = edgeData.isHot ?? false;
|
||||
const count = edgeData.count ?? 0;
|
||||
|
||||
const handleClick = (e: React.MouseEvent) => {
|
||||
e.stopPropagation();
|
||||
// Select the source (the agent that initiated the delegations).
|
||||
// The user's mental model when clicking the edge is "show me the
|
||||
// calls FROM here" — that's the source's activity feed.
|
||||
//
|
||||
// Preserve the current tab when the user re-clicks the same edge
|
||||
// (or another edge whose source is already selected). Yanking
|
||||
// them back to Activity every click would surprise — they may
|
||||
// have intentionally switched to Chat / Memory while looking at
|
||||
// this peer. The first click that lands a *different* selection
|
||||
// still routes them to Activity, which is the discovery affordance.
|
||||
const alreadySelected =
|
||||
useCanvasStore.getState().selectedNodeId === source;
|
||||
selectNode(source);
|
||||
if (!alreadySelected) {
|
||||
setPanelTab("activity");
|
||||
}
|
||||
};
|
||||
|
||||
// The edge stroke color matches what buildA2AEdges sets on the SVG
|
||||
// path style. Mirror it on the badge border so the visual identity
|
||||
// (hot=violet vs warm=blue) carries to the clickable label.
|
||||
const accent = isHot ? "border-violet-500/60" : "border-blue-500/60";
|
||||
const accentText = isHot ? "text-violet-200" : "text-blue-200";
|
||||
const ariaLabel = `${count} delegation${count === 1 ? "" : "s"} from ${
|
||||
edgeData.label?.split(" · ")[1] ?? "recent"
|
||||
}. Click to inspect.`;
|
||||
|
||||
return (
|
||||
<>
|
||||
<BaseEdge id={id} path={edgePath} style={style} markerEnd="url(#a2a-arrow)" />
|
||||
{labelText && (
|
||||
<EdgeLabelRenderer>
|
||||
<div
|
||||
// The label sits in a portal at the canvas root. position:
|
||||
// absolute + the (labelX, labelY) translate places it at
|
||||
// the edge midpoint. zIndex 5 wins against React Flow's
|
||||
// node layer (default z=0) without fighting the controls
|
||||
// strip (z=10).
|
||||
style={{
|
||||
position: "absolute",
|
||||
transform: `translate(-50%, -50%) translate(${labelX}px, ${labelY}px)`,
|
||||
pointerEvents: "all",
|
||||
zIndex: 5,
|
||||
}}
|
||||
className="nodrag nopan"
|
||||
>
|
||||
<button
|
||||
type="button"
|
||||
onClick={handleClick}
|
||||
aria-label={ariaLabel}
|
||||
title="Open source workspace's activity feed"
|
||||
className={`px-2 py-0.5 rounded-full bg-zinc-900/95 border ${accent} ${accentText} text-[10px] font-medium shadow-md shadow-black/40 backdrop-blur-sm hover:bg-zinc-800 hover:border-opacity-100 transition-colors cursor-pointer`}
|
||||
>
|
||||
{labelText}
|
||||
</button>
|
||||
</div>
|
||||
</EdgeLabelRenderer>
|
||||
)}
|
||||
</>
|
||||
);
|
||||
}
|
||||
|
||||
export const A2AEdge = memo(A2AEdgeImpl);
|
||||
165
canvas/src/components/canvas/OrgCancelButton.tsx
Normal file
165
canvas/src/components/canvas/OrgCancelButton.tsx
Normal file
@ -0,0 +1,165 @@
|
||||
"use client";
|
||||
|
||||
import { useState } from "react";
|
||||
import { api } from "@/lib/api";
|
||||
import { useCanvasStore } from "@/store/canvas";
|
||||
import { showToast } from "@/components/Toaster";
|
||||
|
||||
interface Props {
|
||||
/** Root workspace of the org being deployed. The cancel action
|
||||
* cascades delete through workspace-server's existing recursive
|
||||
* delete handler, so we only need the root id. */
|
||||
rootId: string;
|
||||
rootName: string;
|
||||
/** Count rendered in the pill label; updated live as children
|
||||
* come online (the useOrgDeployState hook recomputes on every
|
||||
* status change). */
|
||||
workspaceCount: number;
|
||||
}
|
||||
|
||||
/**
|
||||
* Cancel-deployment pill attached to the root of a deploying org.
|
||||
* One click → confirm dialog → DELETE /workspaces/:rootId?confirm=true
|
||||
* which cascades through every descendant server-side.
|
||||
*
|
||||
* Rendered inside the root's WorkspaceNode card via an absolute-
|
||||
* positioned overlay so it sits visually ON the card and moves with
|
||||
* drag. `className="nodrag"` stops React Flow from interpreting
|
||||
* clicks here as the start of a drag gesture.
|
||||
*
|
||||
* Deliberately uses only `.mol-deploy-cancel*` classes for styling —
|
||||
* every color / easing comes from theme-tokens.css, so a future
|
||||
* light-theme (or tenant-branded theme) inherits automatically.
|
||||
*/
|
||||
export function OrgCancelButton({ rootId, rootName, workspaceCount }: Props) {
|
||||
const [confirming, setConfirming] = useState(false);
|
||||
const [submitting, setSubmitting] = useState(false);
|
||||
|
||||
const handleCancel = async () => {
|
||||
setSubmitting(true);
|
||||
// Populate deletingIds with the subtree so every descendant
|
||||
// (and the root) locks into the dim + non-draggable state for
|
||||
// the duration of the network round-trip + server cascade —
|
||||
// same treatment the regular delete gives. Otherwise the org
|
||||
// looks interactive for the several seconds between click and
|
||||
// the first WORKSPACE_REMOVED event.
|
||||
const preState = useCanvasStore.getState();
|
||||
const subtreeIds = new Set<string>();
|
||||
const walkStack = [rootId];
|
||||
while (walkStack.length) {
|
||||
const nid = walkStack.pop()!;
|
||||
subtreeIds.add(nid);
|
||||
for (const n of preState.nodes) {
|
||||
if (n.data.parentId === nid) walkStack.push(n.id);
|
||||
}
|
||||
}
|
||||
preState.beginDelete(subtreeIds);
|
||||
try {
|
||||
await api.del<{ status: string }>(
|
||||
`/workspaces/${rootId}?confirm=true`,
|
||||
);
|
||||
showToast(`Cancelled deployment of "${rootName}"`, "success");
|
||||
// Optimistic local removal — workspace-server broadcasts
|
||||
// WORKSPACE_REMOVED per node but the WS may lag; strip the
|
||||
// subtree now so the user sees immediate feedback. Re-read
|
||||
// the store AFTER the await: children may have landed (or
|
||||
// already been removed by WS events) during the network
|
||||
// round-trip. If the WS_REMOVED handler already dropped the
|
||||
// root during the network call, bail out — the subtree walk
|
||||
// would miss any now-orphaned descendants (handleCanvasEvent
|
||||
// reparents children of a removed node upward, so they no
|
||||
// longer share the original root's id as parentId).
|
||||
const postDeleteState = useCanvasStore.getState();
|
||||
if (!postDeleteState.nodes.some((n) => n.id === rootId)) {
|
||||
return;
|
||||
}
|
||||
const subtree = new Set<string>();
|
||||
const stack = [rootId];
|
||||
while (stack.length) {
|
||||
const id = stack.pop()!;
|
||||
subtree.add(id);
|
||||
for (const n of postDeleteState.nodes) {
|
||||
if (n.data.parentId === id) stack.push(n.id);
|
||||
}
|
||||
}
|
||||
useCanvasStore.setState({
|
||||
nodes: postDeleteState.nodes.filter((n) => !subtree.has(n.id)),
|
||||
edges: postDeleteState.edges.filter(
|
||||
(e) => !subtree.has(e.source) && !subtree.has(e.target),
|
||||
),
|
||||
});
|
||||
} catch (e) {
|
||||
// Undo the lock so the user can try again / interact with the
|
||||
// still-deploying subtree.
|
||||
useCanvasStore.getState().endDelete(subtreeIds);
|
||||
showToast(
|
||||
e instanceof Error ? `Cancel failed: ${e.message}` : "Cancel failed",
|
||||
"error",
|
||||
);
|
||||
} finally {
|
||||
// Success path's endDelete is covered implicitly — every node
|
||||
// in the subtree is stripped by the optimistic local removal
|
||||
// above, and any stragglers are removed by WORKSPACE_REMOVED
|
||||
// WS events whose handler is a no-op on already-missing ids.
|
||||
// The deletingIds set will naturally empty as endDelete runs
|
||||
// in both paths below.
|
||||
useCanvasStore.getState().endDelete(subtreeIds);
|
||||
setSubmitting(false);
|
||||
setConfirming(false);
|
||||
}
|
||||
};
|
||||
|
||||
if (confirming) {
|
||||
return (
|
||||
<div
|
||||
className="nodrag absolute -top-10 right-0 z-20 flex items-center gap-1.5 rounded-lg bg-zinc-900/95 px-2 py-1 shadow-lg border border-red-800/60"
|
||||
onClick={(e) => e.stopPropagation()}
|
||||
>
|
||||
<span className="text-[10px] text-zinc-300">
|
||||
Delete {workspaceCount} workspace{workspaceCount === 1 ? "" : "s"}?
|
||||
</span>
|
||||
<button
|
||||
type="button"
|
||||
onClick={handleCancel}
|
||||
disabled={submitting}
|
||||
className="mol-deploy-cancel px-2 py-0.5 rounded text-[10px] font-semibold"
|
||||
>
|
||||
{submitting ? "Deleting…" : "Yes"}
|
||||
</button>
|
||||
<button
|
||||
type="button"
|
||||
onClick={() => setConfirming(false)}
|
||||
disabled={submitting}
|
||||
className="px-2 py-0.5 rounded bg-zinc-700/80 hover:bg-zinc-600 text-[10px] text-zinc-200"
|
||||
>
|
||||
No
|
||||
</button>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
return (
|
||||
<button
|
||||
type="button"
|
||||
onClick={(e) => {
|
||||
// Stop the click from bubbling to React Flow (selects the
|
||||
// node) — the Cancel pill is a UI surface, not a node
|
||||
// activation.
|
||||
e.stopPropagation();
|
||||
setConfirming(true);
|
||||
}}
|
||||
className="nodrag mol-deploy-cancel mol-deploy-cancel-pulse absolute -top-7 right-1 z-20 flex items-center gap-1 rounded-full px-2.5 py-0.5 text-[10px] font-semibold shadow-md"
|
||||
aria-label={`Cancel deployment of ${rootName}`}
|
||||
>
|
||||
<svg width="10" height="10" viewBox="0 0 16 16" aria-hidden="true">
|
||||
<path
|
||||
d="M4 4l8 8M12 4l-8 8"
|
||||
stroke="currentColor"
|
||||
strokeWidth="2"
|
||||
strokeLinecap="round"
|
||||
/>
|
||||
</svg>
|
||||
<span>Cancel ({workspaceCount})</span>
|
||||
</button>
|
||||
);
|
||||
}
|
||||
@ -0,0 +1,53 @@
|
||||
import { describe, it, expect } from "vitest";
|
||||
import { shouldFitGrowing } from "../useCanvasViewport";
|
||||
|
||||
// Tests cover the auto-fit gate in isolation. The hook itself is
|
||||
// effects + refs + React Flow handles, awkward to exercise directly —
|
||||
// extracting the pure decision into shouldFitGrowing(...) lets us
|
||||
// pin down the regression-prone logic with unit tests instead.
|
||||
|
||||
describe("shouldFitGrowing", () => {
  it("fits the very first time (no prior snapshot)", () => {
    // undefined snapshot = no fit has ever run for this root.
    expect(shouldFitGrowing(["a"], undefined, null, 0)).toBe(true);
  });

  it("fits when the prior snapshot is empty", () => {
    expect(shouldFitGrowing(["a", "b"], new Set(), null, 0)).toBe(true);
  });

  it("fits when a brand-new id has been added since the last fit", () => {
    const prev = new Set(["root", "a", "b"]);
    // Growth overrides even a recent user pan (third arg unused here
    // because the new-id branch short-circuits first).
    expect(shouldFitGrowing(["root", "a", "b", "c"], prev, null, 0)).toBe(true);
  });

  it("respects user pan when the subtree hasn't grown", () => {
    const prev = new Set(["root", "a", "b"]);
    // Status update on existing node — same membership. Pan (5000)
    // happened AFTER the last fit (1000) → don't fight the user.
    expect(shouldFitGrowing(["root", "a", "b"], prev, 5_000, 1_000)).toBe(false);
  });

  it("fits when the subtree hasn't grown but the user never panned", () => {
    const prev = new Set(["root", "a", "b"]);
    expect(shouldFitGrowing(["root", "a", "b"], prev, null, 1_000)).toBe(true);
  });

  it("fits when the subtree hasn't grown and the user panned BEFORE the last fit", () => {
    const prev = new Set(["root", "a", "b"]);
    // Pan at 500 predates the fit at 1000 — the fit already
    // superseded that gesture, so it doesn't count as an override.
    expect(shouldFitGrowing(["root", "a", "b"], prev, 500, 1_000)).toBe(true);
  });

  it("forces fit on delete-then-add even when the count is unchanged", () => {
    // Subtree was [root, a, b, c, d]. Then `d` got removed and a
    // sibling `e` arrived. Same length, different membership — a
    // length-only check would skip the fit and leave `e` off-screen.
    const prev = new Set(["root", "a", "b", "c", "d"]);
    expect(
      shouldFitGrowing(["root", "a", "b", "c", "e"], prev, 5_000, 1_000),
    ).toBe(true);
  });

  it("does NOT fit on shrink-only when the user has panned (deletion alone shouldn't override exploration)", () => {
    const prev = new Set(["root", "a", "b", "c"]);
    expect(shouldFitGrowing(["root", "a", "b"], prev, 5_000, 1_000)).toBe(false);
  });
});
|
||||
@ -3,11 +3,43 @@
|
||||
import { useCallback, useEffect, useRef } from "react";
|
||||
import { useReactFlow } from "@xyflow/react";
|
||||
import { useCanvasStore } from "@/store/canvas";
|
||||
import { appendClass, removeClass } from "@/store/classNames";
|
||||
import {
|
||||
CHILD_DEFAULT_HEIGHT,
|
||||
CHILD_DEFAULT_WIDTH,
|
||||
} from "@/store/canvas-topology";
|
||||
|
||||
/**
|
||||
* Decide whether the deploy-time auto-fit should run. Pure function so
|
||||
* the gate logic is unit-testable in isolation — the surrounding
|
||||
* useEffect tangle of refs, timers, and React Flow handles is awkward
|
||||
* to exercise directly.
|
||||
*
|
||||
* Returns true when the auto-fit SHOULD fire:
|
||||
* - the subtree contains an id that wasn't in the previous snapshot
|
||||
* (a new node arrived → user has lost context, force the fit
|
||||
* through regardless of any user-pan in between), OR
|
||||
* - the user has not panned since the last successful fit (so the
|
||||
* auto-fit isn't fighting their override).
|
||||
*
|
||||
* `prevSubtreeIds === undefined` means no fit has ever run for this
|
||||
* root — treat every id as "new" and fit. `userPannedAt === null`
|
||||
* means the user has never panned at all in this session — fit.
|
||||
*/
|
||||
export function shouldFitGrowing(
|
||||
currentSubtreeIds: readonly string[],
|
||||
prevSubtreeIds: ReadonlySet<string> | undefined,
|
||||
userPannedAt: number | null,
|
||||
lastAutoFitAt: number,
|
||||
): boolean {
|
||||
if (!prevSubtreeIds || prevSubtreeIds.size === 0) return true;
|
||||
for (const id of currentSubtreeIds) {
|
||||
if (!prevSubtreeIds.has(id)) return true;
|
||||
}
|
||||
if (userPannedAt === null) return true;
|
||||
return userPannedAt <= lastAutoFitAt;
|
||||
}
|
||||
|
||||
/**
|
||||
* Wires the two canvas-wide CustomEvent listeners and the viewport
|
||||
* save/restore bookkeeping so Canvas.tsx doesn't have to.
|
||||
@ -25,17 +57,79 @@ export function useCanvasViewport() {
|
||||
const saveViewport = useCanvasStore((s) => s.saveViewport);
|
||||
const saveTimerRef = useRef<ReturnType<typeof setTimeout>>(undefined);
|
||||
const panTimerRef = useRef<ReturnType<typeof setTimeout>>(undefined);
|
||||
const autoFitTimerRef = useRef<ReturnType<typeof setTimeout>>(undefined);
|
||||
// Two distinct fit timers — DO NOT collapse to one.
|
||||
// - settleFitTimerRef: 1200ms one-shot run by the
|
||||
// "transition from any-provisioning to none" effect (the deploy
|
||||
// just finished — settle on the whole org once).
|
||||
// - trackingFitTimerRef: 500ms debounced by the per-arrival
|
||||
// molecule:fit-deploying-org event handler (track the org's
|
||||
// bounds as children land during the deploy).
|
||||
// They MUST NOT share a ref: the two effects fire interleaved
|
||||
// (every WS event during a deploy resets the tracking timer; the
|
||||
// settle timer arms the moment provisioning hits zero), and a
|
||||
// shared ref made each effect silently clearTimeout the other's
|
||||
// pending fit. Today's behavior happened to land in the right
|
||||
// order out of luck; splitting the refs makes ordering independent
|
||||
// of fire sequence.
|
||||
const settleFitTimerRef = useRef<ReturnType<typeof setTimeout>>(undefined);
|
||||
const trackingFitTimerRef = useRef<ReturnType<typeof setTimeout>>(undefined);
|
||||
// Tracks whether any workspace was provisioning on the previous
|
||||
// render so we can detect the boundary when the last one finishes
|
||||
// and auto-fit the viewport around the whole tree.
|
||||
const hadProvisioningRef = useRef(false);
|
||||
// Respect-user-pan gate for the deploy-time auto-fit. Earlier
|
||||
// revisions tried to detect user pans via `onMoveEnd`, but React
|
||||
// Flow v12 fires that callback with a truthy event at the END of
|
||||
// a programmatic fitView animation — so the first auto-fit we
|
||||
// triggered would immediately look like a user pan and block
|
||||
// every subsequent fit for the rest of the deploy, leaving the
|
||||
// viewport stuck wherever the first fit landed. Now we stamp
|
||||
// this ref ONLY on wheel / pointerdown / touchstart on the
|
||||
// React Flow pane itself (see the effect below), which are
|
||||
// unambiguous user-gesture signals.
|
||||
const userPannedAtRef = useRef<number | null>(null);
|
||||
const lastAutoFitAtRef = useRef(0);
|
||||
|
||||
useEffect(() => {
|
||||
return () => {
|
||||
clearTimeout(saveTimerRef.current);
|
||||
clearTimeout(panTimerRef.current);
|
||||
clearTimeout(autoFitTimerRef.current);
|
||||
clearTimeout(settleFitTimerRef.current);
|
||||
clearTimeout(trackingFitTimerRef.current);
|
||||
};
|
||||
}, []);
|
||||
|
||||
// User-gesture listeners for the respect-user-pan gate. Listens on
|
||||
// `document` with capture phase and filters to events whose target
|
||||
// lies inside the React Flow pane — this avoids a mount-order race
|
||||
// (`.react-flow__pane` may not exist when the hook first runs if
|
||||
// RF is behind a Suspense boundary) AND keeps clicks on the
|
||||
// toolbar / modals / side panel from stamping user-pan-intent.
|
||||
// Capture phase runs before target-phase `stopPropagation` so a
|
||||
// handler elsewhere can't swallow the signal.
|
||||
//
|
||||
// Wheel only — NOT pointerdown. A pointerdown on the pane fires for
|
||||
// ordinary clicks (deselect, click-near-a-card, modal-close-bubble)
|
||||
// as well as the start of a drag-pan. Treating every pointerdown as
|
||||
// "user wants to override auto-fit" meant a single accidental click
|
||||
// before/during an org import locked out every subsequent fit, so
|
||||
// the viewport stuck at whatever the first fit landed on while
|
||||
// children kept materialising off-screen. Wheel is the canonical
|
||||
// unambiguous gesture: scroll-to-pan and pinch-zoom both surface as
|
||||
// wheel events. Drag-pans without an accompanying wheel are rare
|
||||
// enough that letting them be overridden by a follow-up auto-fit is
|
||||
// the right tradeoff.
|
||||
useEffect(() => {
|
||||
if (typeof window === "undefined") return;
|
||||
const stamp = (e: Event) => {
|
||||
const target = e.target as HTMLElement | null;
|
||||
if (!target?.closest?.(".react-flow__pane")) return;
|
||||
userPannedAtRef.current = Date.now();
|
||||
};
|
||||
const opts: AddEventListenerOptions = { passive: true, capture: true };
|
||||
document.addEventListener("wheel", stamp, opts);
|
||||
return () => {
|
||||
document.removeEventListener("wheel", stamp, opts);
|
||||
};
|
||||
}, []);
|
||||
|
||||
@ -55,20 +149,64 @@ export function useCanvasViewport() {
|
||||
hadProvisioningRef.current = hasProvisioning;
|
||||
|
||||
if (wasProvisioning && !hasProvisioning && nodeCount > 0) {
|
||||
clearTimeout(autoFitTimerRef.current);
|
||||
// Root-complete moment — every root that has children just
|
||||
// finished deploying. Pop + glow once (mol-deploy-root-complete)
|
||||
// then auto-fit the viewport around the whole org. Leaf-only
|
||||
// roots (single workspaces with no children) are skipped so the
|
||||
// effect reads as "your org landed" not "random card flickered".
|
||||
const state = useCanvasStore.getState();
|
||||
const rootsWithChildren = new Set<string>();
|
||||
for (const n of state.nodes) {
|
||||
if (n.data.parentId) continue;
|
||||
if (state.nodes.some((c) => c.data.parentId === n.id)) {
|
||||
rootsWithChildren.add(n.id);
|
||||
}
|
||||
}
|
||||
if (rootsWithChildren.size > 0) {
|
||||
useCanvasStore.setState({
|
||||
nodes: state.nodes.map((n) =>
|
||||
rootsWithChildren.has(n.id)
|
||||
? { ...n, className: appendClass(n.className, "mol-deploy-root-complete") }
|
||||
: n,
|
||||
),
|
||||
});
|
||||
// Strip the one-shot class after the keyframe ends so a later
|
||||
// deploy on the same node can fire it again.
|
||||
window.setTimeout(() => {
|
||||
const s = useCanvasStore.getState();
|
||||
useCanvasStore.setState({
|
||||
nodes: s.nodes.map((n) =>
|
||||
rootsWithChildren.has(n.id)
|
||||
? { ...n, className: removeClass(n.className, "mol-deploy-root-complete") }
|
||||
: n,
|
||||
),
|
||||
});
|
||||
}, 800);
|
||||
}
|
||||
|
||||
clearTimeout(settleFitTimerRef.current);
|
||||
// 1200ms settle delay: lets React Flow's DOM measurement pass
|
||||
// resize newly-online parents before we compute bounds.
|
||||
// Measuring too early gives us the pre-render skeleton bbox and
|
||||
// fitView zooms to that smaller-than-real rectangle.
|
||||
autoFitTimerRef.current = setTimeout(() => {
|
||||
settleFitTimerRef.current = setTimeout(() => {
|
||||
fitView({
|
||||
// Deliberately SLOWER than the in-flight tracking fits
|
||||
// (400ms). The asymmetry reads as "settling" on the
|
||||
// finished org rather than "tracking" another arrival,
|
||||
// which is the intended UX for the "deploy done" moment.
|
||||
// Don't normalize these two durations to the same value.
|
||||
duration: 1200,
|
||||
padding: 0.25,
|
||||
// Match the deploy-time fit padding (0.45) so end-state
|
||||
// and in-flight state use the same framing — otherwise
|
||||
// the final zoom-out "jumps" relative to the intermediate
|
||||
// fits and looks like a mis-layout.
|
||||
padding: 0.45,
|
||||
// Cap zoom-in: a small tree (2-3 nodes) would otherwise end
|
||||
// up at the 2x maxZoom, visually implying "something is
|
||||
// wrong". 0.8 reads like "here's your whole org" even when
|
||||
// the tree is small.
|
||||
maxZoom: 0.8,
|
||||
// wrong". 0.65 reads like "here's your whole org" even when
|
||||
// the tree is small — matches deploy-time cap.
|
||||
maxZoom: 0.65,
|
||||
// Cap zoom-out: fitView would fall back to the component's
|
||||
// minZoom=0.1 on a sparse/outlier layout, leaving the user
|
||||
// staring at a postage-stamp canvas. 0.25 is the floor.
|
||||
@ -92,6 +230,115 @@ export function useCanvasViewport() {
|
||||
return () => window.removeEventListener("molecule:pan-to-node", handler);
|
||||
}, [fitView]);
|
||||
|
||||
// Auto pan+zoom to the whole deploying org after each child
|
||||
// arrival — DEBOUNCED. Firing fitView on every event with a
|
||||
// 600ms animation meant rapid sibling arrivals (server paces 2s
|
||||
// apart, HMR bursts can land faster) made the viewport lurch
|
||||
// continuously, which the user read as "parent flashing around".
|
||||
// We now wait until the arrivals GO QUIET for 500ms, then run
|
||||
// exactly one fit. The rootId we captured on the most recent
|
||||
// event drives the fit bounds. Respect-user-pan still short-
|
||||
// circuits: if the user moved after our last auto-fit, we never
|
||||
// fit again this deploy.
|
||||
const pendingFitRootRef = useRef<string | null>(null);
|
||||
// Membership snapshot of the subtree at the moment of the last
|
||||
// successful auto-fit, keyed by root id. When a new event arrives,
|
||||
// we compute growth as "any id in the current subtree that wasn't
|
||||
// in the snapshot". An id-set rather than just a count handles the
|
||||
// delete-then-add case correctly: subtree of 6 → delete one → 5 →
|
||||
// a different child arrives → 6 again. A length-only comparison
|
||||
// would call this "no growth" and skip the fit even though a
|
||||
// brand-new node landed off-screen. The id-set sees the new id
|
||||
// wasn't in the snapshot and forces the fit.
|
||||
//
|
||||
// Map is keyed by root id and never pruned. Acceptable today because
|
||||
// org roots are UUIDs (no collisions on retry / template re-import),
|
||||
// canvas sessions are per-tab, and entries are tiny. Worth a sweep
|
||||
// if long-lived sessions ever start importing hundreds of orgs.
|
||||
const lastFitSubtreeIdsRef = useRef<Map<string, Set<string>>>(new Map());
|
||||
useEffect(() => {
|
||||
const runFit = () => {
|
||||
const rootCandidate = pendingFitRootRef.current;
|
||||
pendingFitRootRef.current = null;
|
||||
if (!rootCandidate) return;
|
||||
const state = useCanvasStore.getState();
|
||||
// Climb to the true root — the event's rootId is the just-
|
||||
// landed child's direct parent, which may itself be nested.
|
||||
let topId = rootCandidate;
|
||||
let cursor = state.nodes.find((n) => n.id === topId);
|
||||
while (cursor?.data.parentId) {
|
||||
const up = state.nodes.find((n) => n.id === cursor!.data.parentId);
|
||||
if (!up) break;
|
||||
cursor = up;
|
||||
topId = up.id;
|
||||
}
|
||||
const subtree: string[] = [];
|
||||
const stack = [topId];
|
||||
while (stack.length) {
|
||||
const id = stack.pop()!;
|
||||
subtree.push(id);
|
||||
for (const n of state.nodes) {
|
||||
if (n.data.parentId === id) stack.push(n.id);
|
||||
}
|
||||
}
|
||||
if (subtree.length === 0) return;
|
||||
|
||||
// Growth check: did any id in the current subtree NOT appear
|
||||
// in the snapshot from the last fit? If yes, fit through
|
||||
// regardless of the user-pan timestamp — the user has lost
|
||||
// context, the new arrival is off-screen, and the deploy is
|
||||
// the primary thing they want to watch. If no, fall back to
|
||||
// the user-pan respect gate so post-deploy exploration isn't
|
||||
// yanked back.
|
||||
if (!shouldFitGrowing(
|
||||
subtree,
|
||||
lastFitSubtreeIdsRef.current.get(topId),
|
||||
userPannedAtRef.current,
|
||||
lastAutoFitAtRef.current,
|
||||
)) {
|
||||
return;
|
||||
}
|
||||
fitView({
|
||||
nodes: subtree.map((id) => ({ id })),
|
||||
// Short animation — server paces children ~2s apart, so a
|
||||
// 400ms fit animation reads as "smoothly tracked" rather
|
||||
// than "constantly lurching". Longer durations (the earlier
|
||||
// 600ms) start to overlap if the user re-triggers deploys.
|
||||
duration: 400,
|
||||
// Generous padding so the right-hand Communications panel,
|
||||
// bottom-left Legend, and bottom-right "New Workspace"
|
||||
// button don't cover the outer cards. React Flow padding
|
||||
// is a fraction of viewport dims, so 0.45 ≈ ~430px of
|
||||
// margin on a 960-wide canvas — enough clearance for the
|
||||
// two side panels (~300px + ~280px).
|
||||
padding: 0.45,
|
||||
// Lower maxZoom so small orgs (2-3 cards) still zoom out
|
||||
// enough to show the parent frame + children clearly with
|
||||
// the padded margins. 0.65 reads as "here's the whole org"
|
||||
// without getting dragged to the maxZoom by fitView's
|
||||
// "fill the viewport" default.
|
||||
maxZoom: 0.65,
|
||||
minZoom: 0.25,
|
||||
});
|
||||
lastAutoFitAtRef.current = Date.now();
|
||||
lastFitSubtreeIdsRef.current.set(topId, new Set(subtree));
|
||||
};
|
||||
const handler = (e: Event) => {
|
||||
const { rootId } = (e as CustomEvent<{ rootId: string }>).detail;
|
||||
// Keep the most recently-requested root. Back-to-back imports
|
||||
// on two different orgs (rare — user would have to click
|
||||
// Import twice within 500ms) "later wins" the viewport rather
|
||||
// than ping-ponging between them. If this becomes a real
|
||||
// pattern we'd flush the pending fit synchronously when
|
||||
// `rootId` changes, rather than resetting the timer.
|
||||
pendingFitRootRef.current = rootId;
|
||||
clearTimeout(trackingFitTimerRef.current);
|
||||
trackingFitTimerRef.current = setTimeout(runFit, 500);
|
||||
};
|
||||
window.addEventListener("molecule:fit-deploying-org", handler);
|
||||
return () => window.removeEventListener("molecule:fit-deploying-org", handler);
|
||||
}, [fitView]);
|
||||
|
||||
// Zoom to a team: fit the parent + its direct children in view.
|
||||
useEffect(() => {
|
||||
const handler = (e: Event) => {
|
||||
@ -129,6 +376,11 @@ export function useCanvasViewport() {
|
||||
|
||||
const onMoveEnd = useCallback(
|
||||
(_event: unknown, vp: { x: number; y: number; zoom: number }) => {
|
||||
// User-pan detection moved to the wheel/pointerdown listener
|
||||
// above — onMoveEnd fires for programmatic fitView too, which
|
||||
// made this callback an unreliable source for user-intent
|
||||
// tracking. This now only handles the debounced viewport
|
||||
// save so a reload lands the user back where they were.
|
||||
clearTimeout(saveTimerRef.current);
|
||||
saveTimerRef.current = setTimeout(() => {
|
||||
saveViewport(vp.x, vp.y, vp.zoom);
|
||||
|
||||
@ -113,6 +113,18 @@ export function useDragHandlers(): DragHandlers {
|
||||
|
||||
const onNodeDragStart: OnNodeDrag<WorkspaceNode> = useCallback(
|
||||
(event, node) => {
|
||||
// Belt-and-braces drag-lock: the primary mechanism is the
|
||||
// `draggable: false` projection in Canvas.tsx — React Flow
|
||||
// won't invoke this callback for locked nodes. But a future
|
||||
// change to the projection that forgets a locked subtree
|
||||
// would silently allow dragging, and locked drags mid-deploy
|
||||
// corrupt the spawn animation. Fall through to a state-based
|
||||
// check here so the invariant stays enforced in both places.
|
||||
if (node.draggable === false) {
|
||||
dragStartStateRef.current = null;
|
||||
return;
|
||||
}
|
||||
|
||||
dragModifiersRef.current = {
|
||||
alt: event.altKey,
|
||||
meta: event.metaKey || event.ctrlKey,
|
||||
|
||||
152
canvas/src/components/canvas/useOrgDeployState.ts
Normal file
152
canvas/src/components/canvas/useOrgDeployState.ts
Normal file
@ -0,0 +1,152 @@
|
||||
"use client";
|
||||
|
||||
import { useMemo } from "react";
|
||||
import { useCanvasStore } from "@/store/canvas";
|
||||
|
||||
/**
|
||||
* Org-deploy state for a single workspace node. Computed from the
|
||||
* current canvas store snapshot — no per-org status field on the
|
||||
* backend is required (a root "is deploying" iff any descendant in
|
||||
* its subtree still reports status === "provisioning").
|
||||
*
|
||||
* Performance note: the first version of this hook walked the entire
|
||||
* nodes array per node render — O(n²) for a 50-node org. The current
|
||||
* implementation computes ONE map of derived state for the whole
|
||||
* canvas per nodes-array change, then each call site looks up its
|
||||
 * own id. The map is built inside useMemo keyed on the store's
 * `nodes` array reference and `deletingIds`, so it re-runs whenever
 * either changes — see useDeployMap below for why a narrower
 * useShallow projection was deliberately reverted.
|
||||
*/
|
||||
export interface OrgDeployState {
  // This node itself currently reports status === "provisioning".
  isActivelyProvisioning: boolean;
  // This node is a subtree root and its subtree (itself included)
  // still contains at least one provisioning member.
  isDeployingRoot: boolean;
  // Interaction lock: the node is being deleted, or it is a non-root
  // member of a subtree that is still deploying.
  isLockedChild: boolean;
  // Number of provisioning members in the subtree; reported on roots
  // only (always 0 for non-roots).
  descendantProvisioningCount: number;
}
|
||||
|
||||
// All-flags-off fallback returned by useOrgDeployState when a node id
// is absent from the derived map.
const EMPTY: OrgDeployState = {
  isActivelyProvisioning: false,
  isDeployingRoot: false,
  isLockedChild: false,
  descendantProvisioningCount: 0,
};
|
||||
|
||||
/** Projection used to drive the deploy-state computation. Shallow-
|
||||
* compared so re-renders only happen when one of these fields
|
||||
* actually changes across any node. */
|
||||
interface NodeProjection {
  // Canvas node id.
  id: string;
  // Parent node id; a falsy value marks a subtree root (see the
  // `if (p.parentId)` grouping in buildDeployMap).
  parentId: string | null;
  // Workspace status string; "provisioning" is the only value this
  // module keys on.
  status: string;
}
|
||||
|
||||
function buildDeployMap(
|
||||
projections: NodeProjection[],
|
||||
deletingIds: ReadonlySet<string>,
|
||||
): Map<string, OrgDeployState> {
|
||||
const byId = new Map<string, NodeProjection>();
|
||||
const childrenBy = new Map<string, string[]>();
|
||||
for (const p of projections) {
|
||||
byId.set(p.id, p);
|
||||
if (p.parentId) {
|
||||
const arr = childrenBy.get(p.parentId) ?? [];
|
||||
arr.push(p.id);
|
||||
childrenBy.set(p.parentId, arr);
|
||||
}
|
||||
}
|
||||
|
||||
// Walk once from each node up to its root, memoising the root id.
|
||||
// `rootOf.get(id)` short-circuits further walks on the same chain.
|
||||
const rootOf = new Map<string, string>();
|
||||
const findRoot = (id: string): string => {
|
||||
const cached = rootOf.get(id);
|
||||
if (cached) return cached;
|
||||
let cursor: NodeProjection | undefined = byId.get(id);
|
||||
let rootId = id;
|
||||
while (cursor && cursor.parentId) {
|
||||
const parent = byId.get(cursor.parentId);
|
||||
if (!parent) break;
|
||||
cursor = parent;
|
||||
rootId = parent.id;
|
||||
const alreadyKnown = rootOf.get(rootId);
|
||||
if (alreadyKnown) {
|
||||
rootId = alreadyKnown;
|
||||
break;
|
||||
}
|
||||
}
|
||||
rootOf.set(id, rootId);
|
||||
return rootId;
|
||||
};
|
||||
|
||||
// Count provisioning descendants per node. Also walk once per root
|
||||
// using an iterative DFS so we don't stack-overflow on deep trees.
|
||||
const countProvisioning = (rootId: string): number => {
|
||||
let count = 0;
|
||||
const stack = [rootId];
|
||||
while (stack.length) {
|
||||
const id = stack.pop()!;
|
||||
const node = byId.get(id);
|
||||
if (!node) continue;
|
||||
if (node.status === "provisioning") count++;
|
||||
const kids = childrenBy.get(id);
|
||||
if (kids) stack.push(...kids);
|
||||
}
|
||||
return count;
|
||||
};
|
||||
|
||||
// Per-root cache of subtree count so every descendant resolves in O(1).
|
||||
const rootCount = new Map<string, number>();
|
||||
|
||||
const out = new Map<string, OrgDeployState>();
|
||||
for (const p of projections) {
|
||||
const rootId = findRoot(p.id);
|
||||
let provCount = rootCount.get(rootId);
|
||||
if (provCount === undefined) {
|
||||
provCount = countProvisioning(rootId);
|
||||
rootCount.set(rootId, provCount);
|
||||
}
|
||||
const rootIsDeploying = provCount > 0;
|
||||
// A node being deleted gets the same visual + interaction lock
|
||||
// as a deploying child. "The system owns this node right now,
|
||||
// don't touch it" is the shared semantic — the user only cares
|
||||
// that the card is dim and won't drag; they don't need to know
|
||||
// whether it's coming up or going down.
|
||||
const deleting = deletingIds.has(p.id);
|
||||
out.set(p.id, {
|
||||
isActivelyProvisioning: p.status === "provisioning",
|
||||
isDeployingRoot: p.id === rootId && rootIsDeploying,
|
||||
isLockedChild: deleting || (p.id !== rootId && rootIsDeploying),
|
||||
descendantProvisioningCount:
|
||||
p.id === rootId ? provCount : 0, // only roots display the count
|
||||
});
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
/** Store-wide derived map. Recomputed whenever the `nodes` array
|
||||
* reference changes — which is on every store mutation that touches
|
||||
* nodes, including pure position tweens. The map build is O(n) so
|
||||
* a 50-node canvas costs ~50μs per tween frame; that's cheap enough
|
||||
* to not need a projection layer. (An earlier attempt to narrow the
|
||||
* subscription via `useShallow((s) => s.nodes.map(...))` triggered
|
||||
* React 18's "getSnapshot should be cached" loop because the
|
||||
* projection creates fresh object references each call — shallow
|
||||
* equality always sees "changed", which re-renders, which re-runs
|
||||
* the selector, ad infinitum.) */
|
||||
function useDeployMap(): Map<string, OrgDeployState> {
|
||||
const nodes = useCanvasStore((s) => s.nodes);
|
||||
const deletingIds = useCanvasStore((s) => s.deletingIds);
|
||||
return useMemo(() => {
|
||||
const projections = nodes.map((n) => ({
|
||||
id: n.id,
|
||||
parentId: n.data.parentId,
|
||||
status: n.data.status,
|
||||
}));
|
||||
return buildDeployMap(projections, deletingIds);
|
||||
}, [nodes, deletingIds]);
|
||||
}
|
||||
|
||||
export function useOrgDeployState(nodeId: string): OrgDeployState {
|
||||
const map = useDeployMap();
|
||||
return map.get(nodeId) ?? EMPTY;
|
||||
}
|
||||
@ -5,6 +5,7 @@ import { api } from "@/lib/api";
|
||||
import { ConversationTraceModal } from "@/components/ConversationTraceModal";
|
||||
import { type ActivityEntry } from "@/types/activity";
|
||||
import { useWorkspaceName } from "@/hooks/useWorkspaceName";
|
||||
import { inferA2AErrorHint } from "./chat/a2aErrorHint";
|
||||
|
||||
interface Props {
|
||||
workspaceId: string;
|
||||
@ -286,6 +287,26 @@ function ActivityRow({
|
||||
);
|
||||
}
|
||||
|
||||
const A2A_ERROR_PREFIX = "[A2A_ERROR]";
|
||||
|
||||
/** Render a [A2A_ERROR]-prefixed response as a structured error block
|
||||
* with a stripped detail line + a cause hint. The previous raw render
|
||||
* ("[A2A_ERROR] " literal in the response area) gave the user no
|
||||
* signal to act on. */
|
||||
function A2AErrorPreview({ label, raw }: { label: string; raw: string }) {
  // Strip the "[A2A_ERROR]" marker; keep a placeholder so an empty
  // detail still renders something actionable.
  const detail = raw.slice(A2A_ERROR_PREFIX.length).trim() || "(no detail provided)";
  // Cause hint derived from the detail text (see ./chat/a2aErrorHint).
  const hint = inferA2AErrorHint(detail);
  return (
    <div>
      <div className="text-[8px] text-red-400/80 uppercase tracking-wider mb-1">{label} — delivery failed</div>
      <div className="text-[10px] text-red-300 bg-red-950/30 border border-red-800/40 rounded p-2 space-y-1.5">
        <div className="font-mono whitespace-pre-wrap break-words max-h-32 overflow-y-auto">{detail}</div>
        <div className="text-[9px] text-red-300/70 leading-relaxed border-t border-red-800/30 pt-1.5">{hint}</div>
      </div>
    </div>
  );
}
|
||||
|
||||
/** Extract human-readable text from A2A request/response JSON */
|
||||
function MessagePreview({ label, body }: { label: string; body: Record<string, unknown> }) {
|
||||
// Try to extract text from A2A message parts
|
||||
@ -295,6 +316,14 @@ function MessagePreview({ label, body }: { label: string; body: Record<string, u
|
||||
if (body.task && typeof body.task === "string") { text = body.task; }
|
||||
if (!text && body.result && typeof body.result === "string") { text = body.result; }
|
||||
if (text) {
|
||||
// [A2A_ERROR]-prefixed responses get the structured error
|
||||
// treatment. Bare text fallthrough renders a bland gray block
|
||||
// — fine for normal replies, terrible for "[A2A_ERROR] " with
|
||||
// no further context. Detect at the top of the rendering path
|
||||
// so it short-circuits before the generic preview kicks in.
|
||||
if (text.trimStart().startsWith(A2A_ERROR_PREFIX)) {
|
||||
return <A2AErrorPreview label={label} raw={text.trimStart()} />;
|
||||
}
|
||||
return (
|
||||
<div>
|
||||
<div className="text-[8px] text-zinc-500 uppercase tracking-wider mb-1">{label}</div>
|
||||
|
||||
@ -7,9 +7,12 @@ import { api } from "@/lib/api";
|
||||
import { useCanvasStore, type WorkspaceNodeData } from "@/store/canvas";
|
||||
import { WS_URL } from "@/store/socket";
|
||||
import { closeWebSocketGracefully } from "@/lib/ws-close";
|
||||
import { type ChatMessage, createMessage, appendMessageDeduped } from "./chat/types";
|
||||
import { extractResponseText, extractRequestText } from "./chat/message-parser";
|
||||
import { type ChatMessage, type ChatAttachment, createMessage, appendMessageDeduped } from "./chat/types";
|
||||
import { uploadChatFiles, downloadChatFile } from "./chat/uploads";
|
||||
import { AttachmentChip, PendingAttachmentPill } from "./chat/AttachmentViews";
|
||||
import { extractResponseText, extractRequestText, extractFilesFromTask } from "./chat/message-parser";
|
||||
import { AgentCommsPanel } from "./chat/AgentCommsPanel";
|
||||
import { appendActivityLine } from "./chat/activityLog";
|
||||
import { runtimeDisplayName } from "@/lib/runtime-names";
|
||||
import { ConfirmDialog } from "@/components/ConfirmDialog";
|
||||
|
||||
@ -21,10 +24,18 @@ interface Props {
|
||||
type ChatSubTab = "my-chat" | "agent-comms";
|
||||
|
||||
// A2A response shape (subset). The full schema is in @a2a-js/sdk but we only
|
||||
// need parts/artifacts text extraction for the synchronous fallback path.
|
||||
// need parts/artifacts text + file extraction for the synchronous fallback.
|
||||
interface A2AFileRef {
  // Display/file name, when the producer supplies one.
  name?: string;
  mimeType?: string;
  // Remote reference to the file content.
  uri?: string;
  // Inline content; presumably base64-encoded per A2A convention —
  // TODO confirm against the @a2a-js/sdk schema.
  bytes?: string;
  size?: number;
}
|
||||
interface A2APart {
|
||||
kind: string;
|
||||
text: string;
|
||||
text?: string;
|
||||
file?: A2AFileRef;
|
||||
}
|
||||
interface A2AResponse {
|
||||
result?: {
|
||||
@ -33,25 +44,81 @@ interface A2AResponse {
|
||||
};
|
||||
}
|
||||
|
||||
/** Detect activity-log rows that the workspace's own runtime fired
|
||||
* against itself but were misclassified as canvas-source. The proper
|
||||
* fix is the X-Workspace-ID header from `self_source_headers()` in
|
||||
* workspace/platform_auth.py, which makes the platform record
|
||||
* source_id = workspace_id. But three failure modes still leak a
|
||||
* self-message into "My Chat":
|
||||
*
|
||||
* 1. Historical rows already in the DB with source_id=NULL.
|
||||
* 2. Workspace containers running pre-fix heartbeat.py / main.py
|
||||
* (the fix only takes effect after an image rebuild + redeploy).
|
||||
* 3. Future internal triggers added without the helper.
|
||||
*
|
||||
* This client-side filter recognises the heartbeat trigger by its
|
||||
* exact prefix — the heartbeat assembles
|
||||
*
|
||||
* "Delegation results are ready. Review them and take appropriate
|
||||
* action:\n" + summary_lines + report_instruction
|
||||
*
|
||||
* in workspace/heartbeat.py. The prefix is template-fixed so a
|
||||
* string match is reliable. If the heartbeat copy ever changes,
|
||||
* update this constant in the same commit.
|
||||
*
|
||||
* This is a backstop, not the primary defence — the X-Workspace-ID
|
||||
* header is. Filtering content is fragile to copy edits, so keep
|
||||
* the list narrow. */
|
||||
const INTERNAL_SELF_MESSAGE_PREFIXES = [
|
||||
"Delegation results are ready. Review them and take appropriate action",
|
||||
];
|
||||
|
||||
function isInternalSelfMessage(text: string): boolean {
|
||||
return INTERNAL_SELF_MESSAGE_PREFIXES.some((p) => text.startsWith(p));
|
||||
}
|
||||
|
||||
// extractReplyText pulls the agent's text reply out of an A2A response.
|
||||
// Mirrors the Go-side extractReplyText in workspace-server/internal/channels/manager.go.
|
||||
// Concatenates ALL text parts (joined with "\n") rather than returning
|
||||
// just the first. Claude Code and other runtimes commonly emit multi-
|
||||
// part text replies for long content (markdown tables, code blocks),
|
||||
// and the prior "first part wins" implementation silently truncated
|
||||
// the rest — observed on a 15k-char Wave 1 brief that rendered only
|
||||
// the table header. Mirrors extractTextsFromParts in message-parser.ts.
|
||||
//
|
||||
// Server-side counterpart in workspace-server/internal/channels/
|
||||
// manager.go has the same single-part bug; fix that too if/when a
|
||||
// channel-delivered reply (Slack, Lark, etc.) gets truncated.
|
||||
function extractReplyText(resp: A2AResponse): string {
|
||||
const collect = (parts: A2APart[] | undefined): string => {
|
||||
if (!parts) return "";
|
||||
return parts
|
||||
.filter((p) => p.kind === "text")
|
||||
.map((p) => p.text ?? "")
|
||||
.filter(Boolean)
|
||||
.join("\n");
|
||||
};
|
||||
const result = resp?.result;
|
||||
if (result?.parts) {
|
||||
for (const p of result.parts) {
|
||||
if (p.kind === "text") return p.text;
|
||||
}
|
||||
}
|
||||
const collected: string[] = [];
|
||||
const fromParts = collect(result?.parts);
|
||||
if (fromParts) collected.push(fromParts);
|
||||
// Walk artifacts even if parts had text — some producers (Hermes
|
||||
// tool calls) emit a summary in parts AND details in artifacts.
|
||||
// Returning early on parts dropped the artifact body silently.
|
||||
if (result?.artifacts) {
|
||||
for (const a of result.artifacts) {
|
||||
for (const p of a.parts || []) {
|
||||
if (p.kind === "text") return p.text;
|
||||
}
|
||||
const t = collect(a.parts);
|
||||
if (t) collected.push(t);
|
||||
}
|
||||
}
|
||||
return "";
|
||||
return collected.join("\n");
|
||||
}
|
||||
|
||||
// Agent-returned files live on the same response shape as text —
|
||||
// delegated to extractFilesFromTask in message-parser.ts, which also
|
||||
// walks status.message.parts (that ChatTab's legacy text extractor
|
||||
// doesn't). Single source of truth for file-part parsing across
|
||||
// live chat, activity log replay, and any future consumers.
|
||||
|
||||
/**
|
||||
* Load chat history from the activity_logs database via the platform API.
|
||||
* Uses source=canvas to only get user-initiated messages (not agent-to-agent).
|
||||
@ -71,16 +138,23 @@ async function loadMessagesFromDB(workspaceId: string): Promise<{ messages: Chat
|
||||
for (const a of [...activities].reverse()) {
|
||||
// Extract user message from request_body
|
||||
const userText = extractRequestText(a.request_body);
|
||||
if (userText) {
|
||||
if (userText && !isInternalSelfMessage(userText)) {
|
||||
messages.push(createMessage("user", userText));
|
||||
}
|
||||
|
||||
// Extract agent response
|
||||
// Extract agent response — text AND any file attachments so a
|
||||
// chat reload surfaces historical download chips, not just plain
|
||||
// text. `result` is nested on successful A2A responses; some
|
||||
// older rows stored the raw `result` payload at the top level,
|
||||
// so fall back to the body itself when `.result` is absent.
|
||||
if (a.response_body) {
|
||||
const text = extractResponseText(a.response_body);
|
||||
if (text) {
|
||||
const attachments = extractFilesFromTask(
|
||||
(a.response_body.result ?? a.response_body) as Record<string, unknown>,
|
||||
);
|
||||
if (text || attachments.length > 0) {
|
||||
const role = a.status === "error" || text.toLowerCase().startsWith("agent error") ? "system" : "agent";
|
||||
messages.push({ ...createMessage(role, text), timestamp: a.created_at });
|
||||
messages.push({ ...createMessage(role, text, attachments), timestamp: a.created_at });
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -178,7 +252,16 @@ export function ChatTab({ workspaceId, data }: Props) {
|
||||
function MyChatPanel({ workspaceId, data }: Props) {
|
||||
const [messages, setMessages] = useState<ChatMessage[]>([]);
|
||||
const [input, setInput] = useState("");
|
||||
const [sending, setSending] = useState(!!data.currentTask);
|
||||
// `sending` is strictly the "this tab kicked off a send and hasn't
|
||||
// seen the reply yet" signal. Previously this was initialized from
|
||||
// data.currentTask to pick up in-flight agent work on mount, but
|
||||
// that conflated agent-busy (workspace heartbeat) with user-
|
||||
// in-flight (local send): when the WS dropped a TASK_COMPLETE event,
|
||||
// currentTask lingered, the component re-mounted with sending=true,
|
||||
// and the Send button stayed disabled forever even though nothing
|
||||
// local was in flight. For the "agent is busy, show spinner" UX,
|
||||
// use data.currentTask directly in the render path.
|
||||
const [sending, setSending] = useState(false);
|
||||
const [thinkingElapsed, setThinkingElapsed] = useState(0);
|
||||
const [activityLog, setActivityLog] = useState<string[]>([]);
|
||||
const [loading, setLoading] = useState(true);
|
||||
@ -189,6 +272,17 @@ function MyChatPanel({ workspaceId, data }: Props) {
|
||||
const [error, setError] = useState<string | null>(null);
|
||||
const [confirmRestart, setConfirmRestart] = useState(false);
|
||||
const bottomRef = useRef<HTMLDivElement>(null);
|
||||
// Files the user has picked but not yet sent. Cleared on send
|
||||
// (upload success) or by the × on each pill.
|
||||
const [pendingFiles, setPendingFiles] = useState<File[]>([]);
|
||||
const [uploading, setUploading] = useState(false);
|
||||
const fileInputRef = useRef<HTMLInputElement>(null);
|
||||
// Guard against a double-click during the upload phase: React
|
||||
// state updates from the click that started the upload haven't
|
||||
// flushed yet, so the disabled-button logic sees `uploading=false`
|
||||
// from the closure and lets a second `sendMessage` enter. A ref
|
||||
// observes the latest value synchronously.
|
||||
const sendInFlightRef = useRef(false);
|
||||
|
||||
// Load chat history from database on mount
|
||||
useEffect(() => {
|
||||
@ -231,8 +325,10 @@ function MyChatPanel({ workspaceId, data }: Props) {
|
||||
// Dedupe in case the agent proactively pushed the same text the
|
||||
// HTTP /a2a response already delivered (observed with the Hermes
|
||||
// runtime, which emits both a reply body and a send_message_to_user
|
||||
// push for the same content).
|
||||
setMessages((prev) => appendMessageDeduped(prev, createMessage("agent", m.content)));
|
||||
// push for the same content). Attachments ride along with the
|
||||
// message so files returned by the A2A_RESPONSE WS path render
|
||||
// their download chips.
|
||||
setMessages((prev) => appendMessageDeduped(prev, createMessage("agent", m.content, m.attachments)));
|
||||
}
|
||||
if (sendingFromAPIRef.current && msgs.length > 0) {
|
||||
setSending(false);
|
||||
@ -277,12 +373,21 @@ function MyChatPanel({ workspaceId, data }: Props) {
|
||||
try {
|
||||
const msg = JSON.parse(event.data);
|
||||
if (msg.event === "ACTIVITY_LOGGED") {
|
||||
// Filter to events for THIS workspace. The platform's
|
||||
// BroadcastOnly fires to every connected client, and
|
||||
// without this guard a sibling workspace's a2a_send would
|
||||
// surface as "→ Delegating to X..." inside the wrong
|
||||
// chat panel. (workspace_id on the WS envelope is the
|
||||
// workspace whose activity_log row we just wrote.)
|
||||
if (msg.workspace_id !== workspaceId) return;
|
||||
|
||||
const p = msg.payload || {};
|
||||
const type = p.activity_type as string;
|
||||
const method = (p.method as string) || "";
|
||||
const status = (p.status as string) || "";
|
||||
const targetId = (p.target_id as string) || "";
|
||||
const durationMs = p.duration_ms as number | undefined;
|
||||
const summary = (p.summary as string) || "";
|
||||
|
||||
let line = "";
|
||||
if (type === "a2a_receive" && method === "message/send") {
|
||||
@ -313,17 +418,23 @@ function MyChatPanel({ workspaceId, data }: Props) {
|
||||
const targetName = resolveWorkspaceName(targetId);
|
||||
line = `→ Delegating to ${targetName}...`;
|
||||
} else if (type === "task_update") {
|
||||
const summary = (p.summary as string) || "";
|
||||
if (summary) line = `⟳ ${summary}`;
|
||||
} else if (type === "agent_log") {
|
||||
// Per-tool-use telemetry from claude_sdk_executor's
|
||||
// _report_tool_use. The summary already carries an icon
|
||||
// + human-readable args (📄 Read /path, ⚡ Bash: …)
|
||||
// so we render it verbatim. No icon prefix here — the
|
||||
// emoji at the start of summary is the visual marker.
|
||||
if (summary) line = summary;
|
||||
}
|
||||
|
||||
if (line) {
|
||||
setActivityLog((prev) => [...prev.slice(-8), line]);
|
||||
setActivityLog((prev) => appendActivityLine(prev, line));
|
||||
}
|
||||
} else if (msg.event === "TASK_UPDATED" && msg.workspace_id === workspaceId) {
|
||||
const task = (msg.payload?.current_task as string) || "";
|
||||
if (task) {
|
||||
setActivityLog((prev) => [...prev.slice(-8), `⟳ ${task}`]);
|
||||
setActivityLog((prev) => appendActivityLine(prev, `⟳ ${task}`));
|
||||
}
|
||||
}
|
||||
// A2A_RESPONSE is already consumed by the store and its text is
|
||||
@ -339,10 +450,35 @@ function MyChatPanel({ workspaceId, data }: Props) {
|
||||
|
||||
const sendMessage = async () => {
|
||||
const text = input.trim();
|
||||
if (!text || !agentReachable || sending) return;
|
||||
const filesToSend = pendingFiles;
|
||||
// Allow sending if EITHER text OR attachments are present — a user
|
||||
// can drop a file with no text and the agent still receives it.
|
||||
if ((!text && filesToSend.length === 0) || !agentReachable || sending || uploading) return;
|
||||
// Synchronous re-entry guard — see sendInFlightRef comment.
|
||||
if (sendInFlightRef.current) return;
|
||||
sendInFlightRef.current = true;
|
||||
|
||||
// Upload attachments first so we can include URIs in the A2A
|
||||
// message parts. Sequential-before-send: a message with references
|
||||
// to files not yet staged would fail agent-side; staging happens
|
||||
// synchronously via /chat/uploads before message/send dispatch.
|
||||
let uploaded: ChatAttachment[] = [];
|
||||
if (filesToSend.length > 0) {
|
||||
setUploading(true);
|
||||
try {
|
||||
uploaded = await uploadChatFiles(workspaceId, filesToSend);
|
||||
} catch (e) {
|
||||
setUploading(false);
|
||||
sendInFlightRef.current = false;
|
||||
setError(e instanceof Error ? `Upload failed: ${e.message}` : "Upload failed");
|
||||
return;
|
||||
}
|
||||
setUploading(false);
|
||||
}
|
||||
|
||||
setInput("");
|
||||
setMessages((prev) => [...prev, createMessage("user", text)]);
|
||||
setPendingFiles([]);
|
||||
setMessages((prev) => [...prev, createMessage("user", text, uploaded)]);
|
||||
setSending(true);
|
||||
sendingFromAPIRef.current = true;
|
||||
setError(null);
|
||||
@ -356,40 +492,228 @@ function MyChatPanel({ workspaceId, data }: Props) {
|
||||
parts: [{ kind: "text", text: m.content }],
|
||||
}));
|
||||
|
||||
// A2A parts: text part (if any) + file parts (per attachment). The
|
||||
// agent sees both in a single turn, matching the A2A spec shape.
|
||||
const parts: A2APart[] = [];
|
||||
if (text) parts.push({ kind: "text", text });
|
||||
for (const att of uploaded) {
|
||||
parts.push({
|
||||
kind: "file",
|
||||
file: {
|
||||
name: att.name,
|
||||
mimeType: att.mimeType,
|
||||
uri: att.uri,
|
||||
size: att.size,
|
||||
},
|
||||
});
|
||||
}
|
||||
|
||||
// A2A calls can legitimately take minutes — LLM latency +
|
||||
// multi-turn tool use is common on slower providers (Hermes+minimax,
|
||||
// Claude Code invoking bash/file tools, etc.). The 15s default
|
||||
// would silently abort the fetch here, leaving the server to
|
||||
// complete the reply and the user staring at
|
||||
// "agent may be unreachable". Match the upload timeout (60s × 2)
|
||||
// for the happy-path ceiling; anything longer is genuinely stuck.
|
||||
api.post<A2AResponse>(`/workspaces/${workspaceId}/a2a`, {
|
||||
method: "message/send",
|
||||
params: {
|
||||
message: {
|
||||
role: "user",
|
||||
messageId: crypto.randomUUID(),
|
||||
parts: [{ kind: "text", text }],
|
||||
parts,
|
||||
},
|
||||
metadata: { history },
|
||||
},
|
||||
})
|
||||
}, { timeoutMs: 120_000 })
|
||||
.then((resp) => {
|
||||
// Skip if the WS A2A_RESPONSE event already handled this response.
|
||||
// Both paths (WS + HTTP) check sendingFromAPIRef — whichever clears
|
||||
// it first wins, the other becomes a no-op (no duplicate messages).
|
||||
if (!sendingFromAPIRef.current) return;
|
||||
const replyText = extractReplyText(resp);
|
||||
if (replyText) {
|
||||
setMessages((prev) => appendMessageDeduped(prev, createMessage("agent", replyText)));
|
||||
const replyFiles = extractFilesFromTask((resp?.result ?? {}) as Record<string, unknown>);
|
||||
if (replyText || replyFiles.length > 0) {
|
||||
setMessages((prev) =>
|
||||
appendMessageDeduped(prev, createMessage("agent", replyText, replyFiles)),
|
||||
);
|
||||
}
|
||||
setSending(false);
|
||||
sendingFromAPIRef.current = false;
|
||||
sendInFlightRef.current = false;
|
||||
})
|
||||
.catch(() => {
|
||||
// Same dedup guard as .then(): if a WS path (pendingAgentMsgs
|
||||
// or ACTIVITY_LOGGED a2a_receive ok) already delivered the
|
||||
// reply, sendingFromAPIRef is already false and there's
|
||||
// nothing to roll back. Surfacing "Failed to send" here would
|
||||
// contradict the agent reply the user is currently reading —
|
||||
// exactly the false-positive observed when the HTTP request
|
||||
// hung up (proxy idle / 502) after WS already won.
|
||||
if (!sendingFromAPIRef.current) {
|
||||
sendInFlightRef.current = false;
|
||||
return;
|
||||
}
|
||||
setSending(false);
|
||||
sendingFromAPIRef.current = false;
|
||||
sendInFlightRef.current = false;
|
||||
setError("Failed to send message — agent may be unreachable");
|
||||
});
|
||||
};
|
||||
|
||||
const onFilesPicked = (fileList: FileList | null) => {
|
||||
if (!fileList) return;
|
||||
const picked = Array.from(fileList);
|
||||
// Deduplicate against current pending set by name+size — user
|
||||
// picking the same file twice shouldn't append it.
|
||||
setPendingFiles((prev) => {
|
||||
const keyed = new Set(prev.map((f) => `${f.name}:${f.size}`));
|
||||
return [...prev, ...picked.filter((f) => !keyed.has(`${f.name}:${f.size}`))];
|
||||
});
|
||||
if (fileInputRef.current) fileInputRef.current.value = "";
|
||||
};
|
||||
|
||||
const removePendingFile = (index: number) =>
|
||||
setPendingFiles((prev) => prev.filter((_, i) => i !== index));
|
||||
|
||||
// Monotonic counter so two paste events within the same wall-clock
|
||||
// second still produce distinct filenames. Without this, on
|
||||
// Firefox (where pasted images have an empty `file.name`), two
|
||||
// pastes ~100ms apart could yield identical synthetic names AND
|
||||
// identical sizes, collapsing into one attachment via the
|
||||
// `name:size` dedup in onFilesPicked.
|
||||
const pasteCounterRef = useRef(0);
|
||||
|
||||
/** Paste-from-clipboard image attachment.
|
||||
*
|
||||
* Browser clipboard image items arrive as `File`s whose `name` is
|
||||
* often a generic "image.png" (Chrome) or empty (Firefox/Safari),
|
||||
* so two consecutive screenshot pastes collide on the name+size
|
||||
* dedup the file-picker uses. Re-tag each pasted image with a
|
||||
* per-paste unique name so dedup keeps them apart and the upload
|
||||
* pipeline (which expects a non-empty filename) is happy.
|
||||
*
|
||||
* Falls through to onFilesPicked via direct File[] (NOT through
|
||||
* the DataTransfer constructor — that throws on Safari < 14.1
|
||||
* and old Edge, silently aborting the paste).
|
||||
*
|
||||
* Only intercepts the paste when the clipboard has at least one
|
||||
* image; text-only pastes fall through to the textarea's default
|
||||
* behaviour. */
|
||||
const mimeToExt = (mime: string): string => {
|
||||
// Avoid raw `mime.split("/")[1]` — that yields `"svg+xml"`,
|
||||
// `"jpeg"`, `"webp"` etc. which produce ugly filenames and may
|
||||
// trip server-side extension allowlists. Map known types
|
||||
// explicitly; unknown falls back to a safe default.
|
||||
if (mime === "image/svg+xml") return "svg";
|
||||
if (mime === "image/jpeg") return "jpg";
|
||||
if (mime === "image/png") return "png";
|
||||
if (mime === "image/gif") return "gif";
|
||||
if (mime === "image/webp") return "webp";
|
||||
if (mime === "image/heic") return "heic";
|
||||
return "png";
|
||||
};
|
||||
|
||||
const onPasteIntoComposer = (e: React.ClipboardEvent<HTMLTextAreaElement>) => {
|
||||
if (!dropEnabled) return;
|
||||
const items = e.clipboardData?.items;
|
||||
if (!items || items.length === 0) return;
|
||||
const imageFiles: File[] = [];
|
||||
for (let i = 0; i < items.length; i++) {
|
||||
const item = items[i];
|
||||
if (!item.type.startsWith("image/")) continue;
|
||||
const file = item.getAsFile();
|
||||
if (!file) continue;
|
||||
const ext = mimeToExt(file.type);
|
||||
const stamp = new Date()
|
||||
.toISOString()
|
||||
.replace(/[:.]/g, "-")
|
||||
.slice(0, 19);
|
||||
const seq = pasteCounterRef.current++;
|
||||
const fname = `pasted-${stamp}-${seq}-${i}.${ext}`;
|
||||
imageFiles.push(new File([file], fname, { type: file.type }));
|
||||
}
|
||||
if (imageFiles.length === 0) return;
|
||||
e.preventDefault();
|
||||
// Reuse the picker path so file-size guards, dedup, and pending-
|
||||
// list state all run through the same code. Build a synthetic
|
||||
// FileList-like object to avoid the DataTransfer constructor —
|
||||
// that's missing on Safari < 14.1 / old Edge and would silently
|
||||
// throw, leaving the paste a no-op.
|
||||
addPastedFiles(imageFiles);
|
||||
};
|
||||
|
||||
// Variant of onFilesPicked that accepts a File[] directly, sidestepping
|
||||
// the DataTransfer-FileList round-trip. Same dedup + state shape.
|
||||
const addPastedFiles = (files: File[]) => {
|
||||
setPendingFiles((prev) => {
|
||||
const keyed = new Set(prev.map((f) => `${f.name}:${f.size}`));
|
||||
return [...prev, ...files.filter((f) => !keyed.has(`${f.name}:${f.size}`))];
|
||||
});
|
||||
};
|
||||
|
||||
// Drag-and-drop staging. dragDepthRef counts enter vs leave events so
|
||||
// the overlay doesn't flicker when the cursor crosses nested children
|
||||
// (textarea, buttons) — dragenter/dragleave fire for every boundary.
|
||||
const [dragOver, setDragOver] = useState(false);
|
||||
const dragDepthRef = useRef(0);
|
||||
const dropEnabled = agentReachable && !sending && !uploading;
|
||||
const isFileDrag = (e: React.DragEvent) =>
|
||||
Array.from(e.dataTransfer.types || []).includes("Files");
|
||||
|
||||
const onDragEnter = (e: React.DragEvent) => {
|
||||
if (!dropEnabled || !isFileDrag(e)) return;
|
||||
e.preventDefault();
|
||||
dragDepthRef.current += 1;
|
||||
setDragOver(true);
|
||||
};
|
||||
const onDragOver = (e: React.DragEvent) => {
|
||||
if (!dropEnabled || !isFileDrag(e)) return;
|
||||
e.preventDefault();
|
||||
e.dataTransfer.dropEffect = "copy";
|
||||
};
|
||||
const onDragLeave = (e: React.DragEvent) => {
|
||||
if (!dropEnabled || !isFileDrag(e)) return;
|
||||
dragDepthRef.current = Math.max(0, dragDepthRef.current - 1);
|
||||
if (dragDepthRef.current === 0) setDragOver(false);
|
||||
};
|
||||
const onDrop = (e: React.DragEvent) => {
|
||||
if (!dropEnabled || !isFileDrag(e)) return;
|
||||
e.preventDefault();
|
||||
dragDepthRef.current = 0;
|
||||
setDragOver(false);
|
||||
onFilesPicked(e.dataTransfer.files);
|
||||
};
|
||||
|
||||
const downloadAttachment = (att: ChatAttachment) => {
|
||||
// Errors here are rare but user-visible (401 on a revoked token,
|
||||
// 404 if the agent deleted the file). Surface via the inline
|
||||
// error banner — the message list itself stays untouched.
|
||||
downloadChatFile(workspaceId, att).catch((e) => {
|
||||
setError(e instanceof Error ? `Download failed: ${e.message}` : "Download failed");
|
||||
});
|
||||
};
|
||||
|
||||
const isOnline = data.status === "online" || data.status === "degraded";
|
||||
|
||||
return (
|
||||
<div className="flex flex-col h-full">
|
||||
<div
|
||||
className="flex flex-col h-full relative"
|
||||
onDragEnter={onDragEnter}
|
||||
onDragOver={onDragOver}
|
||||
onDragLeave={onDragLeave}
|
||||
onDrop={onDrop}
|
||||
>
|
||||
{dragOver && (
|
||||
<div
|
||||
className="absolute inset-0 z-20 flex items-center justify-center bg-blue-500/10 border-2 border-dashed border-blue-400 rounded pointer-events-none"
|
||||
aria-live="polite"
|
||||
>
|
||||
<div className="bg-zinc-900/90 border border-blue-400/50 rounded-lg px-4 py-2 text-xs text-blue-200">
|
||||
Drop to attach
|
||||
</div>
|
||||
</div>
|
||||
)}
|
||||
{/* Messages */}
|
||||
<div className="flex-1 overflow-y-auto p-3 space-y-3">
|
||||
{loading && (
|
||||
@ -435,9 +759,23 @@ function MyChatPanel({ workspaceId, data }: Props) {
|
||||
: "bg-zinc-800/80 text-zinc-200 border border-zinc-700/30"
|
||||
}`}
|
||||
>
|
||||
<div className="prose prose-sm prose-invert max-w-none [&>p]:mb-1 [&>p:last-child]:mb-0">
|
||||
<ReactMarkdown remarkPlugins={[remarkGfm]}>{msg.content}</ReactMarkdown>
|
||||
</div>
|
||||
{msg.content && (
|
||||
<div className="prose prose-sm prose-invert max-w-none [&>p]:mb-1 [&>p:last-child]:mb-0">
|
||||
<ReactMarkdown remarkPlugins={[remarkGfm]}>{msg.content}</ReactMarkdown>
|
||||
</div>
|
||||
)}
|
||||
{msg.attachments && msg.attachments.length > 0 && (
|
||||
<div className={`flex flex-wrap gap-1 ${msg.content ? "mt-1.5" : ""}`}>
|
||||
{msg.attachments.map((att, i) => (
|
||||
<AttachmentChip
|
||||
key={`${msg.id}-${i}`}
|
||||
attachment={att}
|
||||
onDownload={downloadAttachment}
|
||||
tone={msg.role === "user" ? "user" : "agent"}
|
||||
/>
|
||||
))}
|
||||
</div>
|
||||
)}
|
||||
<div className="text-[9px] text-zinc-500 mt-1">
|
||||
{new Date(msg.timestamp).toLocaleTimeString()}
|
||||
</div>
|
||||
@ -445,8 +783,11 @@ function MyChatPanel({ workspaceId, data }: Props) {
|
||||
</div>
|
||||
))}
|
||||
|
||||
{/* Thinking indicator */}
|
||||
{sending && (
|
||||
{/* Thinking indicator — shows when this tab is awaiting a reply
|
||||
OR when the workspace heartbeat reports an in-flight task
|
||||
(covers the "agent is already busy when I open the tab" case
|
||||
without locking the Send button on a stale currentTask). */}
|
||||
{(sending || !!data.currentTask) && (
|
||||
<div className="flex justify-start">
|
||||
<div className="bg-zinc-800/50 border border-zinc-700/30 rounded-lg px-3 py-2 max-w-[85%]">
|
||||
<div className="flex items-center gap-2 text-xs text-zinc-400">
|
||||
@ -490,7 +831,37 @@ function MyChatPanel({ workspaceId, data }: Props) {
|
||||
|
||||
{/* Input */}
|
||||
<div className="p-3 border-t border-zinc-800">
|
||||
<div className="flex gap-2">
|
||||
{pendingFiles.length > 0 && (
|
||||
<div className="flex flex-wrap gap-1.5 mb-2">
|
||||
{pendingFiles.map((f, i) => (
|
||||
<PendingAttachmentPill
|
||||
key={`${f.name}-${f.size}-${i}`}
|
||||
file={f}
|
||||
onRemove={() => removePendingFile(i)}
|
||||
/>
|
||||
))}
|
||||
</div>
|
||||
)}
|
||||
<div className="flex gap-2 items-end">
|
||||
<input
|
||||
ref={fileInputRef}
|
||||
type="file"
|
||||
multiple
|
||||
className="hidden"
|
||||
onChange={(e) => onFilesPicked(e.target.files)}
|
||||
aria-hidden="true"
|
||||
/>
|
||||
<button
|
||||
onClick={() => fileInputRef.current?.click()}
|
||||
disabled={!agentReachable || sending || uploading}
|
||||
aria-label="Attach file"
|
||||
title="Attach file"
|
||||
className="p-2 bg-zinc-800 hover:bg-zinc-700 border border-zinc-700 rounded-lg text-zinc-400 hover:text-zinc-200 transition-colors shrink-0 disabled:opacity-40"
|
||||
>
|
||||
<svg width="14" height="14" viewBox="0 0 16 16" fill="none" aria-hidden="true">
|
||||
<path d="M11 6.5 7 10.5a2 2 0 1 0 2.8 2.8l4-4a3.5 3.5 0 0 0-5-5l-4.5 4.5a5 5 0 0 0 7 7l4-4" stroke="currentColor" strokeWidth="1.4" strokeLinecap="round" strokeLinejoin="round" />
|
||||
</svg>
|
||||
</button>
|
||||
<textarea
|
||||
aria-label="Message to agent"
|
||||
value={input}
|
||||
@ -501,17 +872,18 @@ function MyChatPanel({ workspaceId, data }: Props) {
|
||||
sendMessage();
|
||||
}
|
||||
}}
|
||||
placeholder={agentReachable ? "Send a message... (Shift+Enter for new line)" : `Agent is ${data.status}`}
|
||||
onPaste={onPasteIntoComposer}
|
||||
placeholder={agentReachable ? "Send a message... (Shift+Enter for new line, paste images to attach)" : `Agent is ${data.status}`}
|
||||
disabled={!agentReachable || sending}
|
||||
rows={1}
|
||||
className="flex-1 bg-zinc-800 border border-zinc-700 rounded-lg px-3 py-2 text-xs text-zinc-200 placeholder-zinc-500 focus:outline-none focus:border-blue-500 resize-none disabled:opacity-50"
|
||||
/>
|
||||
<button
|
||||
onClick={sendMessage}
|
||||
disabled={!input.trim() || !agentReachable || sending}
|
||||
disabled={(!input.trim() && pendingFiles.length === 0) || !agentReachable || sending || uploading}
|
||||
className="px-4 py-2 bg-blue-600 hover:bg-blue-500 text-xs font-medium rounded-lg text-white disabled:opacity-30 transition-colors shrink-0"
|
||||
>
|
||||
Send
|
||||
{uploading ? "Uploading…" : "Send"}
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
@ -105,12 +105,17 @@ interface RuntimeOption {
|
||||
// Fallback used when /templates can't be fetched (offline, older backend).
|
||||
// Keep in sync with manifest.json workspace_templates as a defensive default.
|
||||
// Model + env suggestions only flow when the backend is reachable.
|
||||
//
|
||||
// Runtimes that manage their own config outside the platform's config.yaml
|
||||
// template. For these, a missing config.yaml is expected — the user manages
|
||||
// config via the runtime's own mechanism (e.g. hermes edits
|
||||
// ~/.hermes/config.yaml on the workspace EC2 via the Terminal tab or its
|
||||
// own CLI). Showing a "No config.yaml found" error for these is misleading.
|
||||
const RUNTIMES_WITH_OWN_CONFIG = new Set<string>(["hermes", "external"]);
|
||||
// template. For these, a missing config.yaml is expected and the form
|
||||
// genuinely can't edit the runtime's settings (there's no platform file
|
||||
// to write). Hermes is NOT on this list: it DOES ship a platform
|
||||
// config.yaml via workspace-configs-templates/hermes that controls model,
|
||||
// runtime_config, required_env, etc. Editing it through this form is
|
||||
// exactly the point of the platform adaptor. The deep `~/.hermes/
|
||||
// config.yaml` on the container is a separate runtime-internal file,
|
||||
// not this one.
|
||||
const RUNTIMES_WITH_OWN_CONFIG = new Set<string>(["external"]);
|
||||
|
||||
const FALLBACK_RUNTIME_OPTIONS: RuntimeOption[] = [
|
||||
{ value: "", label: "LangGraph (default)", models: [] },
|
||||
@ -152,9 +157,11 @@ export function ConfigTab({ workspaceId }: Props) {
|
||||
// default `LangGraph`. See GH #1894.
|
||||
let wsMetadataRuntime = "";
|
||||
let wsMetadataModel = "";
|
||||
let wsMetadataTier: number | null = null;
|
||||
try {
|
||||
const ws = await api.get<{ runtime?: string }>(`/workspaces/${workspaceId}`);
|
||||
const ws = await api.get<{ runtime?: string; tier?: number }>(`/workspaces/${workspaceId}`);
|
||||
wsMetadataRuntime = (ws.runtime || "").trim();
|
||||
if (typeof ws.tier === "number") wsMetadataTier = ws.tier;
|
||||
} catch { /* fall back to config.yaml */ }
|
||||
try {
|
||||
const m = await api.get<{ model?: string }>(`/workspaces/${workspaceId}/model`);
|
||||
@ -166,11 +173,15 @@ export function ConfigTab({ workspaceId }: Props) {
|
||||
const parsed = parseYaml(res.content);
|
||||
setOriginalYaml(res.content);
|
||||
setRawDraft(res.content);
|
||||
// Merge: config.yaml wins for fields it declares, but workspace metadata
|
||||
// wins for runtime + model when config.yaml doesn't set them.
|
||||
// Merge: workspace-row metadata is authoritative for the DB-backed
|
||||
// fields (tier, runtime, model). config.yaml often lags — handleSave
|
||||
// PATCHes tier/runtime directly and a template snapshot in the
|
||||
// container can differ from the live row. Show the DB value so the
|
||||
// form doesn't contradict the node badge (issue: badge=T3, form=T2).
|
||||
const merged = { ...DEFAULT_CONFIG, ...parsed } as ConfigData;
|
||||
if (!merged.runtime && wsMetadataRuntime) merged.runtime = wsMetadataRuntime;
|
||||
if (!merged.model && wsMetadataModel) merged.model = wsMetadataModel;
|
||||
if (wsMetadataRuntime) merged.runtime = wsMetadataRuntime;
|
||||
if (wsMetadataModel) merged.model = wsMetadataModel;
|
||||
if (wsMetadataTier !== null) merged.tier = wsMetadataTier;
|
||||
setConfig(merged);
|
||||
} catch {
|
||||
// No platform-managed config.yaml. Some runtimes (hermes, external)
|
||||
@ -185,6 +196,7 @@ export function ConfigTab({ workspaceId }: Props) {
|
||||
...DEFAULT_CONFIG,
|
||||
runtime: wsMetadataRuntime,
|
||||
model: wsMetadataModel,
|
||||
...(wsMetadataTier !== null ? { tier: wsMetadataTier } : {}),
|
||||
} as ConfigData);
|
||||
} finally {
|
||||
setLoading(false);
|
||||
|
||||
@ -36,7 +36,7 @@ export function DetailsTab({ workspaceId, data }: Props) {
|
||||
const [restartError, setRestartError] = useState<string | null>(null);
|
||||
const [consoleOpen, setConsoleOpen] = useState(false);
|
||||
const updateNodeData = useCanvasStore((s) => s.updateNodeData);
|
||||
const removeNode = useCanvasStore((s) => s.removeNode);
|
||||
const removeSubtree = useCanvasStore((s) => s.removeSubtree);
|
||||
const selectNode = useCanvasStore((s) => s.selectNode);
|
||||
// Ref for the "Delete Workspace" trigger — Cancel returns focus here
|
||||
const deleteButtonRef = useRef<HTMLButtonElement>(null);
|
||||
@ -94,7 +94,11 @@ export function DetailsTab({ workspaceId, data }: Props) {
|
||||
setDeleteError(null);
|
||||
try {
|
||||
await api.del(`/workspaces/${workspaceId}?confirm=true`);
|
||||
removeNode(workspaceId);
|
||||
// Mirror the server-side cascade — drop the row + every
|
||||
// descendant locally so the canvas reflects the deletion
|
||||
// immediately, even when the WS is dead and the per-descendant
|
||||
// WORKSPACE_REMOVED events never arrive.
|
||||
removeSubtree(workspaceId);
|
||||
selectNode(null);
|
||||
} catch (e) {
|
||||
setDeleteError(e instanceof Error ? e.message : "Failed to delete");
|
||||
|
||||
@ -6,6 +6,14 @@ import { useCanvasStore, summarizeWorkspaceCapabilities, type WorkspaceNodeData
|
||||
import { showToast } from "../Toaster";
|
||||
|
||||
interface Props {
|
||||
// The workspace's id is NOT a field on WorkspaceNodeData — that
|
||||
// interface is the React Flow `node.data` blob, while the id lives
|
||||
// on `node.id`. Pass it explicitly (matches every other tab in
|
||||
// SidePanel) so the install/uninstall API calls don't end up
|
||||
// POSTing to /workspaces/undefined/plugins. The interface extending
|
||||
// Record<string, unknown> meant TypeScript silently typed
|
||||
// `data.id` as `unknown` instead of erroring — easy to miss.
|
||||
workspaceId: string;
|
||||
data: WorkspaceNodeData;
|
||||
}
|
||||
|
||||
@ -40,7 +48,7 @@ interface SourceSchemesResponse {
|
||||
// Delay before reloading installed plugins after install/uninstall (workspace restarts)
|
||||
const PLUGIN_RELOAD_DELAY_MS = 15_000;
|
||||
|
||||
export function SkillsTab({ data }: Props) {
|
||||
export function SkillsTab({ workspaceId, data }: Props) {
|
||||
const capability = summarizeWorkspaceCapabilities(data);
|
||||
const skills = useMemo(() => extractSkills(data.agentCard), [data.agentCard]);
|
||||
const setPanelTab = useCanvasStore((s) => s.setPanelTab);
|
||||
@ -57,32 +65,115 @@ export function SkillsTab({ data }: Props) {
|
||||
const reloadTimerRef = useRef<ReturnType<typeof setTimeout>>(undefined);
|
||||
|
||||
useEffect(() => {
|
||||
// Re-init `mountedRef.current = true` on every mount. React 18
|
||||
// StrictMode (Next.js dev) double-invokes effects: mount →
|
||||
// cleanup → mount. Without this re-init, the first cleanup sets
|
||||
// mountedRef.current = false, the re-mount runs the effect body
|
||||
// again but never restores the flag, so every subsequent
|
||||
// `if (mountedRef.current) setX(...)` guard skips and the
|
||||
// component appears wedged: fetches complete, state never
|
||||
// updates, "Loading…" sits forever. Production doesn't double-
|
||||
// invoke so the bug only surfaces in dev — but dev is where we
|
||||
// see it, and the cost of being explicit is one assignment.
|
||||
mountedRef.current = true;
|
||||
return () => {
|
||||
mountedRef.current = false;
|
||||
clearTimeout(reloadTimerRef.current);
|
||||
};
|
||||
}, []);
|
||||
|
||||
const workspaceId = data.id;
|
||||
// Tracks whether loadInstalled has completed at least once (success
|
||||
// or empty-array success — NOT failure). Without this the auto-
|
||||
// expand effect below would fire on the initial render where
|
||||
// `installed.length === 0` simply because the fetch hasn't returned
|
||||
// yet, and worse, would also fire if the fetch throws (network
|
||||
// blip, auth failure) — both cases falsely look like "no plugins
|
||||
// installed". Gating on a separate "loaded" flag avoids the false
|
||||
// positive.
|
||||
const [installedLoaded, setInstalledLoaded] = useState(false);
|
||||
|
||||
const loadInstalled = useCallback(async () => {
|
||||
try {
|
||||
const result = await api.get<PluginInfo[]>(`/workspaces/${workspaceId}/plugins`);
|
||||
if (mountedRef.current) setInstalled(Array.isArray(result) ? result : []);
|
||||
if (mountedRef.current) {
|
||||
setInstalled(Array.isArray(result) ? result : []);
|
||||
setInstalledLoaded(true);
|
||||
}
|
||||
} catch (e) {
|
||||
console.warn("SkillsTab: installed plugins load failed", e);
|
||||
}
|
||||
}, [workspaceId]);
|
||||
|
||||
const loadRegistry = useCallback(async () => {
|
||||
// registry-load lifecycle so the UI can show "Loading…" / error /
|
||||
// retry instead of an indistinguishable "No plugins in registry"
|
||||
// banner whether the fetch is in-flight, errored, or genuinely
|
||||
// returned []. The previous silent console.warn-only path made
|
||||
// an auth failure or CORS blip look identical to an empty
|
||||
// registry — exactly the diagnosis dead-end observed when the
|
||||
// server returned 20 plugins via curl but the canvas showed 0.
|
||||
const [registryLoading, setRegistryLoading] = useState(false);
|
||||
const [registryError, setRegistryError] = useState<string | null>(null);
|
||||
|
||||
// Synchronous gate against concurrent loadRegistry runs. Refs survive
|
||||
// Fast Refresh re-renders (ref objects persist across re-runs of
|
||||
// the function body), so a previously-stranded fetch can pin this
|
||||
// ref at true and block every subsequent loadRegistry call. The
|
||||
// `force` parameter on loadRegistry below provides the user-driven
|
||||
// escape hatch for that wedge.
|
||||
const registryFetchInFlight = useRef(false);
|
||||
|
||||
// Reset the in-flight gate on unmount so a Fast Refresh that
|
||||
// tears down + recreates the component without a full page reload
|
||||
// doesn't carry the stuck-true value into the new instance via
|
||||
// dev-server-preserved module state.
|
||||
useEffect(() => {
|
||||
return () => {
|
||||
registryFetchInFlight.current = false;
|
||||
};
|
||||
}, []);
|
||||
|
||||
const loadRegistry = useCallback(async (force = false) => {
|
||||
// Default callers (mount effect, button while not loading) honour
|
||||
// the gate. Explicit force=true callers (Retry button) bypass it
|
||||
// — the user is signalling "forget whatever you thought was in
|
||||
// flight, fetch again now".
|
||||
if (!force && registryFetchInFlight.current) return;
|
||||
registryFetchInFlight.current = true;
|
||||
setRegistryLoading(true);
|
||||
setRegistryError(null);
|
||||
try {
|
||||
const result = await api.get<PluginInfo[]>("/plugins");
|
||||
// 10s timeout — tighter than the 15s default. Plugin registry
|
||||
// is local-disk-backed on the platform host (server reads
|
||||
// pluginsDir entries) so a 10s budget is generous. Without
|
||||
// an explicit timeout the UI's "Loading registry…" can sit
|
||||
// for the full 15s + any browser hop time when a Fast
|
||||
// Refresh strands an in-flight promise.
|
||||
const result = await api.get<PluginInfo[]>("/plugins", { timeoutMs: 10_000 });
|
||||
if (mountedRef.current) setRegistry(Array.isArray(result) ? result : []);
|
||||
} catch (e) {
|
||||
// Registry is the AVAILABLE PLUGINS list. Silent failure here
|
||||
// left the user seeing "No plugins in registry" with no clue
|
||||
// it was a fetch error — log it so devtools shows the cause.
|
||||
console.warn("SkillsTab: registry load failed", e);
|
||||
if (mountedRef.current) {
|
||||
// Detect timeout/abort by DOMException.name first — that's
|
||||
// the canonical signal across browsers. Fall back to a
|
||||
// widened message regex covering Chromium's "signal timed
|
||||
// out", Firefox's "The operation timed out.", Safari's
|
||||
// "Aborted". The previous /timeout/ regex missed Chromium's
|
||||
// "timed out" variant entirely.
|
||||
const name = (e as { name?: string })?.name ?? "";
|
||||
const msg = e instanceof Error ? e.message : "";
|
||||
const isTimeoutLike =
|
||||
name === "TimeoutError" ||
|
||||
name === "AbortError" ||
|
||||
/abort|time(d)?\s*out/i.test(msg);
|
||||
setRegistryError(
|
||||
isTimeoutLike
|
||||
? "Registry fetch timed out (10s). The platform server may be slow or unreachable."
|
||||
: msg || "Failed to load registry",
|
||||
);
|
||||
}
|
||||
} finally {
|
||||
registryFetchInFlight.current = false;
|
||||
if (mountedRef.current) setRegistryLoading(false);
|
||||
}
|
||||
}, []);
|
||||
|
||||
@ -102,17 +193,73 @@ export function SkillsTab({ data }: Props) {
|
||||
loadSourceSchemes();
|
||||
}, [loadInstalled, loadRegistry, loadSourceSchemes]);
|
||||
|
||||
// First-time experience: if the workspace has zero plugins
|
||||
// installed but the platform's registry has options to choose
|
||||
// from, expand the registry by default so the user sees what's
|
||||
// available without an extra click. Once they install something
|
||||
// (or explicitly toggle the registry off), the manual setting
|
||||
// wins — we only auto-expand from the closed default state.
|
||||
const hasAutoExpandedRef = useRef(false);
|
||||
useEffect(() => {
|
||||
if (hasAutoExpandedRef.current) return;
|
||||
if (installedLoaded && installed.length === 0 && registry.length > 0) {
|
||||
setShowRegistry(true);
|
||||
hasAutoExpandedRef.current = true;
|
||||
}
|
||||
}, [installedLoaded, installed.length, registry.length]);
|
||||
|
||||
// Set of installed plugin names for O(1) "already installed?" checks
// when rendering the registry list.
const installedNames = useMemo(() => {
  const names = new Set<string>();
  for (const plugin of installed) names.add(plugin.name);
  return names;
}, [installed]);
|
||||
|
||||
// Install always goes through the source-based API. For registry
|
||||
// plugins we build the local:// source on the fly; custom sources
|
||||
// (github://, clawhub://, …) are typed into the input below.
|
||||
const installFromSource = async (source: string, labelOverride?: string) => {
|
||||
//
|
||||
// Optional `optimistic` parameter mirrors the uninstall flow's local
|
||||
// state mutation. Without it, the user sees the button revert from
|
||||
// "Installing..." → "Install" the instant the POST returns, and the
|
||||
// green "Installed" tag doesn't appear for ~15s while we wait out
|
||||
// PLUGIN_RELOAD_DELAY_MS for the workspace restart before refetching.
|
||||
// 15s of staring at the same button feels broken. Pushing the
|
||||
// registry entry into `installed` immediately makes the UI reflect
|
||||
// the install instantly; the delayed loadInstalled() reconciles
|
||||
// anything we got wrong (or any server-side filtering we don't
|
||||
// know about locally).
|
||||
const installFromSource = async (
|
||||
source: string,
|
||||
labelOverride?: string,
|
||||
optimistic?: PluginInfo,
|
||||
) => {
|
||||
const label = labelOverride ?? source;
|
||||
setInstalling(label);
|
||||
try {
|
||||
await api.post(`/workspaces/${workspaceId}/plugins`, { source });
|
||||
showToast(`Installed ${label} — restarting workspace`, "success");
|
||||
if (optimistic && mountedRef.current) {
|
||||
// Push with `supported_on_runtime` left undefined — the
|
||||
// server's ListInstalled annotates the real value (true /
|
||||
// false) at refetch time. Forcing `true` here would hide the
|
||||
// "inert on this runtime" badge for 15s if the user
|
||||
// installed a plugin that doesn't actually support the
|
||||
// workspace's runtime; the badge only renders on `=== false`,
|
||||
// so undefined keeps it neutral until reconciliation arrives.
|
||||
setInstalled((prev) =>
|
||||
prev.some((p) => p.name === optimistic.name)
|
||||
? prev
|
||||
: [...prev, { ...optimistic, supported_on_runtime: undefined }],
|
||||
);
|
||||
// Note: we intentionally do NOT set `installedLoaded` here.
|
||||
// That flag means "the initial GET has succeeded at least
|
||||
// once" and gates the auto-expand-registry effect. A fast
|
||||
// optimistic install BEFORE the initial fetch returns must
|
||||
// not flip the gate, or the auto-expand never fires and a
|
||||
// followup loadInstalled racing with the optimistic write
|
||||
// could overwrite our entry with [] mid-restart.
|
||||
}
|
||||
// Drop any prior reload timer before scheduling a new one —
|
||||
// back-to-back installs within PLUGIN_RELOAD_DELAY_MS would
|
||||
// otherwise queue multiple loadInstalled() calls and the
|
||||
// unmount cleanup only clears the latest handle.
|
||||
clearTimeout(reloadTimerRef.current);
|
||||
reloadTimerRef.current = setTimeout(() => loadInstalled(), PLUGIN_RELOAD_DELAY_MS);
|
||||
} catch (e) {
|
||||
showToast(e instanceof Error ? e.message : "Install failed", "error");
|
||||
@ -121,7 +268,10 @@ export function SkillsTab({ data }: Props) {
|
||||
}
|
||||
};
|
||||
|
||||
const handleInstall = (pluginName: string) => installFromSource(`local://${pluginName}`, pluginName);
|
||||
const handleInstall = (pluginName: string) => {
|
||||
const entry = registry.find((p) => p.name === pluginName);
|
||||
return installFromSource(`local://${pluginName}`, pluginName, entry);
|
||||
};
|
||||
|
||||
const handleInstallCustom = async () => {
|
||||
const source = customSource.trim();
|
||||
@ -133,9 +283,12 @@ export function SkillsTab({ data }: Props) {
|
||||
const handleUninstall = async (pluginName: string) => {
|
||||
setUninstalling(pluginName);
|
||||
try {
|
||||
await api.del(`/workspaces/${data.id}/plugins/${pluginName}`);
|
||||
await api.del(`/workspaces/${workspaceId}/plugins/${pluginName}`);
|
||||
showToast(`Removed ${pluginName} — restarting workspace`, "success");
|
||||
setInstalled((prev) => prev.filter((p) => p.name !== pluginName));
|
||||
// Drop any prior reload timer (see installFromSource for the
|
||||
// back-to-back-action leak rationale).
|
||||
clearTimeout(reloadTimerRef.current);
|
||||
reloadTimerRef.current = setTimeout(() => loadInstalled(), PLUGIN_RELOAD_DELAY_MS);
|
||||
} catch (e) {
|
||||
showToast(e instanceof Error ? e.message : "Uninstall failed", "error");
|
||||
@ -264,9 +417,53 @@ export function SkillsTab({ data }: Props) {
|
||||
Local registry plugins below; paste any scheme URL above for GitHub or other sources.
|
||||
</div>
|
||||
</div>
|
||||
<div className="text-[10px] uppercase tracking-[0.2em] text-zinc-600 mb-2">Available plugins</div>
|
||||
{registry.length === 0 ? (
|
||||
<div className="text-[10px] text-zinc-600">No plugins in registry</div>
|
||||
<div className="flex items-center justify-between mb-2">
|
||||
<div className="text-[10px] uppercase tracking-[0.2em] text-zinc-600">Available plugins</div>
|
||||
{/* Retry visible whenever registry is empty — including
|
||||
the loading state — so a stuck fetch (Fast Refresh
|
||||
stranded promise, slow server, browser quirk) has a
|
||||
user-driven escape hatch. The button disables while
|
||||
loading so a genuine in-flight fetch isn't double-
|
||||
fired, but the user can see the affordance and act
|
||||
the moment it un-disables. */}
|
||||
{registry.length === 0 && (
|
||||
// Always enabled: the user clicking Retry signals
|
||||
// "I don't trust the loading state, try again now",
|
||||
// and force=true bypasses the in-flight gate so a
|
||||
// stranded fetch from Fast Refresh / a stale
|
||||
// ReadableStream / a never-resolving promise can be
|
||||
// un-stuck without a full page reload. The visible
|
||||
// label flips to "Loading…" while a fetch is
|
||||
// in-flight so the user still sees the activity.
|
||||
<button
|
||||
type="button"
|
||||
onClick={() => loadRegistry(true)}
|
||||
className="text-[10px] text-violet-300 hover:text-violet-200 underline-offset-2 hover:underline"
|
||||
>
|
||||
{registryLoading ? "Loading… click to retry" : "Retry"}
|
||||
</button>
|
||||
)}
|
||||
</div>
|
||||
{registryLoading && registry.length === 0 ? (
|
||||
<div className="text-[10px] text-zinc-500">Loading registry…</div>
|
||||
) : registryError ? (
|
||||
<div className="rounded-lg border border-red-800/40 bg-red-950/20 px-2 py-1.5">
|
||||
<div className="text-[10px] text-red-300 font-semibold mb-0.5">
|
||||
Couldn't load the plugin registry
|
||||
</div>
|
||||
<div className="text-[10px] text-red-400/80">{registryError}</div>
|
||||
<div className="mt-1 text-[10px] text-zinc-500">
|
||||
Check the platform server is reachable at /plugins. The Retry button is in the header above.
|
||||
</div>
|
||||
</div>
|
||||
) : registry.length === 0 ? (
|
||||
<div className="rounded-lg border border-zinc-800/40 bg-zinc-950/40 px-2 py-1.5">
|
||||
<div className="text-[10px] text-zinc-400 mb-0.5">Registry returned 0 plugins.</div>
|
||||
<div className="text-[10px] text-zinc-600">
|
||||
This usually means the platform's plugins/ directory is empty.
|
||||
Run scripts/clone-manifest.sh to populate it from the standalone repos.
|
||||
</div>
|
||||
</div>
|
||||
) : (
|
||||
<div className="space-y-1.5">
|
||||
{registry.map((p) => {
|
||||
|
||||
@ -128,7 +128,13 @@ describe("ConfigTab — hermes workspace", () => {
|
||||
});
|
||||
});
|
||||
|
||||
it("shows hermes-specific info banner pointing to Terminal tab (#1894)", async () => {
|
||||
it("does NOT show the hermes-specific info banner (removed in #2061)", async () => {
|
||||
// Banner-text inversion: the multilevel-layout-UX PR drops "hermes"
|
||||
// from RUNTIMES_WITH_OWN_CONFIG (now {"external"} only). Hermes now
|
||||
// shows the normal Config form — the banner "Hermes manages its own
|
||||
// config" is reserved for the "external" runtime, not hermes itself.
|
||||
// If this ever flips back, revisit the banner/error UX before
|
||||
// unpinning this assertion.
|
||||
wireApi({
|
||||
workspaceRuntime: "hermes",
|
||||
configYamlContent: null,
|
||||
@ -137,9 +143,11 @@ describe("ConfigTab — hermes workspace", () => {
|
||||
|
||||
render(<ConfigTab workspaceId="ws-test" />);
|
||||
|
||||
await waitFor(() => {
|
||||
expect(screen.getByText(/Hermes manages its own config/i)).toBeTruthy();
|
||||
});
|
||||
// Wait for the render+loads to settle (template list drives the runtime combobox).
|
||||
await waitFor(() =>
|
||||
screen.getByRole("combobox", { name: /runtime/i }),
|
||||
);
|
||||
expect(screen.queryByText(/Hermes manages its own config/i)).toBeNull();
|
||||
});
|
||||
|
||||
it("DOES show 'No config.yaml found' error for langgraph workspace (default runtime)", async () => {
|
||||
@ -161,14 +169,28 @@ describe("ConfigTab — hermes workspace", () => {
|
||||
});
|
||||
|
||||
describe("ConfigTab — config.yaml on disk", () => {
|
||||
it("config.yaml runtime/model wins when present, workspace metadata is fallback", async () => {
|
||||
// If the workspace DB has runtime=langgraph but config.yaml declares
|
||||
// runtime: crewai, the form should show crewai (config.yaml wins).
|
||||
// Prevents silent runtime drift across reads.
|
||||
it("workspace metadata (DB) wins over config.yaml when both are present (#2061)", async () => {
|
||||
// Priority inversion in #2061: previously config.yaml overrode DB, so
|
||||
// the tier-on-node badge and runtime-in-form could drift when the
|
||||
// user edited config.yaml on disk. The multilevel-layout-UX PR made
|
||||
// the DB authoritative — config.yaml is read for non-DB keys (tools,
|
||||
// MCP server list, etc.) but runtime/model/tier come from the
|
||||
// workspace row so the node badge matches the form.
|
||||
//
|
||||
// Scenario: DB says "hermes", config.yaml says "crewai". The form
|
||||
// must show hermes (DB wins).
|
||||
//
|
||||
// We pick hermes (not langgraph) on the DB side because "langgraph"
|
||||
// is collapsed to the empty-string "LangGraph (default)" option in
|
||||
// the runtime dropdown — so a "langgraph" DB value would render as
|
||||
// the empty-valued option and obscure whether the DB-wins logic
|
||||
// actually fired. Hermes has its own non-empty option value and
|
||||
// gives the assertion a clean signal.
|
||||
wireApi({
|
||||
workspaceRuntime: "langgraph", // DB
|
||||
workspaceRuntime: "hermes", // DB — authoritative
|
||||
configYamlContent: 'runtime: crewai\nmodel: "claude-opus"\n',
|
||||
templates: [
|
||||
{ id: "t-hermes", name: "Hermes", runtime: "hermes", models: [] },
|
||||
{ id: "t-crewai", name: "CrewAI", runtime: "crewai", models: [] },
|
||||
],
|
||||
});
|
||||
@ -176,6 +198,6 @@ describe("ConfigTab — config.yaml on disk", () => {
|
||||
render(<ConfigTab workspaceId="ws-test" />);
|
||||
|
||||
const select = await waitFor(() => screen.getByRole("combobox", { name: /runtime/i }));
|
||||
expect((select as HTMLSelectElement).value).toBe("crewai");
|
||||
expect((select as HTMLSelectElement).value).toBe("hermes");
|
||||
});
|
||||
});
|
||||
|
||||
@ -1,13 +1,17 @@
|
||||
"use client";
|
||||
|
||||
import { useState, useEffect, useRef } from "react";
|
||||
import ReactMarkdown from "react-markdown";
|
||||
import remarkGfm from "remark-gfm";
|
||||
import { api } from "@/lib/api";
|
||||
import { useCanvasStore, type WorkspaceNodeData } from "@/store/canvas";
|
||||
import { WS_URL } from "@/store/socket";
|
||||
import { closeWebSocketGracefully } from "@/lib/ws-close";
|
||||
import { showToast } from "../../Toaster";
|
||||
import { extractResponseText, extractRequestText } from "./message-parser";
|
||||
import { inferA2AErrorHint } from "./a2aErrorHint";
|
||||
|
||||
interface ActivityEntry {
|
||||
export interface ActivityEntry {
|
||||
id: string;
|
||||
activity_type: string;
|
||||
source_id: string | null;
|
||||
@ -22,11 +26,29 @@ interface ActivityEntry {
|
||||
|
||||
interface CommMessage {
|
||||
id: string;
|
||||
direction: "in" | "out";
|
||||
/** UI-facing flow from THIS workspace's point of view:
|
||||
*
|
||||
* "out" — this workspace either initiated the call (a2a_send)
|
||||
* OR self-logged the reply from a peer it had called
|
||||
* (a2a_receive with source_id == workspaceId).
|
||||
* "in" — a peer initiated the call to us (a2a_receive with
|
||||
* source_id != workspaceId).
|
||||
*
|
||||
* Distinct from activity_type because the agent runtime self-
|
||||
* logs its outbound calls' replies as `a2a_receive` rows; without
|
||||
* this normalisation the UI labels would render those as
|
||||
* incoming ("← From X") and right-justify them on the wrong
|
||||
* side, even though from the user's perspective the call WAS
|
||||
* outgoing. See toCommMessage for the resolution rules. */
|
||||
flow: "in" | "out";
|
||||
peerName: string;
|
||||
peerId: string;
|
||||
text: string;
|
||||
responseText: string | null;
|
||||
/** "ok" | "error" — surfaces failed deliveries with their own
|
||||
* visual treatment + recovery actions instead of an opaque
|
||||
* "[A2A_ERROR]" body the user can't act on. */
|
||||
status: string;
|
||||
timestamp: string;
|
||||
}
|
||||
|
||||
@ -36,9 +58,31 @@ function resolveName(id: string): string {
|
||||
return (node?.data as WorkspaceNodeData)?.name || id.slice(0, 8);
|
||||
}
|
||||
|
||||
function toCommMessage(entry: ActivityEntry, workspaceId: string): CommMessage | null {
|
||||
const isOutgoing = entry.activity_type === "a2a_send";
|
||||
const peerId = isOutgoing ? (entry.target_id || "") : (entry.source_id || "");
|
||||
export function toCommMessage(entry: ActivityEntry, workspaceId: string): CommMessage | null {
|
||||
// a2a_receive activity rows come in two shapes:
|
||||
//
|
||||
// 1. Real incoming call (a peer called us): source_id = the peer,
|
||||
// target_id = us. peerId is source_id, flow is "in".
|
||||
//
|
||||
// 2. Self-logged response to an outbound call (the workspace's own
|
||||
// runtime calls report_activity("a2a_receive", ...) after
|
||||
// delegating; see workspace/a2a_tools.py:181). source_id =
|
||||
// our own workspace_id, target_id = the peer that replied.
|
||||
// peerId must come from target_id (otherwise the peer-name
|
||||
// resolves to "us" and Restart would target THIS workspace),
|
||||
// and flow is "out" — from the user's perspective this row
|
||||
// belongs to the outbound thread, not an incoming one.
|
||||
//
|
||||
// a2a_send rows are always outbound from us: source_id = us,
|
||||
// target_id = the peer.
|
||||
const isSendActivity = entry.activity_type === "a2a_send";
|
||||
const isSelfLoggedReceive =
|
||||
entry.activity_type === "a2a_receive" && entry.source_id === workspaceId;
|
||||
const flow: "in" | "out" = isSendActivity || isSelfLoggedReceive ? "out" : "in";
|
||||
const peerId =
|
||||
isSendActivity || isSelfLoggedReceive
|
||||
? entry.target_id || ""
|
||||
: entry.source_id || "";
|
||||
if (!peerId) return null;
|
||||
|
||||
const text = extractRequestText(entry.request_body) || entry.summary || "";
|
||||
@ -46,15 +90,35 @@ function toCommMessage(entry: ActivityEntry, workspaceId: string): CommMessage |
|
||||
|
||||
return {
|
||||
id: entry.id,
|
||||
direction: isOutgoing ? "out" : "in",
|
||||
flow,
|
||||
peerName: resolveName(peerId),
|
||||
peerId,
|
||||
text,
|
||||
responseText,
|
||||
status: entry.status || "ok",
|
||||
timestamp: entry.created_at,
|
||||
};
|
||||
}
|
||||
|
||||
/** Strip the [A2A_ERROR] sentinel prefix the workspace runtime adds
|
||||
* to failed delegation responses, so the UI can render the underlying
|
||||
* message (or fall back to a generic explanation when the inner text
|
||||
* is empty — currently common because httpx exceptions often
|
||||
* stringify as ""). */
|
||||
const A2A_ERROR_PREFIX = "[A2A_ERROR]";
|
||||
|
||||
function unwrapErrorText(raw: string | null): string {
|
||||
if (!raw) return "";
|
||||
const trimmed = raw.trim();
|
||||
if (trimmed.startsWith(A2A_ERROR_PREFIX)) {
|
||||
return trimmed.slice(A2A_ERROR_PREFIX.length).trim();
|
||||
}
|
||||
return trimmed;
|
||||
}
|
||||
|
||||
// inferA2AErrorHint moved to ./a2aErrorHint so the Activity tab and
|
||||
// this panel render identical hints for the same symptom.
|
||||
|
||||
export function AgentCommsPanel({ workspaceId }: { workspaceId: string }) {
|
||||
const [messages, setMessages] = useState<CommMessage[]>([]);
|
||||
const [loading, setLoading] = useState(true);
|
||||
@ -67,22 +131,45 @@ export function AgentCommsPanel({ workspaceId }: { workspaceId: string }) {
|
||||
setLoading(true);
|
||||
api.get<ActivityEntry[]>(`/workspaces/${workspaceId}/activity?source=agent&limit=50`)
|
||||
.then((entries) => {
|
||||
const filtered = entries
|
||||
const filtered = (entries ?? [])
|
||||
.filter((e) => e.activity_type === "a2a_send" || e.activity_type === "a2a_receive")
|
||||
.reverse();
|
||||
const msgs: CommMessage[] = [];
|
||||
for (const e of filtered) {
|
||||
const m = toCommMessage(e, workspaceId);
|
||||
if (m) {
|
||||
const key = `${m.timestamp}:${m.direction}:${m.peerId}`;
|
||||
msgs.push(m);
|
||||
seenKeys.current.add(key);
|
||||
// Per-row try/catch so a single malformed activity row
|
||||
// (e.g. unexpected request_body shape) doesn't kill the
|
||||
// batch — the previous code threw out of the for-loop and
|
||||
// setMessages([3 items]) never ran, leaving the panel
|
||||
// stuck on the empty state with no diagnostic in the
|
||||
// console because the outer .catch silently swallowed
|
||||
// everything.
|
||||
try {
|
||||
const m = toCommMessage(e, workspaceId);
|
||||
if (m) {
|
||||
const key = `${m.timestamp}:${m.flow}:${m.peerId}`;
|
||||
msgs.push(m);
|
||||
seenKeys.current.add(key);
|
||||
}
|
||||
} catch (rowErr) {
|
||||
console.warn(
|
||||
"AgentCommsPanel: failed to map activity row",
|
||||
{ id: e.id, type: e.activity_type, err: rowErr },
|
||||
);
|
||||
}
|
||||
}
|
||||
setMessages(msgs);
|
||||
setLoading(false);
|
||||
})
|
||||
.catch(() => setLoading(false));
|
||||
.catch((err) => {
|
||||
// Surface the failure in the console so a stuck panel is
|
||||
// diagnosable without a debugger. Previous bare
|
||||
// `.catch(() => setLoading(false))` swallowed every load
|
||||
// failure (network errors, JSON parse errors, throws inside
|
||||
// the .then body) — the panel just sat on the empty state
|
||||
// with zero signal.
|
||||
console.warn("AgentCommsPanel: load activity failed", err);
|
||||
setLoading(false);
|
||||
});
|
||||
}, [workspaceId]);
|
||||
|
||||
// Live updates via WebSocket
|
||||
@ -115,7 +202,7 @@ export function AgentCommsPanel({ workspaceId }: { workspaceId: string }) {
|
||||
};
|
||||
const m = toCommMessage(entry, workspaceId);
|
||||
if (m) {
|
||||
const key = `${m.timestamp}:${m.direction}:${m.peerId}`;
|
||||
const key = `${m.timestamp}:${m.flow}:${m.peerId}`;
|
||||
if (seenKeys.current.has(key)) return;
|
||||
seenKeys.current.add(key);
|
||||
setMessages((prev) => [...prev, m]);
|
||||
@ -148,31 +235,177 @@ export function AgentCommsPanel({ workspaceId }: { workspaceId: string }) {
|
||||
|
||||
return (
|
||||
<div className="flex-1 overflow-y-auto p-3 space-y-2">
|
||||
{messages.map((msg) => (
|
||||
<div key={msg.id} className={`flex ${msg.direction === "out" ? "justify-end" : "justify-start"}`}>
|
||||
<div
|
||||
className={`max-w-[85%] rounded-lg px-3 py-2 text-xs ${
|
||||
msg.direction === "out"
|
||||
? "bg-cyan-900/30 text-cyan-100 border border-cyan-700/20"
|
||||
: "bg-zinc-800/80 text-zinc-200 border border-zinc-700/30"
|
||||
}`}
|
||||
>
|
||||
<div className="text-[9px] text-zinc-500 mb-1">
|
||||
{msg.direction === "out" ? `→ To ${msg.peerName}` : `← From ${msg.peerName}`}
|
||||
</div>
|
||||
<div className="text-zinc-300">{msg.text || "(no message text)"}</div>
|
||||
{msg.responseText && (
|
||||
<div className="mt-1.5 pt-1.5 border-t border-zinc-700/30 text-zinc-400">
|
||||
{msg.responseText}
|
||||
</div>
|
||||
)}
|
||||
<div className="text-[9px] text-zinc-500 mt-1">
|
||||
{new Date(msg.timestamp).toLocaleTimeString()}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
))}
|
||||
{messages.map((msg) =>
|
||||
msg.status === "error" ? (
|
||||
<ErrorMessage key={msg.id} msg={msg} />
|
||||
) : (
|
||||
<NormalMessage key={msg.id} msg={msg} />
|
||||
),
|
||||
)}
|
||||
<div ref={bottomRef} />
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
function NormalMessage({ msg }: { msg: CommMessage }) {
|
||||
return (
|
||||
<div className={`flex ${msg.flow === "out" ? "justify-end" : "justify-start"}`}>
|
||||
<div
|
||||
className={`max-w-[85%] rounded-lg px-3 py-2 text-xs ${
|
||||
msg.flow === "out"
|
||||
? "bg-cyan-900/30 text-cyan-100 border border-cyan-700/20"
|
||||
: "bg-zinc-800/80 text-zinc-200 border border-zinc-700/30"
|
||||
}`}
|
||||
>
|
||||
<div className="text-[9px] text-zinc-500 mb-1">
|
||||
{msg.flow === "out" ? `→ To ${msg.peerName}` : `← From ${msg.peerName}`}
|
||||
</div>
|
||||
{msg.text ? (
|
||||
<MarkdownBody className="text-zinc-300">{msg.text}</MarkdownBody>
|
||||
) : (
|
||||
<div className="text-zinc-300">(no message text)</div>
|
||||
)}
|
||||
{msg.responseText && (
|
||||
<MarkdownBody className="mt-1.5 pt-1.5 border-t border-zinc-700/30 text-zinc-400">
|
||||
{msg.responseText}
|
||||
</MarkdownBody>
|
||||
)}
|
||||
<div className="text-[9px] text-zinc-500 mt-1">
|
||||
{new Date(msg.timestamp).toLocaleTimeString()}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
/** Failure-state row. Replaces the unactionable "X failed [A2A_ERROR]"
|
||||
* bubble with: a clear banner naming the peer, the underlying
|
||||
* error text (if any), an inferred cause hint, and recovery
|
||||
* actions — Restart workspace, Open workspace.
|
||||
*
|
||||
* Recovery actions show on BOTH directions because both target the
|
||||
* same peer (toCommMessage now resolves peerId to the peer in
|
||||
* either case): an outbound delivery failure ("we called X and it
|
||||
* errored"), an inbound runtime failure ("X called us and our
|
||||
* reply errored" — rare), or the agent-self-logged "I called X and
|
||||
* got an error back" pattern that is the most common shape. The
|
||||
* user always wants to restart or inspect the failing peer. */
|
||||
function ErrorMessage({ msg }: { msg: CommMessage }) {
|
||||
const selectNode = useCanvasStore((s) => s.selectNode);
|
||||
const [restarting, setRestarting] = useState(false);
|
||||
const errorText = unwrapErrorText(msg.responseText);
|
||||
const hint = inferA2AErrorHint(errorText);
|
||||
|
||||
// Guard against acting on a peer whose workspace has been deleted
|
||||
// since this row was logged. Without the guard, restart 404s
|
||||
// surface as a generic toast and Open silently sets a dangling
|
||||
// selection that renders nothing in the side panel.
|
||||
const peerExists = (): boolean => {
|
||||
return useCanvasStore.getState().nodes.some((n) => n.id === msg.peerId);
|
||||
};
|
||||
|
||||
const handleRestart = async () => {
|
||||
if (restarting) return;
|
||||
if (!peerExists()) {
|
||||
showToast(`${msg.peerName} no longer exists`, "error");
|
||||
return;
|
||||
}
|
||||
setRestarting(true);
|
||||
try {
|
||||
await api.post(`/workspaces/${msg.peerId}/restart`, {});
|
||||
showToast(`Restarting ${msg.peerName}…`, "success");
|
||||
} catch (e) {
|
||||
showToast(
|
||||
`Restart failed: ${e instanceof Error ? e.message : "unknown error"}`,
|
||||
"error",
|
||||
);
|
||||
} finally {
|
||||
setRestarting(false);
|
||||
}
|
||||
};
|
||||
|
||||
const handleOpen = () => {
|
||||
if (!peerExists()) {
|
||||
showToast(`${msg.peerName} no longer exists`, "error");
|
||||
return;
|
||||
}
|
||||
selectNode(msg.peerId);
|
||||
};
|
||||
|
||||
return (
|
||||
<div className={`flex ${msg.flow === "out" ? "justify-end" : "justify-start"}`}>
|
||||
<div className="max-w-[85%] rounded-lg border border-red-800/50 bg-red-950/30 px-3 py-2 text-xs">
|
||||
<div className="flex items-center gap-1.5 text-[10px] text-red-300 font-semibold uppercase tracking-wide mb-1.5">
|
||||
<span aria-hidden="true">⚠</span>
|
||||
{msg.flow === "out"
|
||||
? `Failed to deliver to ${msg.peerName}`
|
||||
: `${msg.peerName} returned an error`}
|
||||
</div>
|
||||
|
||||
{msg.text && (
|
||||
<div className="text-[10px] text-zinc-500 mb-1.5">
|
||||
<span className="uppercase tracking-wide">Task</span>
|
||||
<MarkdownBody className="text-zinc-400">{msg.text}</MarkdownBody>
|
||||
</div>
|
||||
)}
|
||||
|
||||
<div className="rounded bg-zinc-950/60 border border-red-900/40 px-2 py-1.5 mb-1.5">
|
||||
<div className="text-[9px] uppercase tracking-wide text-red-400 mb-0.5">
|
||||
Underlying error
|
||||
</div>
|
||||
<code className="text-[11px] font-mono text-red-200 whitespace-pre-wrap break-words">
|
||||
{errorText || "(no detail returned)"}
|
||||
</code>
|
||||
</div>
|
||||
|
||||
<p className="text-[10px] text-zinc-400 leading-snug mb-2">{hint}</p>
|
||||
|
||||
{msg.peerId && (
|
||||
<div className="flex flex-wrap items-center gap-1.5">
|
||||
<button
|
||||
type="button"
|
||||
onClick={handleRestart}
|
||||
disabled={restarting}
|
||||
className="px-2 py-0.5 rounded bg-red-900/50 hover:bg-red-800/60 border border-red-700/40 text-[10px] text-red-200 disabled:opacity-50 transition-colors"
|
||||
>
|
||||
{restarting ? "Restarting…" : `Restart ${msg.peerName}`}
|
||||
</button>
|
||||
<button
|
||||
type="button"
|
||||
onClick={handleOpen}
|
||||
className="px-2 py-0.5 rounded bg-zinc-800 hover:bg-zinc-700 border border-zinc-700/50 text-[10px] text-zinc-300 transition-colors"
|
||||
>
|
||||
Open {msg.peerName}
|
||||
</button>
|
||||
</div>
|
||||
)}
|
||||
|
||||
<div className="text-[9px] text-zinc-500 mt-1.5">
|
||||
{new Date(msg.timestamp).toLocaleTimeString()}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
/** Tiny markdown wrapper matching ChatTab's My Chat styling. Same
|
||||
* remark-gfm pipeline (tables, strikethrough, task lists) plus the
|
||||
* prose tweaks that keep paragraphs tight inside a small bubble.
|
||||
* Code blocks get an `overflow-x-auto` so a long line of code doesn't
|
||||
* blow out the bubble's max-width — agent-to-agent replies routinely
|
||||
* ship code samples and JSON. */
|
||||
function MarkdownBody({
|
||||
children,
|
||||
className,
|
||||
}: {
|
||||
children: string;
|
||||
className?: string;
|
||||
}) {
|
||||
return (
|
||||
<div
|
||||
className={`prose prose-sm prose-invert max-w-none [&>p]:mb-1 [&>p:last-child]:mb-0 [&_pre]:overflow-x-auto [&_table]:block [&_table]:overflow-x-auto ${className ?? ""}`}
|
||||
>
|
||||
<ReactMarkdown remarkPlugins={[remarkGfm]}>{children}</ReactMarkdown>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
94
canvas/src/components/tabs/chat/AttachmentViews.tsx
Normal file
94
canvas/src/components/tabs/chat/AttachmentViews.tsx
Normal file
@ -0,0 +1,94 @@
|
||||
"use client";
|
||||
|
||||
// Small presentational components for chat attachments. Kept in a
|
||||
// separate file so ChatTab.tsx stays focused on state + send/receive
|
||||
// orchestration. Both variants share the file-icon + name + size
|
||||
// layout; the only difference is the trailing action (remove for
|
||||
// pending, download for completed).
|
||||
|
||||
import type { ChatAttachment } from "./types";
|
||||
|
||||
function formatSize(bytes: number | undefined): string {
|
||||
if (bytes == null) return "";
|
||||
if (bytes < 1024) return `${bytes} B`;
|
||||
if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(0)} KB`;
|
||||
return `${(bytes / (1024 * 1024)).toFixed(1)} MB`;
|
||||
}
|
||||
|
||||
/** Inline pill for a file the user has picked but not yet sent.
 * Renders above the textarea; clicking × pops it from the pending
 * list without uploading anything. */
export function PendingAttachmentPill({
  file,
  onRemove,
}: {
  file: File;
  onRemove: () => void;
}) {
  const removeGlyph = (
    <svg width="10" height="10" viewBox="0 0 16 16" fill="none" aria-hidden="true">
      <path d="M4 4l8 8M12 4l-8 8" stroke="currentColor" strokeWidth="1.6" strokeLinecap="round" />
    </svg>
  );
  return (
    <div className="flex items-center gap-1.5 rounded-md border border-zinc-700/60 bg-zinc-800/80 px-2 py-1 text-[10px] text-zinc-300 max-w-[200px]">
      <FileGlyph className="text-zinc-400 shrink-0" />
      <span className="truncate" title={file.name}>
        {file.name}
      </span>
      <span className="text-zinc-500 shrink-0 tabular-nums">
        {formatSize(file.size)}
      </span>
      <button
        onClick={onRemove}
        aria-label={`Remove ${file.name}`}
        className="ml-0.5 text-zinc-500 hover:text-zinc-200 transition-colors shrink-0"
      >
        {removeGlyph}
      </button>
    </div>
  );
}
|
||||
|
||||
/** Chip rendered inside a message bubble for a sent/received file.
 * Clicking fires the onDownload callback — the parent owns
 * workspace-scoped URL resolution, this component never builds URLs
 * itself. `tone` matches the bubble side (user = blue, agent = zinc). */
export function AttachmentChip({
  attachment,
  onDownload,
  tone,
}: {
  attachment: ChatAttachment;
  onDownload: (a: ChatAttachment) => void;
  tone: "user" | "agent";
}) {
  const isUser = tone === "user";
  const toneClasses = isUser
    ? "border-blue-400/30 bg-blue-600/20 hover:bg-blue-600/30 text-blue-100"
    : "border-zinc-600/50 bg-zinc-700/40 hover:bg-zinc-600/50 text-zinc-100";
  const handleClick = () => onDownload(attachment);
  return (
    <button
      onClick={handleClick}
      title={`Download ${attachment.name}`}
      className={`flex items-center gap-1.5 rounded-md border px-2 py-1 text-[10px] transition-colors max-w-full ${toneClasses}`}
    >
      <FileGlyph className="shrink-0 opacity-70" />
      <span className="truncate">{attachment.name}</span>
      {attachment.size != null && (
        <span className="opacity-60 shrink-0 tabular-nums">{formatSize(attachment.size)}</span>
      )}
      <DownloadGlyph className="opacity-70 shrink-0" />
    </button>
  );
}
|
||||
|
||||
function FileGlyph({ className }: { className?: string }) {
|
||||
return (
|
||||
<svg width="10" height="10" viewBox="0 0 16 16" fill="none" className={className} aria-hidden="true">
|
||||
<path d="M4 2h5l3 3v9a1 1 0 0 1-1 1H4a1 1 0 0 1-1-1V3a1 1 0 0 1 1-1Z" stroke="currentColor" strokeWidth="1.3" strokeLinejoin="round" />
|
||||
<path d="M9 2v3h3" stroke="currentColor" strokeWidth="1.3" strokeLinejoin="round" />
|
||||
</svg>
|
||||
);
|
||||
}
|
||||
|
||||
function DownloadGlyph({ className }: { className?: string }) {
|
||||
return (
|
||||
<svg width="10" height="10" viewBox="0 0 16 16" fill="none" className={className} aria-hidden="true">
|
||||
<path d="M8 2v9M4 7l4 4 4-4" stroke="currentColor" strokeWidth="1.4" strokeLinecap="round" strokeLinejoin="round" />
|
||||
<path d="M3 13h10" stroke="currentColor" strokeWidth="1.4" strokeLinecap="round" />
|
||||
</svg>
|
||||
);
|
||||
}
|
||||
@ -0,0 +1,113 @@
|
||||
// @vitest-environment jsdom
|
||||
import { describe, it, expect, vi } from "vitest";
|
||||
|
||||
// Stub the canvas store before importing the SUT — toCommMessage calls
|
||||
// useCanvasStore.getState() inside resolveName to look up peer names,
|
||||
// which would otherwise hit the real Zustand store.
|
||||
vi.mock("@/store/canvas", () => ({
  useCanvasStore: {
    // Minimal getState() shape — the SUT only reads nodes[].id and
    // nodes[].data.name when resolving peer display names.
    getState: () => ({
      nodes: [
        { id: "ws-self", data: { name: "Self" } },
        { id: "ws-peer", data: { name: "Peer Agent" } },
      ],
    }),
  },
}));

import { toCommMessage, type ActivityEntry } from "../AgentCommsPanel";

// Workspace ids matching the mocked store nodes above.
const SELF = "ws-self";
const PEER = "ws-peer";
|
||||
|
||||
function makeEntry(overrides: Partial<ActivityEntry> = {}): ActivityEntry {
|
||||
return {
|
||||
id: "act-1",
|
||||
activity_type: "a2a_send",
|
||||
source_id: SELF,
|
||||
target_id: PEER,
|
||||
method: "message/send",
|
||||
summary: "Delegating to Peer Agent",
|
||||
request_body: null,
|
||||
response_body: null,
|
||||
status: "ok",
|
||||
created_at: "2026-04-25T18:00:00Z",
|
||||
...overrides,
|
||||
};
|
||||
}
|
||||
|
||||
// Flow derivation: who the bubble points at and which side it renders on.
describe("toCommMessage — flow derivation", () => {
  it("a2a_send is always outbound (flow=out, peer=target)", () => {
    const m = toCommMessage(
      makeEntry({ activity_type: "a2a_send", source_id: SELF, target_id: PEER }),
      SELF,
    );
    expect(m).toBeTruthy();
    expect(m!.flow).toBe("out");
    expect(m!.peerId).toBe(PEER);
    expect(m!.peerName).toBe("Peer Agent");
  });

  it("a2a_receive from a peer (peer-initiated call) is inbound", () => {
    // Real incoming call: source = peer, target = us.
    const m = toCommMessage(
      makeEntry({
        activity_type: "a2a_receive",
        source_id: PEER,
        target_id: SELF,
      }),
      SELF,
    );
    expect(m!.flow).toBe("in");
    expect(m!.peerId).toBe(PEER);
    expect(m!.peerName).toBe("Peer Agent");
  });

  it("a2a_receive self-logged by our runtime AFTER an outbound call is OUTBOUND from the user's POV", () => {
    // workspace/a2a_tools.py:181 self-logs an a2a_receive on the
    // CALLER's workspace_id with source_id=us, target_id=peer.
    // From the user's perspective this row belongs to the outbound
    // delegation thread — render flow=out + peer=target so the
    // bubble right-justifies under "Delegating to peer" and the
    // Restart button targets the actual peer (NOT us). Regression
    // for the bug where these rows rendered as "← From Self" with
    // a Restart button that would have restarted the user's own
    // workspace.
    const m = toCommMessage(
      makeEntry({
        activity_type: "a2a_receive",
        source_id: SELF,
        target_id: PEER,
        summary: "Peer Agent failed",
        status: "error",
      }),
      SELF,
    );
    expect(m!.flow).toBe("out");
    expect(m!.peerId).toBe(PEER);
    expect(m!.peerName).toBe("Peer Agent");
    expect(m!.status).toBe("error");
  });

  it("returns null when no peer can be resolved", () => {
    // a2a_receive with both ids null — discard rather than render a
    // ghost bubble pointing at "Unknown".
    const m = toCommMessage(
      makeEntry({
        activity_type: "a2a_receive",
        source_id: null,
        target_id: null,
      }),
      SELF,
    );
    expect(m).toBeNull();
  });

  it("propagates status through to the message (drives error rendering)", () => {
    const m = toCommMessage(
      makeEntry({ status: "error", activity_type: "a2a_send" }),
      SELF,
    );
    expect(m!.status).toBe("error");
  });
});
|
||||
@ -0,0 +1,67 @@
|
||||
import { describe, it, expect } from "vitest";
|
||||
import { inferA2AErrorHint } from "../a2aErrorHint";
|
||||
|
||||
// Pure logic. Pin every named pattern so a future contributor adding a
|
||||
// new symptom doesn't accidentally collapse the buckets — and so the
|
||||
// "most specific first" ordering can't drift without a test failing.
|
||||
|
||||
// One `it` per named pattern bucket, plus negative and priority guards,
// so rule ordering in inferA2AErrorHint cannot drift silently.
describe("inferA2AErrorHint", () => {
  it("matches the Claude Code SDK init wedge specifically", () => {
    const hint = inferA2AErrorHint("Control request timeout: initialize");
    expect(hint).toMatch(/Claude Code SDK is wedged/);
  });

  it("does NOT misfire on user tasks containing 'initialize' generally", () => {
    // Regression: an earlier bare-`initialize` pattern would have
    // false-positived "failed to initialize database" into the SDK
    // wedge hint. Confirm the full-phrase guard holds.
    const hint = inferA2AErrorHint("failed to initialize database connection");
    expect(hint).not.toMatch(/Claude Code SDK/);
  });

  it("recognises httpx ReadTimeout / ConnectTimeout class names", () => {
    expect(inferA2AErrorHint("ReadTimeout: timeout")).toMatch(/proxy timeout/);
    expect(inferA2AErrorHint("ConnectTimeout: ...")).toMatch(/proxy timeout/);
  });

  it("recognises generic timeout / deadline-exceeded language", () => {
    expect(inferA2AErrorHint("deadline exceeded after 300s")).toMatch(/proxy timeout/);
    expect(inferA2AErrorHint("Operation timeout")).toMatch(/proxy timeout/);
  });

  it("handles connection-reset family (RemoteProtocolError, ConnectionReset, no-message)", () => {
    expect(inferA2AErrorHint("RemoteProtocolError: ...")).toMatch(/connection.*dropped/);
    expect(inferA2AErrorHint("ConnectionResetError")).toMatch(/connection.*dropped/);
    expect(inferA2AErrorHint("connection reset by peer")).toMatch(/connection.*dropped/);
    expect(inferA2AErrorHint("RemoteProtocolError (no message — likely connection reset)")).toMatch(/connection.*dropped/);
  });

  it("recognises agent-runtime exceptions", () => {
    expect(inferA2AErrorHint("Agent error: ValueError raised")).toMatch(/runtime threw an exception/);
    expect(inferA2AErrorHint("RuntimeException in tool call")).toMatch(/runtime threw an exception/);
  });

  it("recognises peer-unreachable cases (Activity-tab originals)", () => {
    expect(inferA2AErrorHint("workspace not found")).toMatch(/can't be reached/);
    expect(inferA2AErrorHint("not accessible")).toMatch(/can't be reached/);
    expect(inferA2AErrorHint("workspace is offline")).toMatch(/can't be reached/);
  });

  it("returns the empty-detail-specific hint when input is exactly empty", () => {
    expect(inferA2AErrorHint("")).toMatch(/no error detail/);
  });

  it("returns a generic fallback for unrecognised text", () => {
    const hint = inferA2AErrorHint("some completely novel error nobody has matched yet");
    expect(hint).toMatch(/Check the workspace logs|delivery failure/);
  });

  it("Claude SDK wedge wins over the more general timeout pattern", () => {
    // Both 'control request timeout' and 'timeout' match the same
    // input. The SDK wedge hint is more actionable; the ordering in
    // the function must keep it first. Lock that priority in.
    const hint = inferA2AErrorHint("Control request timeout: initialize");
    expect(hint).toMatch(/Claude Code SDK/);
    expect(hint).not.toMatch(/proxy timeout/);
  });
});
|
||||
@ -0,0 +1,41 @@
|
||||
import { describe, it, expect } from "vitest";
|
||||
import { ACTIVITY_LOG_WINDOW, appendActivityLine } from "../activityLog";
|
||||
|
||||
// Pins the three behaviors the UI depends on: dedupe-by-reference,
// bounded window, and no mutation of the previous array.
describe("appendActivityLine", () => {
  it("appends a fresh line", () => {
    expect(appendActivityLine([], "📄 Read /a")).toEqual(["📄 Read /a"]);
  });

  it("collapses an immediate duplicate", () => {
    const prev = ["📄 Read /a"];
    // Same exact string twice in a row is noise — the helper should
    // return the original array reference, not a new one.
    expect(appendActivityLine(prev, "📄 Read /a")).toBe(prev);
  });

  it("keeps non-adjacent duplicates", () => {
    const prev = ["📄 Read /a", "⚡ Bash: ls"];
    expect(appendActivityLine(prev, "📄 Read /a")).toEqual([
      "📄 Read /a",
      "⚡ Bash: ls",
      "📄 Read /a",
    ]);
  });

  it("rolls off the oldest line when the window fills", () => {
    const seed = Array.from({ length: ACTIVITY_LOG_WINDOW }, (_, i) => `line-${i}`);
    const next = appendActivityLine(seed, "newest");
    expect(next.length).toBe(ACTIVITY_LOG_WINDOW);
    expect(next[next.length - 1]).toBe("newest");
    // Oldest entry is dropped — line-0 is gone.
    expect(next[0]).toBe("line-1");
  });

  it("keeps the original array reference when below the window cap", () => {
    // NOTE(review): despite the title, the body pins NON-MUTATION — the
    // helper returns a NEW array on append and leaves `prev` untouched.
    const prev = ["a", "b"];
    const next = appendActivityLine(prev, "c");
    // Returned a new array (we appended); must NOT mutate prev.
    expect(prev).toEqual(["a", "b"]);
    expect(next).toEqual(["a", "b", "c"]);
  });
});
|
||||
@ -4,6 +4,7 @@ import {
|
||||
extractResponseText,
|
||||
extractAgentText,
|
||||
extractTextsFromParts,
|
||||
extractFilesFromTask,
|
||||
} from "../message-parser";
|
||||
|
||||
describe("extractRequestText", () => {
|
||||
@ -99,6 +100,67 @@ describe("extractResponseText", () => {
|
||||
it("returns empty when result has no parts", () => {
|
||||
expect(extractResponseText({ result: { other: true } })).toBe("");
|
||||
});
|
||||
|
||||
// Regression: Claude Code (and other long-reply runtimes) emits
|
||||
// multi-part text replies. The previous implementation returned
|
||||
// only the first part, silently truncating the rest. Observed
|
||||
// 2026-04-25 on a 15k-char Wave 1 brief that rendered as just the
|
||||
// markdown table header.
|
||||
it("joins all text parts when result.parts has multiple", () => {
|
||||
const body = {
|
||||
result: {
|
||||
parts: [
|
||||
{ kind: "text", text: "# Header" },
|
||||
{ kind: "text", text: "| Col |" },
|
||||
{ kind: "text", text: "| --- |" },
|
||||
{ kind: "text", text: "| Row |" },
|
||||
],
|
||||
},
|
||||
};
|
||||
expect(extractResponseText(body)).toBe("# Header\n| Col |\n| --- |\n| Row |");
|
||||
});
|
||||
|
||||
it("joins all text parts across multiple artifacts", () => {
|
||||
const body = {
|
||||
result: {
|
||||
artifacts: [
|
||||
{ parts: [{ kind: "text", text: "First artifact" }] },
|
||||
{ parts: [{ kind: "text", text: "Second artifact" }] },
|
||||
],
|
||||
},
|
||||
};
|
||||
expect(extractResponseText(body)).toBe("First artifact\nSecond artifact");
|
||||
});
|
||||
|
||||
it("joins all .root.text variants when present", () => {
|
||||
const body = {
|
||||
result: {
|
||||
parts: [
|
||||
{ root: { text: "alpha" } },
|
||||
{ root: { text: "beta" } },
|
||||
],
|
||||
},
|
||||
};
|
||||
expect(extractResponseText(body)).toBe("alpha\nbeta");
|
||||
});
|
||||
|
||||
// Regression: when a response carries BOTH parts and artifacts
|
||||
// (Hermes tool-call replies do this — summary in parts, detail in
|
||||
// artifacts), the early-return-on-parts implementation silently
|
||||
// dropped the artifacts body. The collected-from-every-source
|
||||
// implementation must surface both.
|
||||
it("collects text from BOTH result.parts AND result.artifacts when both present", () => {
|
||||
const body = {
|
||||
result: {
|
||||
parts: [{ kind: "text", text: "Summary" }],
|
||||
artifacts: [
|
||||
{ parts: [{ kind: "text", text: "Detail block one" }] },
|
||||
{ parts: [{ kind: "text", text: "Detail block two" }] },
|
||||
],
|
||||
},
|
||||
};
|
||||
expect(extractResponseText(body)).toBe("Summary\nDetail block one\nDetail block two");
|
||||
});
|
||||
});
|
||||
|
||||
describe("extractTextsFromParts", () => {
|
||||
@ -133,3 +195,71 @@ describe("extractTextsFromParts", () => {
|
||||
expect(extractTextsFromParts(parts)).toBe("Only text");
|
||||
});
|
||||
});
|
||||
|
||||
// Covers every shape extractFilesFromTask walks: parts[], artifacts[],
// message.parts, plus the skip/fallback rules for uri and name.
describe("extractFilesFromTask", () => {
  it("pulls A2A file parts out of a result", () => {
    const task = {
      parts: [
        { kind: "text", text: "here's the report" },
        {
          kind: "file",
          file: { name: "report.pdf", mimeType: "application/pdf", uri: "workspace:/reports/report.pdf", size: 4096 },
        },
      ],
    };
    const files = extractFilesFromTask(task);
    expect(files).toEqual([
      { name: "report.pdf", mimeType: "application/pdf", uri: "workspace:/reports/report.pdf", size: 4096 },
    ]);
  });

  it("recovers a filename from the URI when `name` is absent", () => {
    const task = {
      parts: [
        { kind: "file", file: { uri: "workspace:/workspace/out/graph.png" } },
      ],
    };
    const files = extractFilesFromTask(task);
    expect(files[0].name).toBe("graph.png");
  });

  it("skips file parts without a URI (inline bytes are not supported yet)", () => {
    const task = {
      parts: [
        { kind: "file", file: { name: "inline.bin", bytes: "AAA=" } },
      ],
    };
    expect(extractFilesFromTask(task)).toEqual([]);
  });

  it("walks artifacts[] so file parts nested inside artifact envelopes are found", () => {
    const task = {
      artifacts: [
        {
          parts: [
            { kind: "file", file: { name: "trace.log", uri: "workspace:/logs/trace.log" } },
          ],
        },
      ],
    };
    const files = extractFilesFromTask(task);
    expect(files[0]).toMatchObject({ name: "trace.log", uri: "workspace:/logs/trace.log" });
  });

  it("returns [] on malformed input rather than throwing", () => {
    expect(extractFilesFromTask({})).toEqual([]);
    expect(extractFilesFromTask({ parts: "not-an-array" } as unknown as Record<string, unknown>)).toEqual([]);
  });

  it("walks result.message.parts — the non-task reply shape some A2A servers use", () => {
    const task = {
      message: {
        parts: [
          { kind: "file", file: { name: "out.txt", uri: "workspace:/workspace/out.txt" } },
        ],
      },
    };
    const files = extractFilesFromTask(task);
    expect(files[0]).toMatchObject({ name: "out.txt", uri: "workspace:/workspace/out.txt" });
  });
});
|
||||
|
||||
41
canvas/src/components/tabs/chat/__tests__/uploads.test.ts
Normal file
41
canvas/src/components/tabs/chat/__tests__/uploads.test.ts
Normal file
@ -0,0 +1,41 @@
|
||||
import { describe, it, expect } from "vitest";
|
||||
import { resolveAttachmentHref } from "../uploads";
|
||||
|
||||
// One test per accepted scheme, plus the two passthrough branches.
describe("resolveAttachmentHref — URI scheme normalisation", () => {
  const wsId = "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee";

  it("rewrites the canonical workspace:<path> scheme to /chat/download", () => {
    const url = resolveAttachmentHref(wsId, "workspace:/workspace/report.pdf");
    expect(url).toContain(`/workspaces/${wsId}/chat/download`);
    expect(url).toContain(encodeURIComponent("/workspace/report.pdf"));
  });

  it("accepts bare absolute container paths (some agents omit the scheme)", () => {
    const url = resolveAttachmentHref(wsId, "/workspace/report.pdf");
    expect(url).toContain(`/workspaces/${wsId}/chat/download`);
    expect(url).toContain(encodeURIComponent("/workspace/report.pdf"));
  });

  it("accepts file:/// URIs pointing into an allowed root", () => {
    const url = resolveAttachmentHref(wsId, "file:///workspace/report.pdf");
    expect(url).toContain(`/workspaces/${wsId}/chat/download`);
    expect(url).toContain(encodeURIComponent("/workspace/report.pdf"));
  });

  it("passes through HTTP(S) URIs unchanged so off-platform artefacts still render", () => {
    const external = "https://example.com/static/report.pdf";
    expect(resolveAttachmentHref(wsId, external)).toBe(external);
  });

  it("passes through container paths that are not under any allowed root", () => {
    // /etc/passwd looks like a path but isn't one of the allowed
    // roots — falling back to raw passthrough forces the caller into
    // the external-URL branch, which opens a new tab and lets the
    // browser refuse. Rewriting would 400 anyway server-side.
    expect(resolveAttachmentHref(wsId, "/etc/passwd")).toBe("/etc/passwd");
  });

  it("passes through unknown schemes unchanged", () => {
    expect(resolveAttachmentHref(wsId, "s3://bucket/key")).toBe("s3://bucket/key");
  });
});
|
||||
54
canvas/src/components/tabs/chat/a2aErrorHint.ts
Normal file
54
canvas/src/components/tabs/chat/a2aErrorHint.ts
Normal file
@ -0,0 +1,54 @@
|
||||
/**
|
||||
* Maps an A2A delivery-failure detail string (the bit AFTER stripping
|
||||
* the [A2A_ERROR] sentinel prefix) to a one-line operator-actionable
|
||||
* hint. Pattern matches are lowercase substring checks, ordered most-
|
||||
* specific first so the right hint wins when multiple patterns
|
||||
* overlap (e.g. "control request timeout" wins over generic "timeout").
|
||||
*
|
||||
* Used by both the chat Agent Comms panel and the Activity tab so the
|
||||
* same symptom reads identically across surfaces. Two prior copies
|
||||
* had already drifted (Activity tab gained `not found`/`offline`
|
||||
* cases AgentCommsPanel never picked up) — this module is the merged
|
||||
* superset and the only place hint text should change.
|
||||
*/
|
||||
export function inferA2AErrorHint(detail: string): string {
|
||||
const t = detail.toLowerCase();
|
||||
|
||||
// "control request timeout" is the specific Claude Code SDK init
|
||||
// wedge symptom. Pattern on the full phrase, not bare "initialize"
|
||||
// — a user task containing "failed to initialize database" would
|
||||
// false-positive into the SDK-wedge hint.
|
||||
if (t.includes("control request timeout")) {
|
||||
return "The remote agent's Claude Code SDK is wedged on initialization (often after a long idle period or OAuth refresh). A workspace restart usually clears it.";
|
||||
}
|
||||
if (
|
||||
t.includes("readtimeout") ||
|
||||
t.includes("connecttimeout") ||
|
||||
t.includes("deadline exceeded") ||
|
||||
t.includes("timeout")
|
||||
) {
|
||||
return "The remote agent didn't respond within the proxy timeout. It may be busy with a long task, or the runtime is stuck — restart the workspace if this repeats.";
|
||||
}
|
||||
if (
|
||||
t.includes("connectionreset") ||
|
||||
t.includes("remoteprotocolerror") ||
|
||||
t.includes("connection reset") ||
|
||||
t.includes("no message")
|
||||
) {
|
||||
return "The connection to the remote agent dropped before a reply arrived. Usually a transient network blip — retry once. If it repeats, the remote container may have crashed mid-request; check its logs.";
|
||||
}
|
||||
if (t.includes("agent error") || t.includes("exception")) {
|
||||
return "The remote agent's runtime threw an exception. Check the workspace's container logs for the traceback. Restart usually clears transient runtime crashes.";
|
||||
}
|
||||
if (
|
||||
t.includes("not found") ||
|
||||
t.includes("not accessible") ||
|
||||
t.includes("offline")
|
||||
) {
|
||||
return "The remote workspace can't be reached — it may be stopped, removed, or outside the access control list. Verify the peer is online before retrying.";
|
||||
}
|
||||
if (detail === "") {
|
||||
return "The remote agent returned no error detail (the underlying httpx exception had an empty message — typically a connection-reset or silent timeout). A workspace restart is the safe first move.";
|
||||
}
|
||||
return "The remote agent reported a delivery failure. Check the workspace logs or try restarting.";
|
||||
}
|
||||
23
canvas/src/components/tabs/chat/activityLog.ts
Normal file
23
canvas/src/components/tabs/chat/activityLog.ts
Normal file
@ -0,0 +1,23 @@
|
||||
/**
|
||||
* Sliding-window log for the in-chat activity feed (the live progress
|
||||
* lines under the spinner while a chat reply is in flight).
|
||||
*
|
||||
* Sized to fit the spinner area without forcing a scroll; per-tool-use
|
||||
* rows from the workspace's _report_tool_use can fire dozens per turn
|
||||
* (Read 5 files + Grep + Bash + Edits + delegations), so a too-small
|
||||
* window flushes useful early context before the user can read it.
|
||||
*
|
||||
* Consecutive identical lines collapse to a single entry — the same
|
||||
* tool repeated on the same target (e.g. Read of the same file twice
|
||||
* within a turn) is noise, not new progress.
|
||||
*/
|
||||
export const ACTIVITY_LOG_WINDOW = 20;
|
||||
|
||||
export function appendActivityLine(prev: string[], line: string): string[] {
|
||||
if (prev[prev.length - 1] === line) return prev; // collapse duplicates
|
||||
const next =
|
||||
prev.length >= ACTIVITY_LOG_WINDOW
|
||||
? prev.slice(-(ACTIVITY_LOG_WINDOW - 1))
|
||||
: prev;
|
||||
return [...next, line];
|
||||
}
|
||||
@ -32,6 +32,64 @@ export function extractTextsFromParts(parts: unknown): string | null {
|
||||
return texts.length > 0 ? texts.join("\n") : null;
|
||||
}
|
||||
|
||||
/** A file attachment parsed out of an A2A response part. */
export interface ParsedFilePart {
  // Display name; the parser falls back to the URI's basename when
  // the producer omitted one.
  name: string;
  // File URI as supplied by the producer (e.g. `workspace:<abs-path>`).
  uri: string;
  // MIME type, when the producer supplied one.
  mimeType?: string;
  // Size in bytes, when the producer supplied one.
  size?: number;
}
|
||||
|
||||
/** Extract file parts from an A2A response. Walks parts[] + artifacts[].
 * Per the A2A spec a file part looks like:
 * { kind: "file", file: { name, mimeType, uri | bytes } }
 * We only surface parts that carry a `uri` — inline bytes would
 * require a different renderer (data URL) and are out of scope for
 * MVP. Names fall back to the URI's basename when absent. */
export function extractFilesFromTask(task: Record<string, unknown>): ParsedFilePart[] {
  const out: ParsedFilePart[] = [];
  const pushFromParts = (parts: unknown) => {
    if (!Array.isArray(parts)) return;
    for (const raw of parts as Array<Record<string, unknown>>) {
      // Accept both `kind` and `type` as the part discriminator.
      if (raw.kind !== "file" && raw.type !== "file") continue;
      // Some producers inline the file fields on the part itself
      // instead of nesting them under `file`.
      const file = (raw.file ?? raw) as Record<string, unknown>;
      const uri = typeof file.uri === "string" ? file.uri : "";
      if (!uri) continue; // inline-bytes parts have no uri — skip
      const name = (typeof file.name === "string" && file.name) || basename(uri);
      out.push({
        name,
        uri,
        mimeType: typeof file.mimeType === "string" ? file.mimeType : undefined,
        size: typeof file.size === "number" ? file.size : undefined,
      });
    }
  };
  try {
    // Top-level task parts.
    pushFromParts(task.parts);
    // Artifact envelopes: artifacts[].parts[].
    const artifacts = task.artifacts as Array<Record<string, unknown>> | undefined;
    if (artifacts) for (const a of artifacts) pushFromParts(a.parts);
    // Status-message parts: status.message.parts[].
    const status = task.status as Record<string, unknown> | undefined;
    if (status?.message) {
      const msg = status.message as Record<string, unknown>;
      pushFromParts(msg.parts);
    }
    // Some A2A servers wrap a non-task reply as
    // {result: {message: {parts: [...]}}} rather than {result: {parts}}.
    // Without this branch we'd silently drop file parts returned by
    // third-party implementations.
    const message = task.message as Record<string, unknown> | undefined;
    if (message) pushFromParts(message.parts);
  } catch {
    /* tolerate malformed shapes — chat falls through to text-only */
  }
  return out;
}
|
||||
|
||||
function basename(uri: string): string {
|
||||
const cleaned = uri.replace(/^workspace:/, "").replace(/^https?:\/\//, "");
|
||||
const slash = cleaned.lastIndexOf("/");
|
||||
return slash >= 0 ? cleaned.slice(slash + 1) : cleaned || "file";
|
||||
}
|
||||
|
||||
/** Extract user message text from an activity log request_body */
|
||||
export function extractRequestText(body: Record<string, unknown> | null): string {
|
||||
if (!body) return "";
|
||||
@ -41,22 +99,54 @@ export function extractRequestText(body: Record<string, unknown> | null): string
|
||||
return (parts?.[0]?.text as string) || "";
|
||||
}
|
||||
|
||||
/** Extract text from an activity log response_body (multiple possible formats) */
|
||||
/** Extract text from an activity log response_body (multiple possible formats).
|
||||
*
|
||||
* Collects from EVERY source — top-level `parts[].text`, `parts[].root.text`
|
||||
* (older nested shape), and `artifacts[].parts[].text` (task-shaped
|
||||
* replies) — and joins them with "\n". Two reasons to collect rather
|
||||
* than early-return:
|
||||
*
|
||||
* 1. Claude Code and other long-reply runtimes emit multiple text
|
||||
* parts in a single `parts` array. Returning just the first
|
||||
* silently truncates 15k-char briefs to their leading line
|
||||
* (observed UX A/B Lab Wave 1, 2026-04-25).
|
||||
*
|
||||
* 2. Some producers emit a summary in `parts[].text` AND details in
|
||||
* `artifacts[].parts[].text` (Hermes does this for tool calls).
|
||||
* The previous "first source wins" returned only the summary;
|
||||
* artifacts dropped silently. */
|
||||
export function extractResponseText(body: Record<string, unknown>): string {
|
||||
try {
|
||||
// {result: "text"} — from MCP server delegation logs
|
||||
if (typeof body.result === "string") return body.result;
|
||||
|
||||
// A2A JSON-RPC response: {result: {parts: [{kind: "text", text: "..."}]}}
|
||||
const result = body.result as Record<string, unknown> | undefined;
|
||||
if (result) {
|
||||
const collected: string[] = [];
|
||||
|
||||
// A2A JSON-RPC: {result: {parts: [{kind: "text", text: "..."}]}}
|
||||
const fromParts = extractTextsFromParts(result.parts);
|
||||
if (fromParts) collected.push(fromParts);
|
||||
|
||||
// Older nested shape: {parts: [{root: {text: "..."}}]}
|
||||
const parts = (result.parts || []) as Array<Record<string, unknown>>;
|
||||
const rootTexts: string[] = [];
|
||||
for (const p of parts) {
|
||||
const t = (p.text as string) || "";
|
||||
if (t) return t;
|
||||
const root = p.root as Record<string, unknown> | undefined;
|
||||
if (root?.text) return root.text as string;
|
||||
if (root?.text) rootTexts.push(root.text as string);
|
||||
}
|
||||
if (rootTexts.length > 0) collected.push(rootTexts.join("\n"));
|
||||
|
||||
// Task shape: {result: {artifacts: [{parts: [...]}]}}
|
||||
const artifacts = result.artifacts as Array<Record<string, unknown>> | undefined;
|
||||
if (artifacts) {
|
||||
for (const a of artifacts) {
|
||||
const t = extractTextsFromParts(a.parts);
|
||||
if (t) collected.push(t);
|
||||
}
|
||||
}
|
||||
|
||||
if (collected.length > 0) return collected.join("\n");
|
||||
}
|
||||
|
||||
// {task: "text"} — request body format, shouldn't be in response but handle it
|
||||
|
||||
@ -1,12 +1,38 @@
|
||||
/** One file attached to a chat message. Shared shape for both
 * directions: when a user attaches a file the UI uploads it and
 * stashes the returned metadata here; when an agent returns a
 * `kind: file` part in an A2A response, the parser populates the
 * same fields. `uri` uses the `workspace:<abs-path>` scheme the
 * server returns — the renderer translates that to a download
 * request against GET /workspaces/:id/chat/download. */
export interface ChatAttachment {
  // Display filename shown on the attachment chip.
  name: string;
  // Canonical file identity (`workspace:<abs-path>` or external URL).
  uri: string;
  // MIME type, when known.
  mimeType?: string;
  // Size in bytes, when known.
  size?: number;
}
||||
|
||||
/** A single chat transcript entry; fully JSON-serializable. */
export interface ChatMessage {
  id: string;
  role: "user" | "agent" | "system";
  content: string;
  /** Attachments sent with or returned alongside this message. */
  attachments?: ChatAttachment[];
  timestamp: string; // ISO string for serialization
}
||||
|
||||
export function createMessage(role: ChatMessage["role"], content: string): ChatMessage {
|
||||
return { id: crypto.randomUUID(), role, content, timestamp: new Date().toISOString() };
|
||||
export function createMessage(
|
||||
role: ChatMessage["role"],
|
||||
content: string,
|
||||
attachments?: ChatAttachment[],
|
||||
): ChatMessage {
|
||||
return {
|
||||
id: crypto.randomUUID(),
|
||||
role,
|
||||
content,
|
||||
attachments: attachments && attachments.length > 0 ? attachments : undefined,
|
||||
timestamp: new Date().toISOString(),
|
||||
};
|
||||
}
|
||||
|
||||
// appendMessageDeduped adds a ChatMessage to `prev` unless the tail
|
||||
@ -25,11 +51,23 @@ export function createMessage(role: ChatMessage["role"], content: string): ChatM
|
||||
// messages ("hi", "hi") from a real user/agent still render.
|
||||
/** Append `msg` to `prev` unless an equivalent message — same role,
 * same content, same attachment set — already landed within the last
 * `dedupeWindowMs` milliseconds, in which case `prev` is returned
 * unchanged. Never mutates `prev`. */
export function appendMessageDeduped(prev: ChatMessage[], msg: ChatMessage, dedupeWindowMs = 3000): ChatMessage[] {
  // Messages stamped at or after this instant count as duplicates.
  const cutoff = Date.now() - dedupeWindowMs;
  const sig = attachmentSignature(msg.attachments);
  const alreadyThere = prev.some((m) => {
    if (m.role !== msg.role || m.content !== msg.content) return false;
    // Attachments participate in the dedupe key so a text-only push
    // doesn't shadow the file-carrying HTTP response (and vice versa).
    // When both carry the same text AND the same files, collapse.
    if (attachmentSignature(m.attachments) !== sig) return false;
    const t = Date.parse(m.timestamp);
    // Unparseable timestamps never count toward dedupe.
    return !Number.isNaN(t) && t >= cutoff;
  });
  if (alreadyThere) return prev;
  return [...prev, msg];
}
|
||||
|
||||
function attachmentSignature(atts: ChatAttachment[] | undefined): string {
|
||||
if (!atts || atts.length === 0) return "";
|
||||
// URI is the stable identity — name can differ across delivery
|
||||
// paths (agent vs our parser's basename fallback).
|
||||
return atts.map((a) => a.uri).sort().join("|");
|
||||
}
|
||||
|
||||
135
canvas/src/components/tabs/chat/uploads.ts
Normal file
135
canvas/src/components/tabs/chat/uploads.ts
Normal file
@ -0,0 +1,135 @@
|
||||
import { PLATFORM_URL } from "@/lib/api";
|
||||
import { getTenantSlug } from "@/lib/tenant";
|
||||
import type { ChatAttachment } from "./types";
|
||||
|
||||
/** Chat attachments are intentionally uploaded via a direct fetch()
 * instead of the `api.post` helper — `api.post` JSON-stringifies the
 * body, which would 500 on a Blob. Mirrors the header plumbing
 * (tenant slug, admin token, credentials) so SaaS + self-hosted
 * callers work the same way.
 *
 * Returns the server's attachment metadata for the uploaded files
 * ([] when `files` is empty); throws on any non-2xx response or on
 * timeout. */
export async function uploadChatFiles(
  workspaceId: string,
  files: File[],
): Promise<ChatAttachment[]> {
  if (files.length === 0) return [];

  // Multipart body: one "files" field per file, original name kept.
  const form = new FormData();
  for (const f of files) form.append("files", f, f.name);

  const headers: Record<string, string> = {};
  const slug = getTenantSlug();
  if (slug) headers["X-Molecule-Org-Slug"] = slug;
  const adminToken = process.env.NEXT_PUBLIC_ADMIN_TOKEN;
  if (adminToken) headers["Authorization"] = `Bearer ${adminToken}`;

  // Uploads legitimately take a while on cold cache (tar write +
  // docker cp into the container). 60s is comfortable for the 25MB/
  // 50MB caps the server enforces.
  const res = await fetch(`${PLATFORM_URL}/workspaces/${workspaceId}/chat/uploads`, {
    method: "POST",
    headers,
    body: form,
    credentials: "include",
    signal: AbortSignal.timeout(60_000),
  });
  if (!res.ok) {
    // Best-effort body read for the error message; ignore read failures.
    const text = await res.text().catch(() => "");
    throw new Error(`upload failed: ${res.status} ${text}`);
  }
  const json = (await res.json()) as { files: ChatAttachment[] };
  return json.files ?? [];
}
|
||||
|
||||
/** Resolve a file URI into a browser-downloadable URL. Accepts:
|
||||
* - `workspace:<abs-path>` (our canonical form)
|
||||
* - `file:///workspace/...` (some agents emit this)
|
||||
* - `/workspace/...` (bare absolute path inside the container)
|
||||
* Everything that looks like an allowed-root container path is
|
||||
* rewritten to the authenticated /chat/download endpoint. HTTP(S)
|
||||
* URIs pass through unchanged so we can also render links to
|
||||
* artefacts hosted off-platform. Unknown schemes fall back to the
|
||||
* raw URI — the caller gets to decide how to render it. */
|
||||
export function resolveAttachmentHref(
|
||||
workspaceId: string,
|
||||
uri: string,
|
||||
): string {
|
||||
const containerPath = normalizeWorkspaceUri(uri);
|
||||
if (containerPath) {
|
||||
return `${PLATFORM_URL}/workspaces/${workspaceId}/chat/download?path=${encodeURIComponent(containerPath)}`;
|
||||
}
|
||||
return uri;
|
||||
}
|
||||
|
||||
/** Extracts the absolute container path from a workspace-scoped URI,
|
||||
* or null if the URI isn't a container path. The matching roots
|
||||
* mirror the server's `allowedRoots` allowlist. */
|
||||
const ALLOWED_CONTAINER_ROOTS = ["/configs", "/workspace", "/home", "/plugins"];
|
||||
|
||||
function normalizeWorkspaceUri(uri: string): string | null {
|
||||
let path: string | null = null;
|
||||
if (uri.startsWith("workspace:")) {
|
||||
path = uri.slice("workspace:".length);
|
||||
} else if (uri.startsWith("file:///")) {
|
||||
path = uri.slice("file://".length); // keep the leading slash
|
||||
} else if (uri.startsWith("/")) {
|
||||
path = uri;
|
||||
}
|
||||
if (!path) return null;
|
||||
// Only rewrite when the path lands in an allowed root; otherwise
|
||||
// return null so the caller falls through to raw-URI handling
|
||||
// (which will open a new tab for HTTP-ish schemes).
|
||||
for (const root of ALLOWED_CONTAINER_ROOTS) {
|
||||
if (path === root || path.startsWith(root + "/")) return path;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/** Trigger a browser download for an attachment. Uses fetch+blob
|
||||
* rather than an anchor navigation because the download endpoint
|
||||
* requires workspace auth — and the browser won't attach
|
||||
* `Authorization: Bearer` or `X-Molecule-Org-Slug` to a bare anchor
|
||||
* click. A 25MB per-file cap server-side keeps the blob buffer
|
||||
* bounded. HTTP(S) URIs skip the fetch path and open directly
|
||||
* since they're off-platform artefacts that we don't own auth for. */
|
||||
export async function downloadChatFile(
|
||||
workspaceId: string,
|
||||
attachment: ChatAttachment,
|
||||
): Promise<void> {
|
||||
const href = resolveAttachmentHref(workspaceId, attachment.uri);
|
||||
const isContainerPath = normalizeWorkspaceUri(attachment.uri) !== null;
|
||||
if (!isContainerPath) {
|
||||
// External URL — let the browser navigate. Opens in new tab so
|
||||
// the canvas context survives a navigation. `href` here is the
|
||||
// raw URI (http(s), or anything else the agent sent back).
|
||||
window.open(href, "_blank", "noopener,noreferrer");
|
||||
return;
|
||||
}
|
||||
|
||||
const headers: Record<string, string> = {};
|
||||
const slug = getTenantSlug();
|
||||
if (slug) headers["X-Molecule-Org-Slug"] = slug;
|
||||
const adminToken = process.env.NEXT_PUBLIC_ADMIN_TOKEN;
|
||||
if (adminToken) headers["Authorization"] = `Bearer ${adminToken}`;
|
||||
|
||||
const res = await fetch(href, {
|
||||
headers,
|
||||
credentials: "include",
|
||||
signal: AbortSignal.timeout(60_000),
|
||||
});
|
||||
if (!res.ok) {
|
||||
throw new Error(`download failed: ${res.status}`);
|
||||
}
|
||||
const blob = await res.blob();
|
||||
// Revoke the object URL after the click — browsers hold the blob
|
||||
// until the URL is either revoked or the document unloads. 30s is
|
||||
// plenty of headroom for the click → save dialog round-trip.
|
||||
const url = URL.createObjectURL(blob);
|
||||
const a = document.createElement("a");
|
||||
a.href = url;
|
||||
a.download = attachment.name;
|
||||
a.rel = "noopener";
|
||||
document.body.appendChild(a);
|
||||
a.click();
|
||||
a.remove();
|
||||
setTimeout(() => URL.revokeObjectURL(url), 30_000);
|
||||
}
|
||||
170
canvas/src/hooks/useTemplateDeploy.tsx
Normal file
170
canvas/src/hooks/useTemplateDeploy.tsx
Normal file
@ -0,0 +1,170 @@
|
||||
"use client";
|
||||
|
||||
import { useCallback, useState, type ReactNode } from "react";
|
||||
import { api } from "@/lib/api";
|
||||
import {
|
||||
checkDeploySecrets,
|
||||
resolveRuntime,
|
||||
type PreflightResult,
|
||||
type Template,
|
||||
} from "@/lib/deploy-preflight";
|
||||
import { MissingKeysModal } from "@/components/MissingKeysModal";
|
||||
|
||||
/**
|
||||
* useTemplateDeploy — shared preflight + POST + modal wiring for
|
||||
* every surface that deploys a workspace from a template.
|
||||
*
|
||||
* Owns: `checkDeploySecrets` call, `MissingKeysModal` render, the
|
||||
* `POST /workspaces` that follows, and per-template `deploying`
|
||||
* state. Returns `modal` as a `ReactNode` ready to place inline.
|
||||
*
|
||||
* Why a hook rather than two copies: the runtime-fallback table
|
||||
* (`resolveRuntime`) and the preflight wiring were previously
|
||||
* copy-pasted between TemplatePalette and EmptyState. When the
|
||||
* copies drifted (palette had the full id-to-runtime map,
|
||||
* empty-state had only the `-default` strip), the two surfaces
|
||||
* could silently disagree on future templates that need a
|
||||
* non-identity mapping. Single owner closes the drift surface.
|
||||
*/
|
||||
export interface UseTemplateDeployOptions {
  /** Compute canvas coords for the new workspace. Called once per
   * successful deploy. Defaults to random coords in the [100, 500] ×
   * [100, 400] band, matching the sidebar palette's historical
   * placement. Override for surfaces that want deterministic
   * placement (e.g. EmptyState's first-deploy "center-ish" target). */
  canvasCoords?: () => { x: number; y: number };

  /** Optional post-deploy side effect — called with the id of the
   * newly created workspace, after POST /workspaces succeeds.
   * EmptyState uses this to auto-select the node and flip the side
   * panel to Chat so a fresh tenant sees something useful. */
  onDeployed?: (workspaceId: string) => void;
}
|
||||
|
||||
/** Paired template + preflight result carried through the "user
 * clicked deploy → modal opens → keys saved → retry" loop. Named
 * (rather than an inline object type) so the `useState` generic and
 * any future signature change have a single place to track. */
interface MissingKeysInfo {
  // The template whose deploy was blocked by missing keys; retried
  // via executeDeploy once the user saves the keys.
  template: Template;
  // The not-ok preflight result that opened the modal (missing keys,
  // provider alternatives, runtime name).
  preflight: PreflightResult;
}
|
||||
|
||||
export interface UseTemplateDeployResult {
  /** Template id currently being deployed (incl. the preflight
   * network call), or null when idle. Callers pass this to disable
   * the relevant button and show a spinner. */
  deploying: string | null;

  /** Last deploy error message, or null. Cleared on the next
   * `deploy` call. */
  error: string | null;

  /** Kick off a deploy. Opens the missing-keys modal if preflight
   * returns not-ok; otherwise fires POST /workspaces directly. */
  deploy: (template: Template) => Promise<void>;

  /** The missing-keys modal, ready to place inline. Always non-null
   * (the underlying component self-gates on `open`), so the caller
   * can drop `{modal}` anywhere without conditionals. */
  modal: ReactNode;
}
|
||||
|
||||
export function useTemplateDeploy(
  options: UseTemplateDeployOptions = {},
): UseTemplateDeployResult {
  // `deploying` holds the template *id* (not a bool) so surfaces with
  // many buttons can disable only the one that was clicked.
  const [deploying, setDeploying] = useState<string | null>(null);
  const [error, setError] = useState<string | null>(null);
  // Non-null while the missing-keys modal is open; carries what to
  // retry once the user saves the keys.
  const [missingKeysInfo, setMissingKeysInfo] = useState<MissingKeysInfo | null>(null);

  const { canvasCoords, onDeployed } = options;

  /** Actually execute the POST /workspaces call. Split from `deploy`
   * so the "modal → keys added → retry" path can reuse it without
   * re-running preflight (the user just proved the keys are now set). */
  const executeDeploy = useCallback(
    async (template: Template) => {
      setDeploying(template.id);
      setError(null);
      try {
        // Default placement: random point in [100, 500] × [100, 400],
        // matching the sidebar palette's historical behavior.
        const coords = canvasCoords
          ? canvasCoords()
          : {
              x: Math.random() * 400 + 100,
              y: Math.random() * 300 + 100,
            };
        const ws = await api.post<{ id: string }>("/workspaces", {
          name: template.name,
          template: template.id,
          tier: template.tier,
          canvas: coords,
        });
        onDeployed?.(ws.id);
      } catch (e) {
        setError(e instanceof Error ? e.message : "Deploy failed");
      } finally {
        // Always clear the spinner — success and failure alike.
        setDeploying(null);
      }
    },
    [canvasCoords, onDeployed],
  );

  const deploy = useCallback(
    async (template: Template) => {
      setDeploying(template.id);
      setError(null);
      let preflight: PreflightResult;
      try {
        // Prefer the server-provided runtime; fall back to the legacy
        // id→runtime table for older /templates responses.
        const runtime = template.runtime ?? resolveRuntime(template.id);
        preflight = await checkDeploySecrets({
          runtime,
          models: template.models,
          required_env: template.required_env,
        });
      } catch (e) {
        // Preflight network failure used to strand `deploying` — the
        // button stayed disabled forever because the throw bypassed
        // the setDeploying(null) in the non-ok branch below. Any
        // future refactor that drops this try block will regress the
        // same way; keep it narrow around just the preflight call
        // so a successful preflight still lets executeDeploy own
        // its own error path.
        setError(e instanceof Error ? e.message : "Preflight check failed");
        setDeploying(null);
        return;
      }
      if (!preflight.ok) {
        // Missing keys: hand off to the modal instead of deploying.
        setMissingKeysInfo({ template, preflight });
        setDeploying(null);
        return;
      }
      await executeDeploy(template);
    },
    [executeDeploy],
  );

  // No useCallback here — consumers call this on every render anyway
  // (it's placed inline in JSX), and useCallback's deps would
  // invalidate on every state change, making the memoisation a wash.
  // Plain ReactNode is simpler and equally performant.
  const modal: ReactNode = (
    <MissingKeysModal
      open={!!missingKeysInfo}
      missingKeys={missingKeysInfo?.preflight.missingKeys ?? []}
      providers={missingKeysInfo?.preflight.providers ?? []}
      runtime={missingKeysInfo?.preflight.runtime ?? ""}
      onKeysAdded={() => {
        if (missingKeysInfo) {
          const template = missingKeysInfo.template;
          setMissingKeysInfo(null);
          // Intentional fire-and-forget — executeDeploy manages
          // its own error state via setError.
          void executeDeploy(template);
        }
      }}
      onCancel={() => setMissingKeysInfo(null)}
    />
  );

  return { deploying, error, deploy, modal };
}
|
||||
@ -7,7 +7,7 @@ import { describe, it, expect, beforeEach, afterEach, vi } from "vitest";
|
||||
const mockFetch = vi.fn();
|
||||
globalThis.fetch = mockFetch;
|
||||
|
||||
import { api } from "../api";
|
||||
import { api, PlatformUnavailableError } from "../api";
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Helpers
|
||||
@ -380,3 +380,99 @@ describe("api – request timeout signal", () => {
|
||||
expect(sigA).not.toBe(sigB);
|
||||
});
|
||||
});
|
||||
|
||||
// ---------------------------------------------------------------------------
// PlatformUnavailableError classification
// ---------------------------------------------------------------------------
//
// When the platform's wsauth middleware can't reach Postgres/Redis to
// validate a token, it returns 503 + {error, code:"platform_unavailable"}.
// api.ts must surface that as a typed error so the page-level renderer
// can show a dedicated diagnostic instead of a generic 5xx toast.

describe("PlatformUnavailableError classification", () => {
  beforeEach(() => {
    mockFetch.mockReset();
  });

  // Queue a single 503 whose body carries the platform's structured
  // "datastore unavailable" shape. `json()` rejects on purpose — the
  // client is expected to read the body via text() and parse it itself.
  function mock503Platform(detail = "platform datastore unavailable — retry shortly") {
    const body = JSON.stringify({ error: detail, code: "platform_unavailable" });
    mockFetch.mockResolvedValueOnce({
      ok: false,
      status: 503,
      json: () => Promise.reject(new Error("not used")),
      text: () => Promise.resolve(body),
    } as unknown as Response);
  }

  it("throws PlatformUnavailableError on 503 + code=platform_unavailable", async () => {
    mock503Platform();
    let thrown: unknown;
    try {
      await api.get("/workspaces");
    } catch (e) {
      thrown = e;
    }
    expect(thrown).toBeInstanceOf(PlatformUnavailableError);
    expect((thrown as PlatformUnavailableError).code).toBe("platform_unavailable");
  });

  it("preserves the server-provided error string as the Error message", async () => {
    mock503Platform("Postgres unreachable");
    try {
      await api.get("/workspaces");
    } catch (e) {
      expect(e).toBeInstanceOf(PlatformUnavailableError);
      expect((e as Error).message).toBe("Postgres unreachable");
      return;
    }
    // Reaching here means api.get resolved — the classification never ran.
    throw new Error("expected to throw");
  });

  it("does NOT classify a generic 503 (no platform_unavailable code) as PlatformUnavailableError", async () => {
    // Generic upstream-busy 503 — should keep the legacy generic-Error
    // path so existing busy-retry UX isn't disrupted.
    mockFetch.mockResolvedValueOnce({
      ok: false,
      status: 503,
      json: () => Promise.reject(new Error("not used")),
      text: () => Promise.resolve(JSON.stringify({ error: "upstream busy" })),
    } as unknown as Response);
    try {
      await api.get("/workspaces/x/a2a");
    } catch (e) {
      expect(e).not.toBeInstanceOf(PlatformUnavailableError);
      expect((e as Error).message).toContain("503");
      return;
    }
    throw new Error("expected to throw");
  });

  it("does NOT classify on 500 (server kept legacy 500 for true internal errors)", async () => {
    mockFailure(500, "boom");
    try {
      await api.get("/workspaces");
    } catch (e) {
      expect(e).not.toBeInstanceOf(PlatformUnavailableError);
      return;
    }
    throw new Error("expected to throw");
  });

  it("falls back to generic Error when 503 body isn't JSON", async () => {
    // Non-JSON body (plain "Service Unavailable") must not trip the
    // classifier — JSON.parse failure falls through to the generic path.
    mockFetch.mockResolvedValueOnce({
      ok: false,
      status: 503,
      json: () => Promise.reject(new Error("not used")),
      text: () => Promise.resolve("Service Unavailable"),
    } as unknown as Response);
    try {
      await api.get("/workspaces");
    } catch (e) {
      expect(e).not.toBeInstanceOf(PlatformUnavailableError);
      expect((e as Error).message).toContain("503");
      return;
    }
    throw new Error("expected to throw");
  });
});
|
||||
|
||||
@ -107,11 +107,39 @@ async function request<T>(
|
||||
}
|
||||
if (!res.ok) {
|
||||
const text = await res.text();
|
||||
// Recognise the platform's structured "datastore unreachable"
|
||||
// shape (returned by wsauth_middleware.abortAuthLookupError when
|
||||
// Postgres/Redis is down). Surface as a typed error so callers
|
||||
// can render a dedicated diagnostic instead of a generic toast.
|
||||
if (res.status === 503 && text) {
|
||||
try {
|
||||
const parsed = JSON.parse(text) as { code?: string; error?: string };
|
||||
if (parsed.code === "platform_unavailable") {
|
||||
throw new PlatformUnavailableError(parsed.error || "platform datastore unavailable");
|
||||
}
|
||||
} catch (err) {
|
||||
// Re-throw the typed error if that's what we just constructed.
|
||||
// JSON.parse failures fall through to the generic Error below.
|
||||
if (err instanceof PlatformUnavailableError) throw err;
|
||||
}
|
||||
}
|
||||
throw new Error(`API ${method} ${path}: ${res.status} ${text}`);
|
||||
}
|
||||
return res.json();
|
||||
}
|
||||
|
||||
/** Thrown when the platform reports its datastore (Postgres/Redis) is
 * unreachable. Surface with a dedicated diagnostic UI rather than a
 * generic API-error toast — the user's next action is to check local
 * services, not to retry the API call. */
export class PlatformUnavailableError extends Error {
  /** Discriminant mirroring the server's structured `code` field;
   * fixed at construction so callers can branch without instanceof. */
  readonly code = "platform_unavailable" as const;

  constructor(message: string) {
    super(message);
    // Explicit name so stack traces / logs read as the typed error
    // rather than a plain "Error".
    this.name = "PlatformUnavailableError";
  }
}
||||
|
||||
export const api = {
|
||||
get: <T>(path: string, options?: RequestOptions) => request<T>("GET", path, undefined, 0, options),
|
||||
post: <T>(path: string, body?: unknown, options?: RequestOptions) => request<T>("POST", path, body, 0, options),
|
||||
|
||||
@ -33,6 +33,46 @@ export interface TemplateLike {
|
||||
required_env?: string[];
|
||||
}
|
||||
|
||||
/** Full /templates response shape shared by TemplatePalette (sidebar)
 * and EmptyState (welcome grid). Was previously re-declared in each
 * with subtly different fields — EmptyState's narrower shape silently
 * dropped `runtime`, `models`, and `required_env`, so the preflight
 * couldn't see provider alternatives the template declared. Keep this
 * the single source of truth. */
export interface Template extends TemplateLike {
  // Stable template identifier; also the key fed to resolveRuntime()
  // on legacy responses that lack an explicit runtime.
  id: string;
  name: string;
  description: string;
  // Workspace tier requested on deploy (forwarded to POST /workspaces).
  tier: number;
  model: string;
  skills: string[];
  skill_count: number;
}
|
||||
|
||||
/** Map from a template id to the runtime name the per-workspace
|
||||
* preflight expects. Used only when the server's `/templates`
|
||||
* response predates the `runtime` field on the summary (legacy
|
||||
* installs) — modern responses carry it verbatim. Strip `-default`
|
||||
* for the claude-code template and identity-map everything else
|
||||
* that matches our current runtime registry.
|
||||
*
|
||||
* Lives in the preflight module (not TemplatePalette) so EmptyState
|
||||
* uses the SAME fallback table. A previous duplication in both call
|
||||
* sites left EmptyState with only the `-default` suffix strip, which
|
||||
* would silently disagree with TemplatePalette on templates whose
|
||||
* id needs a non-identity mapping. */
|
||||
export function resolveRuntime(templateId: string): string {
|
||||
const runtimeMap: Record<string, string> = {
|
||||
langgraph: "langgraph",
|
||||
"claude-code-default": "claude-code",
|
||||
openclaw: "openclaw",
|
||||
deepagents: "deepagents",
|
||||
crewai: "crewai",
|
||||
autogen: "autogen",
|
||||
};
|
||||
return runtimeMap[templateId] ?? templateId.replace(/-default$/, "");
|
||||
}
|
||||
|
||||
export interface SecretEntry {
|
||||
key: string;
|
||||
has_value: boolean;
|
||||
|
||||
@ -5,27 +5,34 @@ import { describe, it, expect, beforeEach, vi } from "vitest";
|
||||
global.fetch = vi.fn();
|
||||
|
||||
import { useCanvasStore } from "../canvas";
|
||||
import type { WorkspaceData } from "../socket";
|
||||
import type { WorkspaceNodeData } from "../canvas";
|
||||
|
||||
function makeWS(overrides: Partial<WorkspaceData> & { id: string }): WorkspaceData {
|
||||
function makeWS(
|
||||
overrides: Partial<WorkspaceNodeData> & { id: string },
|
||||
): WorkspaceNodeData {
|
||||
// makeWS builds a minimal WorkspaceNodeData for tests that set state
|
||||
// directly on the store (bypassing hydrate). The `id` override is
|
||||
// ignored — node IDs live on the outer Node<> wrapper, not inside
|
||||
// `data`. It's accepted here so callers can keep their existing
|
||||
// `makeWS({ id: "ws-foo" })` call sites even though the id is only
|
||||
// used on the Node<> wrapper at the call site.
|
||||
void overrides.id;
|
||||
return {
|
||||
name: "WS",
|
||||
role: "agent",
|
||||
tier: 1,
|
||||
status: "online",
|
||||
agent_card: null,
|
||||
agentCard: null,
|
||||
url: "http://localhost:9000",
|
||||
parent_id: null,
|
||||
active_tasks: 0,
|
||||
last_error_rate: 0,
|
||||
last_sample_error: "",
|
||||
uptime_seconds: 60,
|
||||
current_task: "",
|
||||
x: 0,
|
||||
y: 0,
|
||||
parentId: null,
|
||||
activeTasks: 0,
|
||||
lastErrorRate: 0,
|
||||
lastSampleError: "",
|
||||
currentTask: "",
|
||||
collapsed: false,
|
||||
runtime: "",
|
||||
budget_limit: null,
|
||||
needsRestart: false,
|
||||
budgetLimit: null,
|
||||
...overrides,
|
||||
};
|
||||
}
|
||||
@ -148,13 +155,13 @@ describe("batchRestart — partial failure", () => {
|
||||
id: "ws-ok",
|
||||
type: "workspace",
|
||||
position: { x: 0, y: 0 },
|
||||
data: { ...makeWS({ id: "ws-ok" }), needsRestart: true } as WorkspaceData & { needsRestart: boolean },
|
||||
data: { ...makeWS({ id: "ws-ok" }), needsRestart: true } as WorkspaceNodeData,
|
||||
},
|
||||
{
|
||||
id: "ws-fail",
|
||||
type: "workspace",
|
||||
position: { x: 0, y: 0 },
|
||||
data: { ...makeWS({ id: "ws-fail" }), needsRestart: true } as WorkspaceData & { needsRestart: boolean },
|
||||
data: { ...makeWS({ id: "ws-fail" }), needsRestart: true } as WorkspaceNodeData,
|
||||
},
|
||||
],
|
||||
selectedNodeIds: new Set(["ws-ok", "ws-fail"]),
|
||||
@ -166,7 +173,7 @@ describe("batchRestart — partial failure", () => {
|
||||
});
|
||||
|
||||
const byId = Object.fromEntries(
|
||||
useCanvasStore.getState().nodes.map((n) => [n.id, n.data as WorkspaceData & { needsRestart?: boolean }])
|
||||
useCanvasStore.getState().nodes.map((n) => [n.id, n.data as WorkspaceNodeData])
|
||||
);
|
||||
expect(byId["ws-ok"].needsRestart).toBe(false);
|
||||
expect(byId["ws-fail"].needsRestart).toBe(true);
|
||||
@ -179,7 +186,7 @@ describe("batchRestart — partial failure", () => {
|
||||
id: "ws-fail",
|
||||
type: "workspace",
|
||||
position: { x: 0, y: 0 },
|
||||
data: { ...makeWS({ id: "ws-fail" }), needsRestart: true } as WorkspaceData & { needsRestart: boolean },
|
||||
data: { ...makeWS({ id: "ws-fail" }), needsRestart: true } as WorkspaceNodeData,
|
||||
},
|
||||
],
|
||||
selectedNodeIds: new Set(["ws-fail"]),
|
||||
|
||||
@ -67,7 +67,19 @@ describe("canvas-events – molecule:pan-to-node dispatch", () => {
|
||||
vi.restoreAllMocks();
|
||||
});
|
||||
|
||||
it("dispatches molecule:pan-to-node with the new nodeId for a NEW provision", () => {
|
||||
it("dispatches both molecule:pan-to-node AND molecule:fit-deploying-org for a NEW root-level provision", () => {
|
||||
// Two custom events are dispatched on NEW root-level provision:
|
||||
// 1. molecule:fit-deploying-org — tells useCanvasViewport to
|
||||
// frame the whole deploying subtree. Fires for root nodes
|
||||
// too (commit 5adc8a74) so the canvas centers the just-
|
||||
// landed root immediately instead of waiting for the
|
||||
// first child to arrive.
|
||||
// 2. molecule:pan-to-node — pans/zooms to the single node;
|
||||
// only for standalone creates (no parent), so org-import
|
||||
// children don't chase the spawn animation.
|
||||
// A previous version of this test expected only #2 and failed
|
||||
// when #1 was added for roots. If only one of these ever fires
|
||||
// again, this test should flag the regression.
|
||||
const { get, set } = makeStore([]);
|
||||
const dispatched: Event[] = [];
|
||||
const spy = vi.spyOn(window, "dispatchEvent").mockImplementation((e) => {
|
||||
@ -81,9 +93,15 @@ describe("canvas-events – molecule:pan-to-node dispatch", () => {
|
||||
set
|
||||
);
|
||||
|
||||
expect(dispatched).toHaveLength(1);
|
||||
expect(dispatched[0].type).toBe("molecule:pan-to-node");
|
||||
expect((dispatched[0] as CustomEvent).detail?.nodeId).toBe("ws-new");
|
||||
expect(dispatched).toHaveLength(2);
|
||||
const panEvent = dispatched.find((e) => e.type === "molecule:pan-to-node");
|
||||
const fitEvent = dispatched.find((e) => e.type === "molecule:fit-deploying-org");
|
||||
expect(panEvent, "molecule:pan-to-node should fire for standalone create").toBeDefined();
|
||||
expect(fitEvent, "molecule:fit-deploying-org should fire so the viewport frames the root").toBeDefined();
|
||||
expect((panEvent as CustomEvent).detail?.nodeId).toBe("ws-new");
|
||||
expect((fitEvent as CustomEvent).detail?.rootId).toBe("ws-new");
|
||||
|
||||
spy.mockRestore();
|
||||
});
|
||||
|
||||
it("does NOT dispatch molecule:pan-to-node when restarting an existing node", () => {
|
||||
|
||||
@ -149,6 +149,75 @@ describe("buildNodesAndEdges – parent + child workspaces", () => {
|
||||
});
|
||||
});
|
||||
|
||||
describe("buildNodesAndEdges – auto-rescue respects live grown parent size", () => {
|
||||
// Regression: child the user dragged into a user-grown area was
|
||||
// false-rescued by every periodic rehydrate (socket health check
|
||||
// every 30s) because the rescue heuristic used the initial
|
||||
// grid-derived parent bbox, not the currently-grown size. Result:
|
||||
// child snapped to a stale grid slot, then settled back ~1 frame
|
||||
// later when growParentsToFitChildren re-ran. Observed 2026-04-25
|
||||
// as "child jumps to weird location, then 30s later it's fine".
|
||||
|
||||
it("does NOT rescue a child placed inside the user-grown parent area", () => {
|
||||
// Parent's initial grid-derived size is small; user has since grown it
|
||||
// to 800×600. Child sits at relative (700, 400) — inside the grown
|
||||
// bbox but outside the initial bbox. Without currentParentSizes,
|
||||
// the rescue would re-place the child into a default grid slot.
|
||||
const parentAbs = { x: 100, y: 100 };
|
||||
const childAbs = { x: parentAbs.x + 700, y: parentAbs.y + 400 };
|
||||
const workspaces = [
|
||||
makeWS({ id: "parent", x: parentAbs.x, y: parentAbs.y }),
|
||||
makeWS({ id: "child", parent_id: "parent", x: childAbs.x, y: childAbs.y }),
|
||||
];
|
||||
const grownDims = new Map([
|
||||
["parent", { width: 800, height: 600 }],
|
||||
]);
|
||||
|
||||
const { nodes } = buildNodesAndEdges(workspaces, new Map(), grownDims);
|
||||
const child = nodes.find((n) => n.id === "child")!;
|
||||
// Child's relative position should match what we passed in.
|
||||
expect(child.position).toEqual({ x: 700, y: 400 });
|
||||
});
|
||||
|
||||
it("DOES rescue a child whose stored position is outside even the grown parent", () => {
|
||||
// Same parent but child is way outside (relative 5000, 5000).
|
||||
// The rescue must still fire — the heuristic isn't "always trust
|
||||
// the user", it's "trust the user up to the current parent bbox".
|
||||
const parentAbs = { x: 100, y: 100 };
|
||||
const childAbs = { x: parentAbs.x + 5000, y: parentAbs.y + 5000 };
|
||||
const workspaces = [
|
||||
makeWS({ id: "parent", x: parentAbs.x, y: parentAbs.y }),
|
||||
makeWS({ id: "child", parent_id: "parent", x: childAbs.x, y: childAbs.y }),
|
||||
];
|
||||
const grownDims = new Map([
|
||||
["parent", { width: 800, height: 600 }],
|
||||
]);
|
||||
|
||||
const { nodes } = buildNodesAndEdges(workspaces, new Map(), grownDims);
|
||||
const child = nodes.find((n) => n.id === "child")!;
|
||||
// Rescued: NOT the original (5000, 5000); some grid slot instead.
|
||||
expect(child.position.x).toBeLessThan(5000);
|
||||
expect(child.position.y).toBeLessThan(5000);
|
||||
});
|
||||
|
||||
it("falls back to initial-min bbox when no live size is provided (preserves legacy behavior)", () => {
|
||||
// Empty currentParentSizes — first hydrate or test without store
|
||||
// priming. Child outside the initial bbox should still be rescued.
|
||||
const parentAbs = { x: 100, y: 100 };
|
||||
const childAbs = { x: parentAbs.x + 700, y: parentAbs.y + 400 };
|
||||
const workspaces = [
|
||||
makeWS({ id: "parent", x: parentAbs.x, y: parentAbs.y }),
|
||||
makeWS({ id: "child", parent_id: "parent", x: childAbs.x, y: childAbs.y }),
|
||||
];
|
||||
|
||||
const { nodes } = buildNodesAndEdges(workspaces);
|
||||
const child = nodes.find((n) => n.id === "child")!;
|
||||
// Without a live size hint, the initial bbox applies — rescue
|
||||
// fires, child gets a fresh slot, NOT the user-supplied (700,400).
|
||||
expect(child.position).not.toEqual({ x: 700, y: 400 });
|
||||
});
|
||||
});
|
||||
|
||||
describe("buildNodesAndEdges – deeply nested hierarchy", () => {
|
||||
it("handles three levels of nesting", () => {
|
||||
const workspaces = [
|
||||
|
||||
@ -484,6 +484,70 @@ describe("removeNode", () => {
|
||||
});
|
||||
});
|
||||
|
||||
// ---------- removeSubtree ----------
|
||||
|
||||
describe("removeSubtree", () => {
|
||||
beforeEach(() => {
|
||||
useCanvasStore.getState().hydrate([
|
||||
makeWS({ id: "root" }),
|
||||
makeWS({ id: "mid", parent_id: "root" }),
|
||||
makeWS({ id: "leaf", parent_id: "mid" }),
|
||||
makeWS({ id: "sibling", parent_id: "root" }),
|
||||
makeWS({ id: "unrelated" }), // separate root
|
||||
]);
|
||||
});
|
||||
|
||||
it("removes the root and every descendant in one shot", () => {
|
||||
useCanvasStore.getState().removeSubtree("root");
|
||||
const ids = useCanvasStore
|
||||
.getState()
|
||||
.nodes.map((n) => n.id)
|
||||
.sort();
|
||||
expect(ids).toEqual(["unrelated"]);
|
||||
});
|
||||
|
||||
it("removes a mid-level node and its descendants but leaves siblings + ancestors", () => {
|
||||
useCanvasStore.getState().removeSubtree("mid");
|
||||
const ids = useCanvasStore
|
||||
.getState()
|
||||
.nodes.map((n) => n.id)
|
||||
.sort();
|
||||
expect(ids).toEqual(["root", "sibling", "unrelated"]);
|
||||
});
|
||||
|
||||
it("removing a leaf is a no-op cascade (just drops the leaf)", () => {
|
||||
useCanvasStore.getState().removeSubtree("leaf");
|
||||
const ids = useCanvasStore
|
||||
.getState()
|
||||
.nodes.map((n) => n.id)
|
||||
.sort();
|
||||
expect(ids).toEqual(["mid", "root", "sibling", "unrelated"]);
|
||||
});
|
||||
|
||||
it("clears selection when the selected node is anywhere in the removed subtree", () => {
|
||||
useCanvasStore.getState().selectNode("leaf");
|
||||
useCanvasStore.getState().removeSubtree("root");
|
||||
expect(useCanvasStore.getState().selectedNodeId).toBeNull();
|
||||
});
|
||||
|
||||
it("preserves selection when the selected node is outside the removed subtree", () => {
|
||||
useCanvasStore.getState().selectNode("unrelated");
|
||||
useCanvasStore.getState().removeSubtree("root");
|
||||
expect(useCanvasStore.getState().selectedNodeId).toBe("unrelated");
|
||||
});
|
||||
|
||||
it("drops edges incident to any removed node", () => {
|
||||
// The hydrate-built edges connect parent → child. After removing
|
||||
// `root`, no edge involving root/mid/leaf/sibling should remain.
|
||||
useCanvasStore.getState().removeSubtree("root");
|
||||
const remaining = useCanvasStore.getState().edges;
|
||||
for (const e of remaining) {
|
||||
expect(["root", "mid", "leaf", "sibling"]).not.toContain(e.source);
|
||||
expect(["root", "mid", "leaf", "sibling"]).not.toContain(e.target);
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
// ---------- isDescendant ----------
|
||||
|
||||
describe("isDescendant", () => {
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
import { describe, it, expect, beforeEach, afterEach, vi } from "vitest";
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Mock the canvas store before importing socket.ts
|
||||
// Mock the canvas store and api before importing socket.ts
|
||||
// ---------------------------------------------------------------------------
|
||||
vi.mock("../canvas", () => ({
|
||||
useCanvasStore: {
|
||||
@ -13,6 +13,7 @@ vi.mock("../canvas", () => ({
|
||||
},
|
||||
}));
|
||||
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Mock WebSocket
|
||||
// ---------------------------------------------------------------------------
|
||||
@ -76,7 +77,6 @@ function getLastWS(): MockWebSocket {
|
||||
beforeEach(() => {
|
||||
MockWebSocket.instances = [];
|
||||
vi.useFakeTimers();
|
||||
|
||||
// Reset mocked store state
|
||||
vi.mocked(useCanvasStore.getState).mockReturnValue({
|
||||
applyEvent: vi.fn(),
|
||||
@ -263,13 +263,59 @@ describe("WebSocket onclose – auto-reconnect", () => {
|
||||
const ws = getLastWS();
|
||||
ws.triggerClose();
|
||||
|
||||
// Fast-forward timers to trigger the reconnect
|
||||
vi.runAllTimers();
|
||||
// First reconnect attempt is scheduled at 1s (Math.min(1000 * 2^0,
|
||||
// 30000)). Advance just past that — vi.runAllTimers() would
|
||||
// additionally re-fire the fallback poll setInterval forever and
|
||||
// hit the 10000-timer abort.
|
||||
vi.advanceTimersByTime(1100);
|
||||
|
||||
expect(MockWebSocket.instances.length).toBeGreaterThan(1);
|
||||
});
|
||||
});
|
||||
|
||||
describe("HTTP fallback poll while WS unhealthy", () => {
|
||||
it("starts a setInterval after onclose so /workspaces stays fresh", () => {
|
||||
const setIntervalSpy = vi.spyOn(globalThis, "setInterval");
|
||||
connectSocket();
|
||||
const ws = getLastWS();
|
||||
ws.triggerClose();
|
||||
// The fallback poll runs at 10s; the reconnect uses setTimeout, so
|
||||
// any setInterval registered between connect and close must be the
|
||||
// fallback poll.
|
||||
const fallbackCalls = setIntervalSpy.mock.calls.filter(
|
||||
([, delay]) => delay === 10_000,
|
||||
);
|
||||
expect(fallbackCalls.length).toBeGreaterThan(0);
|
||||
setIntervalSpy.mockRestore();
|
||||
});
|
||||
|
||||
it("clears the fallback poll once the WS reconnects (onopen)", () => {
|
||||
const clearIntervalSpy = vi.spyOn(globalThis, "clearInterval");
|
||||
connectSocket();
|
||||
const ws = getLastWS();
|
||||
ws.triggerClose(); // starts fallback poll
|
||||
clearIntervalSpy.mockClear();
|
||||
// Advance past the first reconnect delay so a fresh ws exists,
|
||||
// then trigger its open.
|
||||
vi.advanceTimersByTime(1100);
|
||||
const ws2 = getLastWS();
|
||||
ws2.triggerOpen();
|
||||
expect(clearIntervalSpy).toHaveBeenCalled();
|
||||
clearIntervalSpy.mockRestore();
|
||||
});
|
||||
|
||||
it("clears the fallback poll on disconnect", () => {
|
||||
const clearIntervalSpy = vi.spyOn(globalThis, "clearInterval");
|
||||
connectSocket();
|
||||
const ws = getLastWS();
|
||||
ws.triggerClose(); // starts fallback poll
|
||||
clearIntervalSpy.mockClear();
|
||||
disconnectSocket();
|
||||
expect(clearIntervalSpy).toHaveBeenCalled();
|
||||
clearIntervalSpy.mockRestore();
|
||||
});
|
||||
});
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// onerror handler
|
||||
// ---------------------------------------------------------------------------
|
||||
@ -328,3 +374,45 @@ describe("health check", () => {
|
||||
clearIntervalSpy.mockRestore();
|
||||
});
|
||||
});
|
||||
|
||||
// Rehydrate dedup logic itself is exercised by `RehydrateDedup` unit
|
||||
// tests in this file (below). End-to-end coupling through the
|
||||
// dynamic-imported `@/lib/api` was non-trivial under our existing
|
||||
// fake-timer setup; isolating the gate in a pure helper keeps
|
||||
// regression coverage without that mocking complexity.
|
||||
|
||||
import { RehydrateDedup } from "../socket";
|
||||
|
||||
describe("RehydrateDedup", () => {
|
||||
it("first call passes the gate (no prior fetch)", () => {
|
||||
const d = new RehydrateDedup(1500);
|
||||
expect(d.shouldSkip(0)).toBe(false);
|
||||
});
|
||||
|
||||
it("blocks while a fetch is in flight", () => {
|
||||
const d = new RehydrateDedup(1500);
|
||||
d.beginFetch();
|
||||
expect(d.shouldSkip(100)).toBe(true);
|
||||
});
|
||||
|
||||
it("blocks within the post-completion window", () => {
|
||||
const d = new RehydrateDedup(1500);
|
||||
d.beginFetch();
|
||||
d.completeFetch(1_000);
|
||||
// 1100 - 1000 = 100 < 1500 → skip
|
||||
expect(d.shouldSkip(1_100)).toBe(true);
|
||||
// 2600 - 1000 = 1600 > 1500 → allow
|
||||
expect(d.shouldSkip(2_600)).toBe(false);
|
||||
});
|
||||
|
||||
it("a completed fetch followed by another beginFetch blocks for the new in-flight", () => {
|
||||
const d = new RehydrateDedup(1500);
|
||||
d.beginFetch();
|
||||
d.completeFetch(1_000);
|
||||
// First wait out the dedup window
|
||||
expect(d.shouldSkip(2_600)).toBe(false);
|
||||
d.beginFetch();
|
||||
// Now a second fetch is in flight; further calls block again
|
||||
expect(d.shouldSkip(2_700)).toBe(true);
|
||||
});
|
||||
});
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
import type { Node, Edge } from "@xyflow/react";
|
||||
import type { WSMessage } from "./socket";
|
||||
import type { WorkspaceNodeData } from "./canvas";
|
||||
import { extractResponseText } from "@/components/tabs/chat/message-parser";
|
||||
import { extractResponseText, extractFilesFromTask } from "@/components/tabs/chat/message-parser";
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Monotonically increasing counter used to assign grid positions.
|
||||
@ -21,13 +21,46 @@ import { extractResponseText } from "@/components/tabs/chat/message-parser";
|
||||
//
|
||||
// A monotonic counter is immune to deletions: it only ever increases.
|
||||
// ---------------------------------------------------------------------------
|
||||
import { appendClass, removeClass, scheduleNodeClassRemoval } from "./classNames";
|
||||
|
||||
let _provisioningSequence = 0;
|
||||
|
||||
/** Reset the sequence counter — exposed for test teardown only. */
|
||||
export function resetProvisioningSequence(): void {
|
||||
_provisioningSequence = 0;
|
||||
_pendingOnline.clear();
|
||||
}
|
||||
|
||||
/** WORKSPACE_ONLINE events that arrived BEFORE the matching
|
||||
* WORKSPACE_PROVISIONING — buffered here so the late-arriving
|
||||
* provision event can immediately flip to the correct status
|
||||
* instead of leaving the node stuck as "provisioning" forever.
|
||||
* Cleared when applied, or on module reset (tests). */
|
||||
const _pendingOnline = new Set<string>();
|
||||
|
||||
/** Debounced parent-grow. Each child arrival schedules this; the
|
||||
* timer keeps resetting as more siblings land, so the actual
|
||||
* width/height update runs ONCE after arrivals go quiet. Avoids
|
||||
* the visible size-pulse that happened when growParentsToFitChildren
|
||||
* ran per event. */
|
||||
let _growTimer: ReturnType<typeof setTimeout> | null = null;
|
||||
function scheduleParentGrow(): void {
|
||||
if (typeof window === "undefined") return;
|
||||
if (_growTimer) clearTimeout(_growTimer);
|
||||
_growTimer = setTimeout(() => {
|
||||
_growTimer = null;
|
||||
import("./canvas").then(({ useCanvasStore }) => {
|
||||
useCanvasStore.getState().growParentsToFitChildren?.();
|
||||
});
|
||||
}, 300);
|
||||
}
|
||||
|
||||
// (absoluteNodePosition was used by an earlier "spawn from parent"
|
||||
// revision that subtracted parent absolute coords from server-sent
|
||||
// absolute child coords. The server now ships parent-relative coords
|
||||
// directly, so the walk is no longer needed. Deleted rather than
|
||||
// kept as dead code.)
|
||||
|
||||
/**
|
||||
* Standalone event handler extracted from the canvas store.
|
||||
* Applies a single WebSocket event to the current node/edge state.
|
||||
@ -38,7 +71,7 @@ export function handleCanvasEvent(
|
||||
nodes: Node<WorkspaceNodeData>[];
|
||||
edges: Edge[];
|
||||
selectedNodeId: string | null;
|
||||
agentMessages: Record<string, Array<{ id: string; content: string; timestamp: string }>>;
|
||||
agentMessages: Record<string, Array<{ id: string; content: string; timestamp: string; attachments?: Array<{ name: string; uri: string; mimeType?: string; size?: number }> }>>;
|
||||
},
|
||||
set: (partial: Record<string, unknown>) => void,
|
||||
): void {
|
||||
@ -47,14 +80,44 @@ export function handleCanvasEvent(
|
||||
switch (msg.event) {
|
||||
case "WORKSPACE_ONLINE": {
|
||||
const existing = nodes.find((n) => n.id === msg.workspace_id);
|
||||
if (existing) {
|
||||
set({
|
||||
nodes: nodes.map((n) =>
|
||||
n.id === msg.workspace_id
|
||||
? { ...n, data: { ...n.data, status: "online" } }
|
||||
: n
|
||||
),
|
||||
});
|
||||
if (!existing) {
|
||||
// PROVISIONING event hasn't been applied yet (WS reorder or
|
||||
// this tab joined mid-deploy). Buffer so the later PROVISIONING
|
||||
// handler can flip status in one pass instead of leaving the
|
||||
// node stuck in "provisioning" forever.
|
||||
_pendingOnline.add(msg.workspace_id);
|
||||
break;
|
||||
}
|
||||
// Flip incoming edge from blueprint → laser so the link is
|
||||
// drawn solid the moment this child is live. The laser class
|
||||
// plays the stroke-dashoffset keyframe once; after ~500ms the
|
||||
// edge falls back to the default solid style (see
|
||||
// org-deploy.css and the follow-up setTimeout below).
|
||||
const updatedEdges = edges.map((e) =>
|
||||
e.target === msg.workspace_id && e.className?.includes("mol-deploy-edge-blueprint")
|
||||
? { ...e, className: "mol-deploy-edge-laser" }
|
||||
: e,
|
||||
);
|
||||
set({
|
||||
edges: updatedEdges,
|
||||
nodes: nodes.map((n) =>
|
||||
n.id === msg.workspace_id
|
||||
? { ...n, data: { ...n.data, status: "online" } }
|
||||
: n,
|
||||
),
|
||||
});
|
||||
// Remove the laser class after its keyframe ends so the edge
|
||||
// settles into the app's default solid styling. Fire-and-forget.
|
||||
if (typeof window !== "undefined") {
|
||||
const targetEdgeId = `${existing.data.parentId ?? ""}-${msg.workspace_id}`;
|
||||
window.setTimeout(() => {
|
||||
const s = get();
|
||||
set({
|
||||
edges: s.edges.map((e) =>
|
||||
e.id === targetEdgeId ? { ...e, className: undefined } : e,
|
||||
),
|
||||
});
|
||||
}, 600);
|
||||
}
|
||||
break;
|
||||
}
|
||||
@ -113,25 +176,73 @@ export function handleCanvasEvent(
|
||||
),
|
||||
});
|
||||
} else {
|
||||
// Spread new nodes in a grid so they don't stack at the viewport origin.
|
||||
// Use the monotonic _provisioningSequence counter (not nodes.length) so
|
||||
// deletions never cause two live nodes to share a grid slot.
|
||||
const GRID_COLS = 4;
|
||||
const COL_SPACING = 320;
|
||||
const ROW_SPACING = 160;
|
||||
const GRID_ORIGIN_X = 100;
|
||||
const GRID_ORIGIN_Y = 100;
|
||||
const idx = _provisioningSequence++;
|
||||
const x = GRID_ORIGIN_X + (idx % GRID_COLS) * COL_SPACING;
|
||||
const y = GRID_ORIGIN_Y + Math.floor(idx / GRID_COLS) * ROW_SPACING;
|
||||
// Payload may carry parent_id + final x/y (org import broadcasts
|
||||
// these so the canvas can animate the "spawn from parent" motion).
|
||||
// Standalone workspace creates still omit them — fall back to the
|
||||
// grid-slot behaviour that handled that case historically.
|
||||
const parentIdRaw = (msg.payload.parent_id as string | undefined) ?? null;
|
||||
const finalX = msg.payload.x as number | undefined;
|
||||
const finalY = msg.payload.y as number | undefined;
|
||||
|
||||
let spawnX: number;
|
||||
let spawnY: number;
|
||||
let targetX: number;
|
||||
let targetY: number;
|
||||
let parentId: string | null = null;
|
||||
|
||||
// Place the node at its final slot immediately — no
|
||||
// spring-from-parent motion. The earlier "materialize from
|
||||
// parent then tween to target" was expensive (two set()
|
||||
// calls + rAF) and produced wrong offsets because the
|
||||
// server sends absolute coords computed against the template's
|
||||
// own coord system while the client had placed the parent at
|
||||
// a grid slot, so the target math always landed off-grid.
|
||||
// Now: server coords are parent-relative (see org_import.go),
|
||||
// we trust them verbatim.
|
||||
const parentInStore = parentIdRaw
|
||||
? nodes.find((n) => n.id === parentIdRaw)
|
||||
: undefined;
|
||||
if (parentIdRaw && parentInStore && finalX !== undefined && finalY !== undefined) {
|
||||
targetX = finalX;
|
||||
targetY = finalY;
|
||||
parentId = parentIdRaw;
|
||||
} else {
|
||||
// Standalone create OR org-child whose parent hasn't arrived
|
||||
// yet (rare WS reorder) — monotonic-grid placement. The
|
||||
// follow-up hydrate pass reconciles parent_id + the correct
|
||||
// nested position if parent lands later.
|
||||
const GRID_COLS = 4;
|
||||
const COL_SPACING = 320;
|
||||
const ROW_SPACING = 160;
|
||||
const GRID_ORIGIN_X = 100;
|
||||
const GRID_ORIGIN_Y = 100;
|
||||
const idx = _provisioningSequence++;
|
||||
targetX = GRID_ORIGIN_X + (idx % GRID_COLS) * COL_SPACING;
|
||||
targetY = GRID_ORIGIN_Y + Math.floor(idx / GRID_COLS) * ROW_SPACING;
|
||||
}
|
||||
spawnX = targetX;
|
||||
spawnY = targetY;
|
||||
|
||||
// Parent→child relationship is already visible via React
|
||||
// Flow's nested rendering (the child card sits INSIDE the
|
||||
// parent container). An explicit edge on top of that was
|
||||
// visual double-counting and made the canvas look busy;
|
||||
// removed per demo feedback. A2A edges (showA2AEdges) still
|
||||
// render when enabled — those represent runtime traffic,
|
||||
// which nesting doesn't express.
|
||||
set({
|
||||
nodes: [
|
||||
...nodes,
|
||||
{
|
||||
id: msg.workspace_id,
|
||||
type: "workspaceNode",
|
||||
position: { x, y },
|
||||
position: { x: spawnX, y: spawnY },
|
||||
// React Flow's parentId (distinct from data.parentId)
|
||||
// triggers parent-relative positioning. Set it when the
|
||||
// server told us this is an org-import child so the
|
||||
// node renders nested inside the parent container.
|
||||
...(parentId ? { parentId } : {}),
|
||||
className: "mol-deploy-spawn",
|
||||
data: {
|
||||
name: (msg.payload.name as string) ?? "New Workspace",
|
||||
status: "provisioning",
|
||||
@ -143,7 +254,7 @@ export function handleCanvasEvent(
|
||||
lastErrorRate: 0,
|
||||
lastSampleError: "",
|
||||
url: "",
|
||||
parentId: null,
|
||||
parentId, // data.parentId mirrors React Flow's parentId
|
||||
currentTask: "",
|
||||
runtime: (msg.payload.runtime as string) ?? "",
|
||||
needsRestart: false,
|
||||
@ -152,8 +263,76 @@ export function handleCanvasEvent(
|
||||
],
|
||||
});
|
||||
|
||||
// Pan the canvas to the new node
|
||||
// Grow the parent to fit the just-landed child. DEBOUNCED
|
||||
// across rapid sibling arrivals — firing width/height updates
|
||||
// on every child made the parent card visibly pulse in size
|
||||
// as each kid landed, which read as the parent "flashing
|
||||
// around". One grow pass ~300ms after the last arrival
|
||||
// coalesces the whole burst into a single layout change.
|
||||
if (parentId && typeof window !== "undefined") {
|
||||
scheduleParentGrow();
|
||||
}
|
||||
// Parent-border pulse removed per demo feedback — the soft
|
||||
// box-shadow ring on each arrival compounded with the size
|
||||
// grow to make the whole parent card look unstable. The
|
||||
// dim-light signal on the provisioning child is sufficient
|
||||
// acknowledgement that something is happening.
|
||||
|
||||
// Remove the one-shot spawn class after the keyframe ends so
|
||||
// future re-renders don't replay it.
|
||||
scheduleNodeClassRemoval(msg.workspace_id, "mol-deploy-spawn", 400, get, set);
|
||||
|
||||
// Auto-pan+zoom to the whole deploying org after each
|
||||
// arrival so the user always sees the full picture — unless
|
||||
// they've panned themselves (handled by the viewport hook,
|
||||
// which aborts the fit when the user moved after the last
|
||||
// auto-fit). Event name matches the existing handler in
|
||||
// useCanvasViewport that knows how to compute subtree bounds.
|
||||
//
|
||||
// Fire for roots too (not just children) so the canvas
|
||||
// centers on the just-landed root immediately instead of
|
||||
// waiting for the first child to arrive ~2s later. The
|
||||
// viewport hook walks UP to find the true root, so passing
|
||||
// the node's own id when there's no parent is equivalent
|
||||
// to passing the root.
|
||||
if (typeof window !== "undefined") {
|
||||
window.dispatchEvent(
|
||||
new CustomEvent("molecule:fit-deploying-org", {
|
||||
detail: { rootId: parentIdRaw ?? msg.workspace_id },
|
||||
}),
|
||||
);
|
||||
}
|
||||
|
||||
// Race handling: if a WORKSPACE_ONLINE event beat the
|
||||
// matching PROVISIONING to this tab, the online flag was
|
||||
// buffered in _pendingOnline. Apply it now so the node
|
||||
// doesn't stay stuck as "provisioning" forever.
|
||||
//
|
||||
// Only flip to "online" if the current status is still
|
||||
// "provisioning" at drain time. Otherwise a WORKSPACE_DEGRADED
|
||||
// / FAILED / PAUSED that arrived between the set() above and
|
||||
// the scheduled drain would be silently clobbered — the
|
||||
// buffered ONLINE is stale by then.
|
||||
if (_pendingOnline.has(msg.workspace_id)) {
|
||||
_pendingOnline.delete(msg.workspace_id);
|
||||
if (typeof window !== "undefined") {
|
||||
window.setTimeout(() => {
|
||||
const s = get();
|
||||
set({
|
||||
nodes: s.nodes.map((n) =>
|
||||
n.id === msg.workspace_id && n.data.status === "provisioning"
|
||||
? { ...n, data: { ...n.data, status: "online" } }
|
||||
: n,
|
||||
),
|
||||
});
|
||||
}, 0);
|
||||
}
|
||||
}
|
||||
|
||||
// Pan the canvas to the new node (standalone create only —
|
||||
// during an org import, zooming to every child chases the
|
||||
// spawn animation around the viewport which is jarring).
|
||||
if (!parentIdRaw && typeof window !== "undefined") {
|
||||
window.dispatchEvent(
|
||||
new CustomEvent("molecule:pan-to-node", {
|
||||
detail: { nodeId: msg.workspace_id },
|
||||
@ -252,12 +431,19 @@ export function handleCanvasEvent(
|
||||
}
|
||||
|
||||
case "A2A_RESPONSE": {
|
||||
// A2A proxy completed — extract response text and store as agent message.
|
||||
// This gives the ChatTab instant response delivery via WebSocket instead of polling.
|
||||
// A2A proxy completed — extract response text AND any `kind: file`
|
||||
// parts. Without the file extraction, agent-returned attachments
|
||||
// delivered via this WebSocket path would disappear (the canvas
|
||||
// would render a text-only message while the HTTP fallback
|
||||
// rendered the same reply with download chips, depending on
|
||||
// which delivery path raced to completion first).
|
||||
const responseBody = msg.payload.response_body as Record<string, unknown> | undefined;
|
||||
if (responseBody) {
|
||||
const text = extractResponseText(responseBody);
|
||||
if (text) {
|
||||
const attachments = extractFilesFromTask(
|
||||
(responseBody.result ?? responseBody) as Record<string, unknown>,
|
||||
);
|
||||
if (text || attachments.length > 0) {
|
||||
const { agentMessages } = get();
|
||||
const existing = agentMessages[msg.workspace_id] || [];
|
||||
set({
|
||||
@ -265,7 +451,12 @@ export function handleCanvasEvent(
|
||||
...agentMessages,
|
||||
[msg.workspace_id]: [
|
||||
...existing,
|
||||
{ id: crypto.randomUUID(), content: text, timestamp: new Date().toISOString() },
|
||||
{
|
||||
id: crypto.randomUUID(),
|
||||
content: text,
|
||||
timestamp: new Date().toISOString(),
|
||||
attachments: attachments.length > 0 ? attachments : undefined,
|
||||
},
|
||||
],
|
||||
},
|
||||
});
|
||||
|
||||
@ -280,6 +280,15 @@ export function computeAutoLayout(
|
||||
* Accepts an optional layoutOverrides map (from computeAutoLayout) to override
|
||||
* positions for workspaces that were at 0,0.
|
||||
*
|
||||
* `currentParentSizes` carries the LIVE measured/grown dimensions of parent
|
||||
* nodes from the existing client store. The auto-rescue heuristic below
|
||||
* (line ~445) compares each child's stored relative position against its
|
||||
* parent's bbox; without the live size, the bbox is whatever the
|
||||
* grid-derived initial min-size formula produced. That falsely rescued
|
||||
* children dragged into the user-grown area on every periodic rehydrate
|
||||
* (socket.ts:87 fires every 30s if no WS events seen) — observed
|
||||
* 2026-04-25 as "child jumps to weird location, then settles 30s later".
|
||||
*
|
||||
* Parent/child rendering model: every workspace is a first-class React Flow
|
||||
* node (full card). When a workspace has parent_id set, its RF `parentId` is
|
||||
* set to the parent's id and its position is stored RELATIVE to the parent
|
||||
@ -290,7 +299,8 @@ export function computeAutoLayout(
|
||||
*/
|
||||
export function buildNodesAndEdges(
|
||||
workspaces: WorkspaceData[],
|
||||
layoutOverrides: Map<string, { x: number; y: number }> = new Map()
|
||||
layoutOverrides: Map<string, { x: number; y: number }> = new Map(),
|
||||
currentParentSizes: Map<string, { width: number; height: number }> = new Map(),
|
||||
): {
|
||||
nodes: Node<WorkspaceNodeData>[];
|
||||
edges: Edge[];
|
||||
@ -439,7 +449,23 @@ export function buildNodesAndEdges(
|
||||
// child.left = 500 < parent.right = 800 → overlaps → kept
|
||||
// legacy huge positive (position.x = 50000):
|
||||
// child.left = 50000 >= parent.right → no overlap → rescued
|
||||
const psize = parentSize.get(ws.parent_id!)!;
|
||||
const initialPsize = parentSize.get(ws.parent_id!)!;
|
||||
// Use the larger of (initial min, currently grown) for the bbox
|
||||
// test. Without this, a child the user dragged into the grown
|
||||
// area appears "outside" the (smaller) initial bbox and the
|
||||
// rescue below false-fires on every periodic rehydrate, jumping
|
||||
// the child to a stale grid slot. Live grown dims arrive via
|
||||
// currentParentSizes from hydrate(); on first load (empty
|
||||
// store), the map is empty and we fall back to the initial min
|
||||
// — preserving the original rescue semantics for genuinely
|
||||
// detached legacy data.
|
||||
const liveParentSize = currentParentSizes.get(ws.parent_id!);
|
||||
const psize = liveParentSize
|
||||
? {
|
||||
width: Math.max(initialPsize.width, liveParentSize.width),
|
||||
height: Math.max(initialPsize.height, liveParentSize.height),
|
||||
}
|
||||
: initialPsize;
|
||||
const myW = subtreeSize.get(ws.id)?.width ?? CHILD_DEFAULT_WIDTH;
|
||||
const myH = subtreeSize.get(ws.id)?.height ?? CHILD_DEFAULT_HEIGHT;
|
||||
const overlapsX =
|
||||
|
||||
@ -138,6 +138,16 @@ interface CanvasState {
|
||||
updateNodeData: (id: string, data: Partial<WorkspaceNodeData>) => void;
|
||||
restartWorkspace: (id: string) => Promise<void>;
|
||||
removeNode: (id: string) => void;
|
||||
/** Remove a node AND every descendant in one atomic update. Mirrors
|
||||
* the server-side cascade — `DELETE /workspaces/:id?confirm=true`
|
||||
* drops the row plus every descendant in one transaction. The
|
||||
* caller (Canvas / DetailsTab delete handlers) used to call
|
||||
* `removeNode(rootId)` and rely on per-descendant WORKSPACE_REMOVED
|
||||
* WS events to clear the rest. When the WS is unhealthy those
|
||||
* events never arrive and the children orphan to the root until a
|
||||
* manual page refresh — `removeSubtree` makes the cascade
|
||||
* WS-independent. */
|
||||
removeSubtree: (rootId: string) => void;
|
||||
setDragOverNode: (id: string | null) => void;
|
||||
nestNode: (draggedId: string, targetId: string | null) => Promise<void>;
|
||||
isDescendant: (ancestorId: string, nodeId: string) => boolean;
|
||||
@ -177,6 +187,15 @@ interface CanvasState {
|
||||
setPendingDelete: (
|
||||
v: { id: string; name: string; hasChildren: boolean; children: { id: string; name: string }[] } | null
|
||||
) => void;
|
||||
/** Node IDs whose DELETE request is in flight. Populated the moment
|
||||
* the user confirms a cascade delete; drained as WORKSPACE_REMOVED
|
||||
* events strip the nodes (or all-at-once on request failure). Lets
|
||||
* the canvas render the "don't touch — something is happening"
|
||||
* treatment (dim + non-draggable) during the network round trip
|
||||
* and the server-side cascade, matching the deploy-lock UX. */
|
||||
deletingIds: Set<string>;
|
||||
beginDelete: (ids: Iterable<string>) => void;
|
||||
endDelete: (ids: Iterable<string>) => void;
|
||||
searchOpen: boolean;
|
||||
setSearchOpen: (open: boolean) => void;
|
||||
viewport: { x: number; y: number; zoom: number };
|
||||
@ -190,8 +209,8 @@ interface CanvasState {
|
||||
batchPause: () => Promise<void>;
|
||||
batchDelete: () => Promise<void>;
|
||||
/** Agent-pushed messages keyed by workspace ID. ChatTab consumes and clears these. */
|
||||
agentMessages: Record<string, Array<{ id: string; content: string; timestamp: string }>>;
|
||||
consumeAgentMessages: (workspaceId: string) => Array<{ id: string; content: string; timestamp: string }>;
|
||||
agentMessages: Record<string, Array<{ id: string; content: string; timestamp: string; attachments?: Array<{ name: string; uri: string; mimeType?: string; size?: number }> }>>;
|
||||
consumeAgentMessages: (workspaceId: string) => Array<{ id: string; content: string; timestamp: string; attachments?: Array<{ name: string; uri: string; mimeType?: string; size?: number }> }>;
|
||||
/** WebSocket connection status — drives the live indicator in the Toolbar. */
|
||||
wsStatus: "connected" | "connecting" | "disconnected";
|
||||
setWsStatus: (status: "connected" | "connecting" | "disconnected") => void;
|
||||
@ -309,6 +328,17 @@ export const useCanvasStore = create<CanvasState>((set, get) => ({
|
||||
closeContextMenu: () => set({ contextMenu: null }),
|
||||
pendingDelete: null,
|
||||
setPendingDelete: (v) => set({ pendingDelete: v }),
|
||||
deletingIds: new Set<string>(),
|
||||
beginDelete: (ids) => {
|
||||
const next = new Set(get().deletingIds);
|
||||
for (const id of ids) next.add(id);
|
||||
set({ deletingIds: next });
|
||||
},
|
||||
endDelete: (ids) => {
|
||||
const next = new Set(get().deletingIds);
|
||||
for (const id of ids) next.delete(id);
|
||||
set({ deletingIds: next });
|
||||
},
|
||||
searchOpen: false,
|
||||
setSearchOpen: (open) => set({ searchOpen: open }),
|
||||
agentMessages: {},
|
||||
@ -775,9 +805,69 @@ export const useCanvasStore = create<CanvasState>((set, get) => ({
|
||||
});
|
||||
},
|
||||
|
||||
removeSubtree: (rootId) => {
|
||||
const { nodes, edges, selectedNodeId } = get();
|
||||
// Build a parentId → childIds index once so the descent is O(N),
|
||||
// not O(N · depth). The store typically holds <500 nodes; even
|
||||
// doing a linear scan per parent would be fine, but the index
|
||||
// keeps the cost predictable as orgs grow.
|
||||
const childrenByParent = new Map<string, string[]>();
|
||||
for (const n of nodes) {
|
||||
const p = n.data.parentId ?? null;
|
||||
if (p === null) continue;
|
||||
const arr = childrenByParent.get(p);
|
||||
if (arr) arr.push(n.id);
|
||||
else childrenByParent.set(p, [n.id]);
|
||||
}
|
||||
const removed = new Set<string>([rootId]);
|
||||
const stack = [rootId];
|
||||
while (stack.length) {
|
||||
const cur = stack.pop()!;
|
||||
const kids = childrenByParent.get(cur);
|
||||
if (!kids) continue;
|
||||
for (const k of kids) {
|
||||
if (!removed.has(k)) {
|
||||
removed.add(k);
|
||||
stack.push(k);
|
||||
}
|
||||
}
|
||||
}
|
||||
set({
|
||||
nodes: nodes.filter((n) => !removed.has(n.id)),
|
||||
edges: edges.filter((e) => !removed.has(e.source) && !removed.has(e.target)),
|
||||
selectedNodeId:
|
||||
selectedNodeId !== null && removed.has(selectedNodeId)
|
||||
? null
|
||||
: selectedNodeId,
|
||||
});
|
||||
},
|
||||
|
||||
hydrate: (workspaces: WorkspaceData[]) => {
|
||||
const layoutOverrides = computeAutoLayout(workspaces);
|
||||
const { nodes, edges } = buildNodesAndEdges(workspaces, layoutOverrides);
|
||||
// Carry the live measured/grown parent sizes from the existing
|
||||
// store into the rebuild. buildNodesAndEdges runs an auto-rescue
|
||||
// pass on each child to detach orphans whose stored relative
|
||||
// position falls outside the parent bbox — without the live
|
||||
// size, the bbox is the initial grid-derived minimum, which
|
||||
// false-flags any child the user has dragged into the
|
||||
// user-grown area. Periodic rehydrate (socket.ts health check,
|
||||
// 30s) was reasserting the rescue against legitimate user
|
||||
// placements, causing the "child jumps to weird location, then
|
||||
// settles" symptom.
|
||||
const current = get().nodes;
|
||||
const currentParentSizes = new Map<string, { width: number; height: number }>();
|
||||
for (const n of current) {
|
||||
const w = (n.measured?.width ?? n.width) as number | undefined;
|
||||
const h = (n.measured?.height ?? n.height) as number | undefined;
|
||||
if (typeof w === "number" && typeof h === "number") {
|
||||
currentParentSizes.set(n.id, { width: w, height: h });
|
||||
}
|
||||
}
|
||||
const { nodes, edges } = buildNodesAndEdges(
|
||||
workspaces,
|
||||
layoutOverrides,
|
||||
currentParentSizes,
|
||||
);
|
||||
set({ nodes, edges });
|
||||
for (const [nodeId, { x, y }] of layoutOverrides) {
|
||||
api.patch(`/workspaces/${nodeId}`, { x, y }).catch(() => {});
|
||||
|
||||
53
canvas/src/store/classNames.ts
Normal file
53
canvas/src/store/classNames.ts
Normal file
@ -0,0 +1,53 @@
|
||||
/**
|
||||
* React Flow className helpers shared across the store and canvas
|
||||
* hooks. React Flow's Node.className / Edge.className is a single
|
||||
* space-separated string, so every call site was previously doing
|
||||
* the same `.split/.filter/.join` dance — centralise it here so
|
||||
* any future class manipulation follows one policy.
|
||||
*/
|
||||
|
||||
/** Add `cls` to the existing className, de-duplicating. Returns
|
||||
* the (possibly new) string; undefined/empty input → just `cls`. */
|
||||
export function appendClass(existing: string | undefined, cls: string): string {
|
||||
if (!existing) return cls;
|
||||
const parts = existing.split(/\s+/).filter(Boolean);
|
||||
if (parts.includes(cls)) return existing;
|
||||
parts.push(cls);
|
||||
return parts.join(" ");
|
||||
}
|
||||
|
||||
/** Remove `cls` if present. Returns the (possibly empty) string. */
|
||||
export function removeClass(existing: string | undefined, cls: string): string {
|
||||
if (!existing) return "";
|
||||
return existing
|
||||
.split(/\s+/)
|
||||
.filter((c) => c && c !== cls)
|
||||
.join(" ");
|
||||
}
|
||||
|
||||
/** Schedule `removeClass(nodeId, cls)` on the `nodes` slice after
|
||||
* `delayMs`. The callers used to inline this twice — once for
|
||||
* parent-pulse cleanup, once for spawn-class cleanup — and now
|
||||
* share the same impl so future one-shot animation classes land
|
||||
* consistently.
|
||||
*
|
||||
* No-ops when `window` is undefined (SSR). Accepts the store's
|
||||
* get/set pair directly rather than a store reference so it
|
||||
* composes with the existing handleCanvasEvent signature. */
|
||||
export function scheduleNodeClassRemoval(
|
||||
nodeId: string,
|
||||
cls: string,
|
||||
delayMs: number,
|
||||
get: () => { nodes: Array<{ id: string; className?: string }> },
|
||||
set: (partial: Record<string, unknown>) => void,
|
||||
): void {
|
||||
if (typeof window === "undefined") return;
|
||||
window.setTimeout(() => {
|
||||
const state = get();
|
||||
set({
|
||||
nodes: state.nodes.map((n) =>
|
||||
n.id === nodeId ? { ...n, className: removeClass(n.className, cls) } : n,
|
||||
),
|
||||
});
|
||||
}, delayMs);
|
||||
}
|
||||
@ -12,30 +12,129 @@ export interface WSMessage {
|
||||
payload: Record<string, unknown>;
|
||||
}
|
||||
|
||||
/** Window during which a freshly-completed rehydrate is reused
|
||||
* instead of firing a new GET. Picked to absorb the connect→health-
|
||||
* check sequence (rehydrate runs once on onopen, then the first
|
||||
* health-check tick fires immediately after — both should share the
|
||||
* same fetch) without holding back legitimately-spaced rehydrates
|
||||
* triggered by genuine WS silence later. */
|
||||
const REHYDRATE_DEDUP_WINDOW_MS = 1_500;
|
||||
|
||||
/** Pure dedup gate for rehydrate(). Tracks two states:
|
||||
*
|
||||
* - in-flight (between beginFetch and completeFetch): every
|
||||
* shouldSkip returns true.
|
||||
* - post-completion window (now < completedAt + windowMs):
|
||||
* shouldSkip returns true.
|
||||
*
|
||||
* Extracted from ReconnectingSocket so the gate is unit-testable
|
||||
* without mocking dynamic imports or fake timers. The class itself
|
||||
* is stateful but tiny — instances are not shared across sockets. */
|
||||
export class RehydrateDedup {
|
||||
private inFlight = false;
|
||||
// -Infinity so the very first shouldSkip(now) call always passes
|
||||
// (now - (-Infinity) > windowMs). Initializing to 0 would false-
|
||||
// trip on test runs where now is also 0 (vi.useFakeTimers default
|
||||
// clock) AND on real runs in the first 1.5s after epoch on
|
||||
// clock-skewed systems.
|
||||
private completedAt = Number.NEGATIVE_INFINITY;
|
||||
constructor(private readonly windowMs: number) {}
|
||||
|
||||
shouldSkip(now: number): boolean {
|
||||
if (this.inFlight) return true;
|
||||
if (now - this.completedAt < this.windowMs) return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
beginFetch(): void {
|
||||
this.inFlight = true;
|
||||
}
|
||||
|
||||
completeFetch(now: number = Date.now()): void {
|
||||
this.inFlight = false;
|
||||
this.completedAt = now;
|
||||
}
|
||||
}
|
||||
|
||||
/** Cadence for the HTTP fallback rehydrate that runs while the WS is
|
||||
* in connecting/disconnected limbo. 10s is short enough that the user
|
||||
* sees STARTING → ONLINE within one tick after the platform finishes
|
||||
* provisioning, but long enough to not pound /workspaces if the
|
||||
* network truly is down. The dedup gate inside rehydrate() collapses
|
||||
* this against the post-onopen rehydrate, so reconnect doesn't pay
|
||||
* for a duplicate fetch. */
|
||||
const FALLBACK_POLL_MS = 10_000;
|
||||
|
||||
class ReconnectingSocket {
|
||||
private ws: WebSocket | null = null;
|
||||
private attempt = 0;
|
||||
private url: string;
|
||||
private lastEventTime = 0;
|
||||
private healthCheckTimer: ReturnType<typeof setInterval> | null = null;
|
||||
private reconnectTimer: ReturnType<typeof setTimeout> | null = null;
|
||||
// Polls /workspaces while the WS is unhealthy so the canvas reflects
|
||||
// truth even when realtime events aren't arriving. Without this the
|
||||
// store can stay frozen for minutes — e.g. workspaces transition
|
||||
// STARTING → ONLINE on the platform but the canvas keeps showing
|
||||
// STARTING until the WS finally reconnects, triggering false
|
||||
// "Provisioning Timeout" banners on already-online workspaces.
|
||||
private fallbackPollTimer: ReturnType<typeof setInterval> | null = null;
|
||||
// disposed signals that disconnect() has been called. Any in-flight
|
||||
// reconnect / handshake must abort early rather than attach to a
|
||||
// socket the caller no longer owns — otherwise React StrictMode's
|
||||
// effect double-invoke (and any future intentional disconnect)
|
||||
// leaves a zombie WebSocket alive forever.
|
||||
private disposed = false;
|
||||
// In-flight singleton + dedup window for rehydrate. Two reasons to
|
||||
// collapse rapid calls:
|
||||
// 1. connect.onopen fires rehydrate immediately, and the very next
|
||||
// health-check tick may fire it again before the first GET
|
||||
// returns — wasted round trip + rebuild churn that resets the
|
||||
// mid-flight UI state (auto-rescue heuristics, grow passes).
|
||||
// 2. Future call sites (a manual "Refresh" button, post-import
|
||||
// hydrate, error-recovery rehydrate) might pile up.
|
||||
// Keeping rehydrate idempotent at the call-site level means each
|
||||
// caller can fire-and-forget without coordinating.
|
||||
private rehydrateInFlight: Promise<void> | null = null;
|
||||
private rehydrateDedup = new RehydrateDedup(REHYDRATE_DEDUP_WINDOW_MS);
|
||||
|
||||
constructor(url: string) {
|
||||
this.url = url;
|
||||
}
|
||||
|
||||
connect() {
|
||||
if (this.disposed) return;
|
||||
useCanvasStore.getState().setWsStatus("connecting");
|
||||
this.ws = new WebSocket(this.url);
|
||||
// Start the HTTP fallback poll up-front, not just on onclose. Two
|
||||
// scenarios this guards against:
|
||||
// 1. The very first connect attempt — onclose hasn't fired yet
|
||||
// because we never had a successful onopen.
|
||||
// 2. A failed handshake where the browser takes tens of seconds
|
||||
// to surface as onclose (Chrome can hold a SYN-SENT WebSocket
|
||||
// open for ~75s before giving up).
|
||||
// Idempotent — startFallbackPoll early-returns if a timer is
|
||||
// already running, so calling it from both places is cheap.
|
||||
this.startFallbackPoll();
|
||||
const ws = new WebSocket(this.url);
|
||||
this.ws = ws;
|
||||
|
||||
this.ws.onopen = () => {
|
||||
ws.onopen = () => {
|
||||
if (this.disposed || this.ws !== ws) {
|
||||
// Late-open on an abandoned socket. Close it cleanly; the
|
||||
// caller already moved on.
|
||||
try { ws.close(); } catch { /* noop */ }
|
||||
return;
|
||||
}
|
||||
this.attempt = 0;
|
||||
this.lastEventTime = Date.now();
|
||||
useCanvasStore.getState().setWsStatus("connected");
|
||||
this.stopFallbackPoll();
|
||||
this.rehydrate();
|
||||
this.startHealthCheck();
|
||||
};
|
||||
|
||||
this.ws.onmessage = (event) => {
|
||||
ws.onmessage = (event) => {
|
||||
if (this.disposed || this.ws !== ws) return;
|
||||
this.lastEventTime = Date.now();
|
||||
try {
|
||||
const msg: WSMessage = JSON.parse(event.data);
|
||||
@ -45,15 +144,21 @@ class ReconnectingSocket {
|
||||
}
|
||||
};
|
||||
|
||||
this.ws.onclose = () => {
|
||||
ws.onclose = () => {
|
||||
// Fired on intentional close (disposed) OR server/network drop.
|
||||
// Only schedule a reconnect when the socket is still live AND
|
||||
// corresponds to the WS we just tore down (prevents a stale
|
||||
// onclose from a zombie socket from re-arming the loop).
|
||||
if (this.disposed || this.ws !== ws) return;
|
||||
this.stopHealthCheck();
|
||||
useCanvasStore.getState().setWsStatus("connecting");
|
||||
this.startFallbackPoll();
|
||||
const delay = Math.min(1000 * 2 ** this.attempt, 30000);
|
||||
this.attempt++;
|
||||
setTimeout(() => this.connect(), delay);
|
||||
this.reconnectTimer = setTimeout(() => this.connect(), delay);
|
||||
};
|
||||
|
||||
this.ws.onerror = () => {
|
||||
ws.onerror = () => {
|
||||
// Suppressed — onclose handles reconnection. onerror fires before onclose
|
||||
// and the Event object doesn't contain useful info (serializes to {}).
|
||||
};
|
||||
@ -80,20 +185,78 @@ class ReconnectingSocket {
|
||||
}
|
||||
}
|
||||
|
||||
private async rehydrate() {
|
||||
try {
|
||||
const { api } = await import("@/lib/api");
|
||||
const workspaces = await api.get<WorkspaceData[]>("/workspaces");
|
||||
useCanvasStore.getState().hydrate(workspaces);
|
||||
} catch {
|
||||
// Rehydration failed — will retry on next health check cycle
|
||||
/** While the WS is in connecting/disconnected limbo, poll /workspaces
|
||||
* so the store stays fresh. The reconnect attempts continue in
|
||||
* parallel; whichever recovers first wins. rehydrate()'s own dedup
|
||||
* gate prevents this from racing with the open-time rehydrate. */
|
||||
private startFallbackPoll() {
|
||||
if (this.fallbackPollTimer) return;
|
||||
this.fallbackPollTimer = setInterval(() => {
|
||||
if (this.disposed) {
|
||||
this.stopFallbackPoll();
|
||||
return;
|
||||
}
|
||||
void this.rehydrate();
|
||||
}, FALLBACK_POLL_MS);
|
||||
}
|
||||
|
||||
private stopFallbackPoll() {
|
||||
if (this.fallbackPollTimer) {
|
||||
clearInterval(this.fallbackPollTimer);
|
||||
this.fallbackPollTimer = null;
|
||||
}
|
||||
}
|
||||
|
||||
private rehydrate(): Promise<void> {
|
||||
// Reuse an in-flight fetch — a second caller during the GET
|
||||
// shouldn't kick off a parallel one.
|
||||
if (this.rehydrateInFlight) return this.rehydrateInFlight;
|
||||
if (this.rehydrateDedup.shouldSkip(Date.now())) {
|
||||
return Promise.resolve();
|
||||
}
|
||||
|
||||
// beginFetch lives INSIDE the IIFE's try so any future code added
|
||||
// between gate-check and IIFE-construction can't throw and leave
|
||||
// the gate stuck at inFlight=true forever. Today there's nothing
|
||||
// that can throw here, but the cost of being defensive is one
|
||||
// extra microtask of "in flight" status — negligible.
|
||||
const promise = (async () => {
|
||||
this.rehydrateDedup.beginFetch();
|
||||
try {
|
||||
const { api } = await import("@/lib/api");
|
||||
const workspaces = await api.get<WorkspaceData[]>("/workspaces");
|
||||
if (this.disposed) return;
|
||||
useCanvasStore.getState().hydrate(workspaces);
|
||||
} catch {
|
||||
// Rehydration failed — will retry on next health check cycle.
|
||||
} finally {
|
||||
this.rehydrateDedup.completeFetch(Date.now());
|
||||
this.rehydrateInFlight = null;
|
||||
}
|
||||
})();
|
||||
this.rehydrateInFlight = promise;
|
||||
return promise;
|
||||
}
|
||||
|
||||
disconnect() {
|
||||
this.disposed = true;
|
||||
this.stopHealthCheck();
|
||||
this.stopFallbackPoll();
|
||||
if (this.reconnectTimer) {
|
||||
clearTimeout(this.reconnectTimer);
|
||||
this.reconnectTimer = null;
|
||||
}
|
||||
if (this.ws) {
|
||||
this.ws.close();
|
||||
// Detach listeners before close() so we don't route the close
|
||||
// event through our onclose → scheduleReconnect path. Belt +
|
||||
// braces on top of the `disposed` check, because StrictMode
|
||||
// cycles through so fast that an attached onclose can fire
|
||||
// after disposed=true is set but before this assignment runs.
|
||||
this.ws.onopen = null;
|
||||
this.ws.onmessage = null;
|
||||
this.ws.onclose = null;
|
||||
this.ws.onerror = null;
|
||||
try { this.ws.close(); } catch { /* noop */ }
|
||||
this.ws = null;
|
||||
}
|
||||
useCanvasStore.getState().setWsStatus("disconnected");
|
||||
|
||||
151
canvas/src/styles/org-deploy.css
Normal file
151
canvas/src/styles/org-deploy.css
Normal file
@ -0,0 +1,151 @@
|
||||
/**
|
||||
* Org-deploy animation module.
|
||||
*
|
||||
* Loaded globally (see app/globals.css). All values come from
|
||||
* theme-tokens.css so a theme swap needs zero edits here.
|
||||
*
|
||||
* Component contract — canvas/src/components/canvas code adds
|
||||
* these classes to the React Flow node / edge wrappers:
|
||||
*
|
||||
* .mol-deploy-spawn One-shot entry animation on a
|
||||
* node that just arrived. Applied
|
||||
* by canvas-events.ts for 600 ms
|
||||
* then removed.
|
||||
* .mol-deploy-shimmer Persistent border shimmer while
|
||||
* a node's status === "provisioning".
|
||||
* Removed when status flips to
|
||||
* "online" / "failed".
|
||||
* .mol-deploy-parent-pulse One-shot acknowledgement pulse
|
||||
* on the parent when a child lands.
|
||||
* Applied for parent-pulse duration
|
||||
* then removed.
|
||||
* .mol-deploy-locked Applied to every non-root node
|
||||
* inside a deploying org so it dims
|
||||
* and the cursor signals un-
|
||||
* draggable.
|
||||
* .mol-deploy-root-complete One-shot pop + glow on the root
|
||||
* when the last child comes online.
|
||||
*
|
||||
* Edges use React Flow edge data to pick styling — see the
|
||||
* selectors below the node keyframes.
|
||||
*
|
||||
* Reduced motion is handled at the bottom via the same guard
|
||||
* globals.css already installs for other animations.
|
||||
*/
|
||||
|
||||
/* ────────────────────────────────────────────────────────
|
||||
Keyframes — kept terse; values come from variables so
|
||||
duplication across themes is nil.
|
||||
──────────────────────────────────────────────────────── */
|
||||
|
||||
@keyframes mol-deploy-spawn {
|
||||
/* Gentle fade-in-place. The earlier "spring from parent" motion
|
||||
collided with the server-computed grid positions (parent and
|
||||
child used different coord origins once the parent was placed
|
||||
on the client's grid instead of the template's absolute
|
||||
coords), which landed children in wrong slots. Keeping the
|
||||
animation to a simple opacity+scale lets the server's layout
|
||||
win — and reads as "node arrived" without the over-engineered
|
||||
spring. */
|
||||
from { opacity: 0; transform: scale(0.85); }
|
||||
to { opacity: 1; transform: scale(1); }
|
||||
}
|
||||
|
||||
/* mol-deploy-parent-pulse keyframe removed with the effect — the
|
||||
box-shadow expanding ring made the parent card visibly "flash" on
|
||||
every child arrival when the grow pass also bumped width/height.
|
||||
Kept as a deliberate non-class so the theme-tokens vars can drop
|
||||
with it on the next theme pass. */
|
||||
|
||||
@keyframes mol-deploy-root-complete {
|
||||
0% { transform: scale(1); box-shadow: 0 0 0 0 transparent; }
|
||||
40% { transform: scale(var(--mol-deploy-root-scale-peak)); box-shadow: var(--mol-deploy-root-glow); }
|
||||
100% { transform: scale(1); box-shadow: 0 0 0 0 transparent; }
|
||||
}
|
||||
|
||||
/* (mol-deploy-edge-draw keyframe removed with the edge effects.) */
|
||||
|
||||
@keyframes mol-deploy-cancel-pulse {
|
||||
0%, 100% { box-shadow: 0 0 0 0 var(--mol-deploy-cancel-ring); }
|
||||
50% { box-shadow: 0 0 0 10px transparent; }
|
||||
}
|
||||
|
||||
/* ────────────────────────────────────────────────────────
|
||||
Node classes
|
||||
──────────────────────────────────────────────────────── */
|
||||
|
||||
/* Qualify with .react-flow__node so this rule beats the default
|
||||
`node-appear` animation defined later in globals.css. Without
|
||||
the qualifier, CSS source-order wins and the standard
|
||||
node-appear overrides our scale/opacity keyframe, visually
|
||||
dropping the "spawn from parent" motion. */
|
||||
.react-flow__node.mol-deploy-spawn {
|
||||
animation:
|
||||
mol-deploy-spawn var(--mol-duration-spawn) var(--mol-easing-bounce-out) both;
|
||||
}
|
||||
|
||||
/* Provisioning signal — the earlier rotating conic-gradient border
|
||||
read as distracting "spinner" clutter during a 15-child org
|
||||
import (dozens of them spinning simultaneously). A static dim
|
||||
(reduced opacity + saturation) communicates "this one is still
|
||||
coming online" without the motion noise. The locked-child style
|
||||
already uses the same pattern — we reuse the filter values so
|
||||
a provisioning ROOT node and a locked CHILD look consistent. */
|
||||
.mol-deploy-shimmer {
|
||||
filter: saturate(var(--mol-deploy-locked-saturation)) opacity(var(--mol-deploy-locked-opacity));
|
||||
transition: filter var(--mol-duration-base) var(--mol-easing-standard);
|
||||
}
|
||||
|
||||
.mol-deploy-locked {
|
||||
filter: saturate(var(--mol-deploy-locked-saturation)) opacity(var(--mol-deploy-locked-opacity));
|
||||
cursor: not-allowed !important;
|
||||
transition: filter var(--mol-duration-base) var(--mol-easing-standard);
|
||||
}
|
||||
|
||||
.react-flow__node.mol-deploy-root-complete {
|
||||
animation: mol-deploy-root-complete var(--mol-duration-root-complete) var(--mol-easing-emphasize) both;
|
||||
}
|
||||
|
||||
/* ────────────────────────────────────────────────────────
|
||||
Edge classes — intentionally inert.
|
||||
|
||||
Earlier revisions painted incoming edges with a dashed-blueprint
|
||||
→ animated-laser-trace effect as the child landed. User feedback
|
||||
on the first demo was "remove connection line effects" — the
|
||||
moving dashes read as noise during a multi-child deploy. Keeping
|
||||
the class hooks so canvas-events.ts event handlers can still
|
||||
apply/strip them without blowing up, but the styling is a no-op
|
||||
(edges fall through to the default styling in globals.css).
|
||||
If a future demo wants the effect back, wire the rules below.
|
||||
──────────────────────────────────────────────────────── */
|
||||
|
||||
/* ────────────────────────────────────────────────────────
|
||||
Cancel-deployment pill — rendered by OrgCancelButton.tsx
|
||||
attached to the root node during deploy. Class `.mol-deploy-cancel`
|
||||
is always applied; the pulse is additive.
|
||||
──────────────────────────────────────────────────────── */
|
||||
.mol-deploy-cancel {
|
||||
background: var(--mol-deploy-cancel-bg);
|
||||
color: var(--mol-deploy-cancel-text);
|
||||
transition: background var(--mol-duration-fast) var(--mol-easing-standard);
|
||||
}
|
||||
.mol-deploy-cancel:hover {
|
||||
background: var(--mol-deploy-cancel-bg-hover);
|
||||
}
|
||||
.mol-deploy-cancel-pulse {
|
||||
animation: mol-deploy-cancel-pulse var(--mol-duration-parent-pulse) var(--mol-easing-standard) infinite;
|
||||
}
|
||||
|
||||
/* ────────────────────────────────────────────────────────
|
||||
Reduced-motion guard — mirror globals.css's policy so this
|
||||
module stays WCAG 2.3.3 compliant without relying on the
|
||||
global file being loaded first.
|
||||
──────────────────────────────────────────────────────── */
|
||||
@media (prefers-reduced-motion: reduce) {
|
||||
.react-flow__node.mol-deploy-spawn,
|
||||
.react-flow__node.mol-deploy-root-complete,
|
||||
.mol-deploy-cancel-pulse {
|
||||
animation: none !important;
|
||||
}
|
||||
/* Dim-light signal is already static; no override needed. */
|
||||
}
|
||||
69
canvas/src/styles/theme-tokens.css
Normal file
69
canvas/src/styles/theme-tokens.css
Normal file
@ -0,0 +1,69 @@
|
||||
/**
|
||||
* Canvas theme tokens — single source of truth for colors, durations,
|
||||
* easings, and sizes used by every animated / stateful canvas
|
||||
* component. Importable from any stylesheet; individual feature
|
||||
* modules (org-deploy.css, settings-panel.css, ...) only reference
|
||||
* variables defined here so a future theme swap touches this one
|
||||
* file.
|
||||
*
|
||||
* Adding a theme:
|
||||
* Put a scoped override block like `[data-theme="light"] { ... }`
|
||||
* and set only the tokens whose values differ from the default
|
||||
* dark theme. Unset tokens inherit the default.
|
||||
*
|
||||
* Naming convention:
|
||||
* --mol-<feature>-<semantic-role> → values the user sees
|
||||
* --mol-duration-<name> → motion timings
|
||||
* --mol-easing-<name> → motion curves
|
||||
* Prefix `mol-` avoids collisions with Tailwind / React Flow vars.
|
||||
*/
|
||||
|
||||
:root {
|
||||
/* ────────────────────────────────────────────────────────
|
||||
Motion primitives — pick one of these; don't hardcode ms
|
||||
values in feature stylesheets. If a new feature genuinely
|
||||
needs a bespoke duration, add a token here and reference it.
|
||||
──────────────────────────────────────────────────────── */
|
||||
--mol-duration-fast: 150ms;
|
||||
--mol-duration-base: 300ms;
|
||||
--mol-duration-spawn: 350ms;
|
||||
--mol-duration-root-complete: 700ms;
|
||||
--mol-duration-fit-view: 800ms;
|
||||
|
||||
--mol-easing-standard: cubic-bezier(0.2, 0, 0, 1);
|
||||
--mol-easing-bounce-out: cubic-bezier(0.2, 0.8, 0.2, 1.05);
|
||||
--mol-easing-emphasize: cubic-bezier(0.3, 0, 0, 1);
|
||||
|
||||
/* ────────────────────────────────────────────────────────
|
||||
Org-deploy animation palette (dark theme defaults)
|
||||
──────────────────────────────────────────────────────── */
|
||||
|
||||
/* Root-complete moment — one-shot glow when the last child lands. */
|
||||
--mol-deploy-root-glow: 0 0 36px 6px rgba(59, 130, 246, 0.55);
|
||||
--mol-deploy-root-scale-peak: 1.05;
|
||||
|
||||
/* Locked-child visual — non-root nodes during deploy cannot be
|
||||
dragged; this dims them so the user's attention stays on the
|
||||
active spawn. Saturation + opacity instead of a badge keeps
|
||||
the card recognisable while signalling "not available". */
|
||||
--mol-deploy-locked-saturation: 0.55;
|
||||
--mol-deploy-locked-opacity: 0.78;
|
||||
|
||||
/* Cancel-deployment pill attached to the root node. Red, pulsing,
|
||||
one button that kills the whole tree. */
|
||||
--mol-deploy-cancel-bg: rgba(220, 38, 38, 0.92); /* red-600/92 */
|
||||
--mol-deploy-cancel-bg-hover: rgba(239, 68, 68, 1); /* red-500 */
|
||||
--mol-deploy-cancel-ring: rgba(239, 68, 68, 0.45);
|
||||
--mol-deploy-cancel-text: #fff;
|
||||
}
|
||||
|
||||
/* Example template for a future light theme. Intentionally empty
|
||||
— product hasn't shipped a light theme yet but this shows the
|
||||
override surface any future theme must fill. Uncomment + tune
|
||||
when the light theme lands.
|
||||
[data-theme="light"] {
|
||||
--mol-deploy-shimmer-from: rgba(37, 99, 235, 0.08);
|
||||
--mol-deploy-shimmer-to: rgba(37, 99, 235, 0.9);
|
||||
...
|
||||
}
|
||||
*/
|
||||
@ -126,6 +126,13 @@ services:
|
||||
REDIS_URL: redis://redis:6379
|
||||
PORT: "${PLATFORM_PORT:-8080}"
|
||||
PLATFORM_URL: "http://platform:${PLATFORM_PORT:-8080}"
|
||||
# Default MOLECULE_ENV=development so the WorkspaceAuth / AdminAuth
|
||||
# middleware fail-open path activates when ADMIN_TOKEN is unset —
|
||||
# otherwise the canvas (which runs without a bearer in pure local
|
||||
# dev) gets 401 "missing workspace auth token" on every request.
|
||||
# Override to "production" for SaaS/staged deploys; in those modes
|
||||
# ADMIN_TOKEN must also be set or every request rejects.
|
||||
MOLECULE_ENV: "${MOLECULE_ENV:-development}"
|
||||
CORS_ORIGINS: ${CORS_ORIGINS:-http://localhost:${CANVAS_PUBLISH_PORT:-3000},http://127.0.0.1:${CANVAS_PUBLISH_PORT:-3000},http://localhost:3001}
|
||||
RATE_LIMIT: "${RATE_LIMIT:-1000}"
|
||||
CONFIGS_DIR: /configs
|
||||
@ -153,6 +160,24 @@ services:
|
||||
HIBERNATION_IDLE_MINUTES: "${HIBERNATION_IDLE_MINUTES:-}"
|
||||
# Plugin supply chain hardening (issue #768 / PR #775). Never set in production.
|
||||
PLUGIN_ALLOW_UNPINNED: "${PLUGIN_ALLOW_UNPINNED:-}"
|
||||
# Force ImagePull/ContainerCreate to request linux/amd64 manifests
|
||||
# for the workspace-template-* images. The templates ship single-arch
|
||||
# amd64 today; without this override, an arm64 host (Apple Silicon)
|
||||
# asks the daemon for linux/arm64/v8, which doesn't match the manifest
|
||||
# and the pull fails with "no matching manifest". Apple Silicon will
|
||||
# run the amd64 image under Rosetta — slower (~2-3×) but functional.
|
||||
# Override to "" or another platform when the templates start shipping
|
||||
# multi-arch (then this hardcoded amd64 becomes unnecessary).
|
||||
MOLECULE_IMAGE_PLATFORM: "${MOLECULE_IMAGE_PLATFORM:-linux/amd64}"
|
||||
# GHCR auth for the workspace-images refresh endpoint
|
||||
# (POST /admin/workspace-images/refresh). When set, the platform's
|
||||
# Docker SDK ImagePull on private workspace-template-* images
|
||||
# succeeds without per-host `docker login`. GHCR_USER is the GitHub
|
||||
# username; GHCR_TOKEN is a fine-grained PAT with `read:packages`
|
||||
# on the Molecule-AI org. Both unset → endpoint can only pull
|
||||
# public images (current state for all 8 templates).
|
||||
GHCR_USER: "${GHCR_USER:-}"
|
||||
GHCR_TOKEN: "${GHCR_TOKEN:-}"
|
||||
volumes:
|
||||
- ./workspace-configs-templates:/configs
|
||||
- ./org-templates:/org-templates:ro
|
||||
|
||||
@ -77,7 +77,7 @@ CREATE TABLE workspace_secrets (
|
||||
);
|
||||
```
|
||||
|
||||
Stores API keys, credentials, and other secrets needed by workspace agents. Values are encrypted with AES-256 at the application layer. The encryption key comes from the `SECRETS_ENCRYPTION_KEY` environment variable on the platform — never stored in the database.
|
||||
Stores API keys, credentials, and other secrets needed by workspace agents. Values are encrypted with AES-256-GCM at the application layer. The encryption key comes from the tenant's `SECRETS_ENCRYPTION_KEY` environment variable, provisioned at tenant boot by the control plane (which holds the master key in AWS KMS — see [secrets-key-custody.md](./secrets-key-custody.md)). The key is never stored in the database.
|
||||
|
||||
The provisioner reads secrets from this table, decrypts them, and injects them as environment variables when spinning up workspace containers. Secrets are never included in bundles (see [Constraints — Rule 5](../development/constraints-and-rules.md)).
|
||||
|
||||
|
||||
@ -902,7 +902,7 @@ Postgres + Redis + Langfuse only (for local development without containerized wo
|
||||
| `REDIS_URL` | `redis://localhost:6379` | Redis connection |
|
||||
| `PORT` | `8080` | Platform listen port |
|
||||
| `PLATFORM_URL` | `http://host.docker.internal:8080` | Injected to workspace containers |
|
||||
| `SECRETS_ENCRYPTION_KEY` | Optional | AES-256 key (32 bytes) for secret encryption |
|
||||
| `SECRETS_ENCRYPTION_KEY` | Optional | AES-256-GCM key (32 bytes) for tenant secret encryption. Provisioned at tenant boot by the control plane, which holds the master key in AWS KMS — see [secrets-key-custody.md](./secrets-key-custody.md). |
|
||||
| `CONFIGS_DIR` | `/configs` | Workspace config template directory |
|
||||
| `PLUGINS_DIR` | `/plugins` | Shared plugin directory |
|
||||
| `ACTIVITY_RETENTION_DAYS` | `7` | Activity log retention |
|
||||
|
||||
85
docs/architecture/secrets-key-custody.md
Normal file
85
docs/architecture/secrets-key-custody.md
Normal file
@ -0,0 +1,85 @@
|
||||
# Secrets Key Custody
|
||||
|
||||
How the encryption keys that protect Molecule workspace secrets are managed, where each key lives, and what an attacker who compromises one layer can or cannot read.
|
||||
|
||||
This document exists because the platform repo (`workspace-server`) reads `SECRETS_ENCRYPTION_KEY` from its process env, which on its own looks like "encryption-at-rest theater." The full custody chain runs through the control plane (`molecule-controlplane`) where AWS KMS holds the key material at rest. Anyone reading only the platform repo sees half the picture.
|
||||
|
||||
## Two modes
|
||||
|
||||
The control plane's `internal/crypto.Envelope` ships in two modes, picked at boot from env:
|
||||
|
||||
| Mode | Trigger | At-rest format | Recommended for |
|
||||
|------|---------|----------------|-----------------|
|
||||
| **KMS envelope** | `KMS_KEY_ARN` set | Per-blob KMS-wrapped DEK + AES-256-GCM ciphertext | Production, multi-tenant SaaS |
|
||||
| **Static key** | Only `SECRETS_ENCRYPTION_KEY` set | AES-256-GCM with one process-wide key | Dev, self-hosted single-tenant |
|
||||
|
||||
`Envelope.Decrypt` is dual-mode — it can read either format on the way out, so a deployment can flip from static-key to KMS envelope without re-encrypting historical rows. Code: `molecule-controlplane/internal/crypto/kms.go`.
|
||||
|
||||
## KMS envelope flow
|
||||
|
||||
When `KMS_KEY_ARN` is configured, every secret write looks like:
|
||||
|
||||
1. CP calls `kms.GenerateDataKey(KeyId=KMS_KEY_ARN, KeySpec=AES_256)` → returns `{Plaintext, CiphertextBlob}`.
|
||||
2. CP encrypts the secret with AES-256-GCM using `Plaintext` as the key.
|
||||
3. CP discards `Plaintext` from memory; persists the blob:
|
||||
|
||||
```
|
||||
[0x02 prefix][uint16 BE: encrypted_dek_len][encrypted_dek][nonce(12)][ct+tag]
|
||||
```
|
||||
|
||||
The `0x02` byte distinguishes v2 (KMS-wrapped) blobs from legacy static-key blobs.
|
||||
|
||||
4. To read: CP calls `kms.Decrypt(CiphertextBlob)` → recovers the AES key → unwraps the GCM ciphertext.
|
||||
|
||||
KMS calls cost ~$0.03 per 10k requests. We do not cache DEKs — provisioning rate is orders below steady-state reads, and not caching keeps key rotation reasoning simple.
|
||||
|
||||
## What lives where
|
||||
|
||||
| Layer | Key custody | Plaintext key in memory? |
|
||||
|-------|-------------|--------------------------|
|
||||
| AWS KMS | KMS-resident, never leaves the HSM | No (hardware) |
|
||||
| `molecule-controlplane` process | KMS client + IAM role | Briefly per-secret-op only |
|
||||
| CP database (`database_url_encrypted`, tenant secrets) | KMS-wrapped blobs | Never |
|
||||
| Per-tenant `workspace-server` env (`SECRETS_ENCRYPTION_KEY`) | Provisioned at tenant boot by CP | Yes, for the tenant's process lifetime |
|
||||
| Tenant Postgres (`workspace_secrets.value`) | AES-256-GCM with the tenant's key | Never |
|
||||
|
||||
The "plaintext in tenant memory" row is the standard envelope-encryption trade-off: a DEK has to be unwrapped somewhere to be used. The blast radius of compromising one tenant's process is one tenant's secrets — not the whole fleet.
|
||||
|
||||
## Threat model
|
||||
|
||||
| Attacker capability | Can they read tenant secrets? |
|
||||
|---------------------|-------------------------------|
|
||||
| Reads CP database backup | No — KMS unwrap requires IAM-scoped `kms:Decrypt` |
|
||||
| Steals `KMS_KEY_ARN` value | No — ARN alone does nothing without IAM access |
|
||||
| Compromises CP IAM role | Yes — can `kms:Decrypt` any wrapped DEK |
|
||||
| Reads tenant Postgres (one tenant) | No — `SECRETS_ENCRYPTION_KEY` lives only in the tenant's own EC2 process env, not in DB |
|
||||
| Compromises one tenant's EC2 | Yes for that tenant's secrets, no for any other tenant |
|
||||
| Compromises CP host | Game over (CP can provision arbitrary tenants) |
|
||||
|
||||
The two boundaries the design protects:
|
||||
|
||||
- **DB-only compromise (incl. backups)** → secrets remain encrypted; attacker needs separate access to either KMS (prod) or CP env (dev).
|
||||
- **One-tenant compromise** → blast radius limited to that tenant; no cross-tenant key reuse.
|
||||
|
||||
## Rotation
|
||||
|
||||
- **Tenant key rotation** (per-tenant `SECRETS_ENCRYPTION_KEY`): re-encrypt the tenant's `workspace_secrets` rows under a new key, then swap the env var. Static-key mode requires this for all rotation; KMS mode only requires it on suspected key compromise.
|
||||
- **KMS CMK rotation**: AWS KMS handles annual automatic rotation of the customer master key. Re-wrapping data keys is unnecessary because each `Decrypt` call routes through the current CMK version automatically (KMS keeps prior versions for decrypt-only).
|
||||
|
||||
## Audit / compliance posture
|
||||
|
||||
For SOC2 / ISO 27001 / customer security questionnaires:
|
||||
|
||||
- **Key custody**: AWS KMS (FIPS 140-2 Level 3 HSM-backed)
|
||||
- **Key isolation**: per-tenant DEK; no shared keys across tenants
|
||||
- **Access control**: IAM-scoped `kms:Decrypt`, audited via CloudTrail
|
||||
- **At-rest encryption**: AES-256-GCM (NIST-approved, authenticated)
|
||||
- **In-transit encryption**: TLS 1.2+ for KMS, CP-to-tenant, tenant-to-DB
|
||||
- **Rotation**: AWS-managed CMK rotation annually; manual DEK rotation on incident
|
||||
|
||||
## Pointers
|
||||
|
||||
- KMS envelope code: [`molecule-controlplane/internal/crypto/kms.go`](https://github.com/Molecule-AI/molecule-controlplane/blob/main/internal/crypto/kms.go)
|
||||
- Static-key fallback: [`molecule-controlplane/internal/crypto/aes.go`](https://github.com/Molecule-AI/molecule-controlplane/blob/main/internal/crypto/aes.go)
|
||||
- Tenant secrets handler: [`workspace-server/internal/crypto/aes.go`](../../workspace-server/internal/crypto/aes.go)
|
||||
- Tenant secrets schema: [database-schema.md](./database-schema.md#workspace_secrets)
|
||||
@ -56,7 +56,7 @@ Direct A2A calls between workspaces are unauthenticated in MVP. Access control i
|
||||
|
||||
## 11. Secrets in Postgres, Encrypted
|
||||
|
||||
Workspace secrets (API keys, credentials) are stored in Postgres with AES-256 encryption at the application layer. The encryption key comes from the `SECRETS_ENCRYPTION_KEY` environment variable. Secrets are never included in bundles, never logged, never exposed via API responses.
|
||||
Workspace secrets (API keys, credentials) are stored in Postgres with AES-256-GCM encryption at the application layer. The tenant's `SECRETS_ENCRYPTION_KEY` is provisioned at boot by the control plane, which holds the master key material in AWS KMS (envelope encryption, dual-mode with a static-key fallback for dev). Full custody chain in [secrets-key-custody.md](../architecture/secrets-key-custody.md). Secrets are never included in bundles, never logged, never exposed via API responses.
|
||||
|
||||
## 12. Last-Write-Wins for MVP
|
||||
|
||||
|
||||
@ -2,29 +2,67 @@
|
||||
|
||||
## Overview
|
||||
|
||||
The shared workspace runtime infrastructure lives in two places:
|
||||
The shared workspace runtime infrastructure has **one editable source** and
|
||||
**one published artifact**:
|
||||
|
||||
1. **Source of truth (monorepo):** `workspace/` — this is where all development happens
|
||||
2. **Published package:** [`molecule-ai-workspace-runtime`](https://pypi.org/project/molecule-ai-workspace-runtime/) on PyPI
|
||||
1. **Source of truth (monorepo, editable):** `workspace/` — every runtime
|
||||
change lands here. Edit it like any other monorepo code.
|
||||
2. **Published artifact (PyPI, generated):** [`molecule-ai-workspace-runtime`](https://pypi.org/project/molecule-ai-workspace-runtime/)
|
||||
— produced by `.github/workflows/publish-runtime.yml` on every
|
||||
`runtime-vX.Y.Z` tag push. Do NOT edit this independently — it gets
|
||||
overwritten on every publish.
|
||||
|
||||
The legacy sibling repo `molecule-ai-workspace-runtime` (the GitHub repo, as
|
||||
distinct from the PyPI package) is no longer the source-of-truth and should
|
||||
be treated as a publish artifact only. It can be archived or used as a
|
||||
read-only mirror.
|
||||
|
||||
## Why this shape
|
||||
|
||||
The 8 workspace template repos (claude-code, langgraph, hermes, etc.) each
|
||||
build their own Docker image and `pip install molecule-ai-workspace-runtime`
|
||||
from PyPI. PyPI is the right distribution channel — semver, reproducible
|
||||
builds, no submodule dance per-repo. But the runtime ALSO needs to evolve
|
||||
in lock-step with the platform's wire protocol (queue shape, A2A metadata,
|
||||
event payloads). Shipping cross-cutting protocol changes as separate
|
||||
runtime + platform PRs in two repos creates ordering pain and broken
|
||||
intermediate states.
|
||||
|
||||
The monorepo + auto-publish split gives both: edit cross-cutting changes
|
||||
in one PR, publish the runtime artifact via a tag.
|
||||
|
||||
## What's in the package
|
||||
|
||||
Everything in `workspace/` except adapter-specific code:
|
||||
Everything in `workspace/*.py` plus the `adapters/`, `builtin_tools/`,
|
||||
`plugins_registry/`, `policies/`, `skill_loader/` subpackages. Build
|
||||
artifacts (`Dockerfile`, `*.sh`, `pytest.ini`, `requirements.txt`) are
|
||||
excluded.
|
||||
|
||||
- `molecule_runtime/` — all shared `.py` files (main.py, config.py, heartbeat.py, etc.)
|
||||
- `molecule_runtime/adapters/` — `BaseAdapter`, `AdapterConfig`, `SetupResult`, `shared_runtime`
|
||||
- `molecule_runtime/builtin_tools/` — delegation, memory, approvals, sandbox, telemetry
|
||||
- `molecule_runtime/skill_loader/` — skill loading + hot-reload
|
||||
- `molecule_runtime/plugins_registry/` — plugin discovery and install pipeline
|
||||
- `molecule_runtime/policies/` — namespace routing policies
|
||||
- Console script: `molecule-runtime` → `molecule_runtime.main:main_sync`
|
||||
The build script rewrites bare imports so the published package is a
|
||||
proper Python namespace:
|
||||
|
||||
```
|
||||
# In monorepo workspace/:
|
||||
from a2a_client import discover_peer
|
||||
from builtin_tools.memory import store
|
||||
|
||||
# In published molecule_runtime/ (auto-rewritten at publish time):
|
||||
from molecule_runtime.a2a_client import discover_peer
|
||||
from molecule_runtime.builtin_tools.memory import store
|
||||
```
|
||||
|
||||
The closed allowlist of rewritten module names lives in
|
||||
`scripts/build_runtime_package.py` (`TOP_LEVEL_MODULES` + `SUBPACKAGES`).
|
||||
Add a new top-level module to workspace/? Add it to the allowlist in the
|
||||
same PR.
|
||||
|
||||
## Adapter repos
|
||||
|
||||
Each of the 8 adapter repos now contains:
|
||||
Each of the 8 adapter template repos contains:
|
||||
- `adapter.py` — runtime-specific `Adapter` class
|
||||
- `requirements.txt` — `molecule-ai-workspace-runtime>=0.1.0` + adapter deps
|
||||
- `Dockerfile` — standalone image (no longer extends workspace-template:base)
|
||||
- `requirements.txt` — `molecule-ai-workspace-runtime>=0.1.X` + adapter deps
|
||||
- `Dockerfile` — standalone image with `ENV ADAPTER_MODULE=adapter` and
|
||||
`ENTRYPOINT ["molecule-runtime"]`
|
||||
|
||||
| Adapter | Repo |
|
||||
|---------|------|
|
||||
@ -39,8 +77,8 @@ Each of the 8 adapter repos now contains:
|
||||
|
||||
## Adapter discovery (ADAPTER_MODULE)
|
||||
|
||||
Standalone adapter repos set `ENV ADAPTER_MODULE=adapter` in their Dockerfile.
|
||||
The runtime's `get_adapter()` checks this env var first:
|
||||
Standalone adapter repos set `ENV ADAPTER_MODULE=adapter` in their
|
||||
Dockerfile. The runtime's `get_adapter()` checks this env var first:
|
||||
|
||||
```python
|
||||
# In molecule_runtime/adapters/__init__.py
|
||||
@ -49,25 +87,104 @@ def get_adapter(runtime: str) -> type[BaseAdapter]:
|
||||
if adapter_module:
|
||||
mod = importlib.import_module(adapter_module)
|
||||
return getattr(mod, "Adapter")
|
||||
# Fall back to built-in subdirectory scan (monorepo local dev)
|
||||
...
|
||||
raise KeyError(...)
|
||||
```
|
||||
|
||||
## Publishing a new version
|
||||
|
||||
```bash
|
||||
cd workspace-template
|
||||
# 1. Bump version in pyproject.toml
|
||||
# 2. Sync to molecule-ai-workspace-runtime repo
|
||||
# 3. Tag and push — CI publishes to PyPI via PYPI_TOKEN secret
|
||||
# From any local checkout of monorepo, after merging your runtime change:
|
||||
git tag runtime-v0.1.6
|
||||
git push origin runtime-v0.1.6
|
||||
```
|
||||
|
||||
Or manually:
|
||||
```bash
|
||||
cd workspace-template
|
||||
python -m build
|
||||
python -m twine upload dist/*
|
||||
The `publish-runtime` workflow takes over — checks out the tag, runs
|
||||
`scripts/build_runtime_package.py --version 0.1.6`, builds wheel + sdist,
|
||||
runs a smoke import to catch broken rewrites, and uploads to PyPI via
|
||||
the `PYPI_TOKEN` repo secret.
|
||||
|
||||
For dev/test releases without tagging, dispatch the workflow manually
|
||||
with an explicit version (e.g. `0.1.6.dev1` — PEP 440 dev/rc/post forms
|
||||
are accepted).
|
||||
|
||||
After publish, the 8 template repos pick up the new version on their
|
||||
next `:latest` rebuild. To force-pull immediately, bump the pin in each
|
||||
template's `requirements.txt`.
|
||||
|
||||
## End-to-end CD chain
|
||||
|
||||
The full chain from monorepo merge → workspace containers running new code:
|
||||
|
||||
```
|
||||
1. Merge PR with workspace/ changes to main
|
||||
↓
|
||||
2. .github/workflows/auto-tag-runtime.yml fires
|
||||
↓ reads PR labels (release:major/minor) or defaults to patch
|
||||
↓ pushes runtime-vX.Y.Z tag
|
||||
↓
|
||||
3. .github/workflows/publish-runtime.yml fires (on the tag)
|
||||
↓ builds wheel via scripts/build_runtime_package.py
|
||||
↓ smoke-imports the wheel
|
||||
↓ uploads to PyPI
|
||||
↓ cascade job fires repository_dispatch (event-type: runtime-published)
|
||||
↓ to all 8 workspace-template-* repos
|
||||
↓
|
||||
4. Each template's publish-image.yml fires (on repository_dispatch)
|
||||
↓ rebuilds Dockerfile (which pip-installs the new PyPI version)
|
||||
↓ pushes ghcr.io/molecule-ai/workspace-template-<runtime>:latest
|
||||
↓
|
||||
5. Production hosts run scripts/refresh-workspace-images.sh
|
||||
OR an operator hits POST /admin/workspace-images/refresh on the platform
|
||||
↓ docker pull all 8 :latest tags
|
||||
↓ remove + force-recreate any running ws-* containers using a refreshed image
|
||||
↓ canvas re-provisions the workspaces on next interaction
|
||||
```
|
||||
|
||||
Steps 1-4 are fully automated. Step 5 is one-click: a single curl or shell
|
||||
command. SaaS deployments typically wire step 5 into their normal deploy
|
||||
pipeline (every release pulls fresh images on every host); local dev fires
|
||||
it manually after a runtime release lands.
|
||||
|
||||
### Required secrets
|
||||
|
||||
| Secret | Where | Why |
|
||||
|---|---|---|
|
||||
| `PYPI_TOKEN` | molecule-core repo | Twine upload auth (PyPI) |
|
||||
| `TEMPLATE_DISPATCH_TOKEN` | molecule-core repo | Fine-grained PAT with `actions:write` on the 8 template repos. Without it the `cascade` job warns and exits clean — PyPI still publishes; templates just don't auto-rebuild. |
|
||||
|
||||
### Step 5 specifics
|
||||
|
||||
**Local dev (compose stack):**
|
||||
```bash
|
||||
bash scripts/refresh-workspace-images.sh # all runtimes
|
||||
bash scripts/refresh-workspace-images.sh --runtime claude-code
|
||||
bash scripts/refresh-workspace-images.sh --no-recreate # pull only, leave containers
|
||||
```
|
||||
|
||||
**Via platform admin endpoint (any deploy):**
|
||||
```bash
|
||||
curl -X POST "$PLATFORM/admin/workspace-images/refresh"
|
||||
curl -X POST "$PLATFORM/admin/workspace-images/refresh?runtime=claude-code"
|
||||
curl -X POST "$PLATFORM/admin/workspace-images/refresh?recreate=false"
|
||||
```
|
||||
|
||||
The endpoint pulls + recreates from inside the platform container, so it
|
||||
needs Docker socket access (the compose stack mounts
|
||||
`/var/run/docker.sock` already) AND GHCR auth on the host's docker config
|
||||
(`docker login ghcr.io` once per host). On a fresh host without GHCR auth,
|
||||
the pull step warns per runtime and the response surfaces the failures.
|
||||
|
||||
## Local dev (build the package without publishing)
|
||||
|
||||
```bash
|
||||
python3 scripts/build_runtime_package.py --version 0.1.0-local --out /tmp/runtime-build
|
||||
cd /tmp/runtime-build
|
||||
python -m build # produces dist/*.whl + dist/*.tar.gz
|
||||
pip install dist/*.whl # install into a venv to test locally
|
||||
```
|
||||
|
||||
This is the same pipeline CI runs. Use it to validate import-rewrite
|
||||
correctness before pushing a `runtime-v*` tag.
|
||||
|
||||
## Writing a new adapter
|
||||
|
||||
@ -75,5 +192,18 @@ python -m twine upload dist/*
|
||||
2. Copy `adapter.py` pattern from any existing adapter repo
|
||||
3. Change imports: `from molecule_runtime.adapters.base import BaseAdapter, AdapterConfig`
|
||||
4. Create `requirements.txt` with `molecule-ai-workspace-runtime>=0.1.0` + your deps
|
||||
5. Create `Dockerfile` with `ENV ADAPTER_MODULE=adapter` and `ENTRYPOINT ["molecule-runtime"]`
|
||||
5. Create `Dockerfile` with `ENV ADAPTER_MODULE=adapter` and
|
||||
`ENTRYPOINT ["molecule-runtime"]`
|
||||
6. Register the runtime name in the platform's known runtimes list
|
||||
|
||||
## Migration note
|
||||
|
||||
Prior to this workflow, the runtime was duplicated across monorepo
|
||||
`workspace/` AND a sibling repo `molecule-ai-workspace-runtime`, with no
|
||||
sync mechanism. That caused 30+ files to drift between the two trees and
|
||||
tonight's chat-leak / queued-classification fixes existed only in the
|
||||
monorepo copy until manually ported.
|
||||
|
||||
If you have an old local checkout of `molecule-ai-workspace-runtime`, treat
|
||||
it as outdated. The monorepo `workspace/` is now authoritative; the PyPI
|
||||
artifact is rebuilt from it on every `runtime-v*` tag.
|
||||
|
||||
@ -39,6 +39,7 @@
|
||||
{"name": "free-beats-all", "repo": "Molecule-AI/molecule-ai-org-template-free-beats-all", "ref": "main"},
|
||||
{"name": "medo-smoke", "repo": "Molecule-AI/molecule-ai-org-template-medo-smoke", "ref": "main"},
|
||||
{"name": "molecule-worker-gemini", "repo": "Molecule-AI/molecule-ai-org-template-molecule-worker-gemini", "ref": "main"},
|
||||
{"name": "reno-stars", "repo": "Molecule-AI/molecule-ai-org-template-reno-stars", "ref": "main"}
|
||||
{"name": "reno-stars", "repo": "Molecule-AI/molecule-ai-org-template-reno-stars", "ref": "main"},
|
||||
{"name": "ux-ab-lab", "repo": "Molecule-AI/molecule-ai-org-template-ux-ab-lab", "ref": "main"}
|
||||
]
|
||||
}
|
||||
|
||||
298
scripts/build_runtime_package.py
Executable file
298
scripts/build_runtime_package.py
Executable file
@ -0,0 +1,298 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Build the molecule-ai-workspace-runtime PyPI package from monorepo workspace/.
|
||||
|
||||
Monorepo workspace/ is the single source-of-truth for runtime code. The PyPI
|
||||
package is a publish-time mirror produced by this script, NOT a parallel
|
||||
editable copy. Anyone editing the runtime should edit workspace/, never the
|
||||
sibling molecule-ai-workspace-runtime repo.
|
||||
|
||||
What this does
|
||||
--------------
|
||||
1. Copies workspace/ source into build/molecule_runtime/ (note the rename:
|
||||
bare modules become a real Python package).
|
||||
2. Rewrites top-level imports so e.g. `from a2a_client import X` becomes
|
||||
`from molecule_runtime.a2a_client import X`. The rewrite is regex-based
|
||||
on a closed allowlist of modules — third-party imports like `from a2a.X`
|
||||
(the a2a-sdk package) are left alone because the regex is anchored on
|
||||
exact module names.
|
||||
3. Writes a pyproject.toml with the requested version + the README + the
|
||||
py.typed marker.
|
||||
4. Leaves the build dir ready for `python -m build` to produce a wheel/sdist.
|
||||
|
||||
Usage
|
||||
-----
|
||||
scripts/build_runtime_package.py --version 0.1.6 --out /tmp/runtime-build
|
||||
cd /tmp/runtime-build && python -m build
|
||||
python -m twine upload dist/*
|
||||
|
||||
The publish workflow (.github/workflows/publish-runtime.yml) drives this
|
||||
on every `runtime-v*` tag push.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import re
|
||||
import shutil
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Top-level Python modules in workspace/ that become molecule_runtime.X.
# Anything imported as `from <name> import` or `import <name>` (where <name>
# matches one of these) gets rewritten to use the package prefix.
#
# Closed list (not "every .py we copy") because a typo in workspace/ would
# otherwise leak into a wrong rewrite. Update this when adding a new
# top-level module to workspace/ — in the same PR, per
# docs/workspace-runtime-package.md.
TOP_LEVEL_MODULES = {
    "a2a_cli",
    "a2a_client",
    "a2a_executor",
    "a2a_mcp_server",
    "a2a_tools",
    "adapter_base",
    "agent",
    "agents_md",
    "claude_sdk_executor",
    "cli_executor",
    "config",
    "consolidation",
    "coordinator",
    "events",
    "executor_helpers",
    "heartbeat",
    "hermes_executor",
    "initial_prompt",
    "main",
    "molecule_ai_status",
    "platform_auth",
    "plugins",
    "preflight",
    "prompt",
    "shared_runtime",
}

# Subdirectory packages — these are already real packages (they have or will
# have __init__.py) so the rewrite is `from <pkg>` → `from molecule_runtime.<pkg>`.
SUBPACKAGES = {
    "adapters",
    "builtin_tools",
    "plugins_registry",
    "policies",
    "skill_loader",
}

# Files in workspace/ NOT included in the published package. These are
# build artifacts, dev scripts, or monorepo-only scaffolding.
EXCLUDE_FILES = {
    "Dockerfile",
    "build-all.sh",
    "rebuild-runtime-images.sh",
    "entrypoint.sh",
    "pytest.ini",
    "requirements.txt",
    # Note: adapter_base.py, agents_md.py, hermes_executor.py, shared_runtime.py
    # are kept (referenced by adapters/__init__.py and other modules); they get
    # their imports rewritten via TOP_LEVEL_MODULES. Excluding them broke the
    # smoke-test install with `ModuleNotFoundError: adapter_base`.
}

# Directories in workspace/ NOT included in the published package
# (caches, tests, vendored libs, monorepo-only tooling).
EXCLUDE_DIRS = {
    "__pycache__",
    "tests",
    "lib",
    "molecule_audit",
    "scripts",
}
|
||||
|
||||
|
||||
def build_import_rewriter() -> re.Pattern:
    """Compile a single regex matching all import statements that need
    rewriting.

    The named groups capture the leading indent, the keyword (`from` /
    `import`), the module name, and the single boundary character that
    follows, so the replacement can re-emit the statement with the
    `molecule_runtime.` prefix while preserving whitespace and trailing
    punctuation.

    Modules included: TOP_LEVEL_MODULES ∪ SUBPACKAGES.

    Word-boundary safety comes from the `rest` group, not a lookahead: the
    module name must be immediately followed by whitespace, a dot, a comma,
    or end-of-line. That is what keeps `from a2a.server.X import Y` (the
    third-party a2a-sdk) from matching any of our `a2a_*` names, and keeps
    a name that is a prefix of another (e.g. `plugins` inside
    `plugins_registry`) from matching partially — the regex engine
    backtracks to the longer alternative because the boundary character
    itself must match.
    """
    names = sorted(TOP_LEVEL_MODULES | SUBPACKAGES)
    alt = "|".join(re.escape(n) for n in names)
    # Matches, anchored at line start (multiline mode):
    #   from <name>(. | whitespace | import …)
    #   import <name>(whitespace | ',' | EOL)
    # And captures the keyword + name so we can re-emit with prefix.
    pattern = (
        r"(?m)^(?P<indent>\s*)"  # leading whitespace (preserved)
        r"(?P<kw>from|import)\s+"  # 'from' or 'import'
        r"(?P<mod>" + alt + r")"  # the module name (closed allowlist)
        r"(?P<rest>[\s.,]|$)"  # boundary: '.subpath', ' import …', ',', whitespace, EOL
    )
    return re.compile(pattern)
|
||||
|
||||
|
||||
def rewrite_imports(text: str, regex: re.Pattern) -> str:
    """Replace bare imports with package-prefixed ones.

    `import X`            → `import molecule_runtime.X as X` (preserve binding)
    `import X as Y`       → `import molecule_runtime.X as Y` (keep the alias)
    `import X.sub`        → `import molecule_runtime.X.sub`
    `from X import Y`     → `from molecule_runtime.X import Y`
    `from X.sub import Y` → `from molecule_runtime.X.sub import Y`
    """
    def repl(m: re.Match) -> str:
        indent, kw, mod, rest = m.group("indent"), m.group("kw"), m.group("mod"), m.group("rest")
        if kw == "from":
            # `from X` or `from X.sub` — always safe to prefix.
            return f"{indent}from molecule_runtime.{mod}{rest}"
        if rest.startswith("."):
            # `import X.sub` — rewrite as `import molecule_runtime.X.sub`;
            # the dotted tail after the match stays on the line untouched.
            return f"{indent}import molecule_runtime.{mod}{rest}"
        # `import X [as Y]` — peek at the unmatched remainder of this line.
        # If the author already wrote an explicit alias, keep it: blindly
        # appending ` as X` would produce the syntactically invalid
        # `import molecule_runtime.X as X as Y`. Only same-line text counts
        # (when `rest` matched EOL the remainder begins with a newline and
        # the split yields an empty string).
        same_line = m.string[m.end():].split("\n", 1)[0]
        if re.match(r"\s*as\b", same_line):
            return f"{indent}import molecule_runtime.{mod}{rest}"
        # Plain `import X` — alias preserves the local binding name so
        # callers' `X.foo` references keep working.
        return f"{indent}import molecule_runtime.{mod} as {mod}{rest}"
    return regex.sub(repl, text)
|
||||
|
||||
|
||||
def copy_tree_filtered(src: Path, dst: Path) -> list[Path]:
    """Recursively mirror src/ into dst/, skipping EXCLUDE_FILES and
    EXCLUDE_DIRS.

    Returns every copied .py file so the caller can run the import
    rewrite over them in a single pass. dst is wiped first if it exists.
    """
    if dst.exists():
        shutil.rmtree(dst)
    dst.mkdir(parents=True)

    copied: list[Path] = []
    for item in src.iterdir():
        if item.is_dir():
            # Recurse into non-excluded subdirectories; their .py files
            # roll up into our result list.
            if item.name not in EXCLUDE_DIRS:
                copied.extend(copy_tree_filtered(item, dst / item.name))
            continue
        if item.name in EXCLUDE_FILES:
            continue
        target = dst / item.name
        shutil.copy2(item, target)
        if target.suffix == ".py":
            copied.append(target)
    return copied
||||
|
||||
|
||||
# pyproject.toml emitted at the build root. `{version}` is the only
# str.format substitution; literal TOML braces (the license inline table)
# are escaped as doubled {{ }}.
PYPROJECT_TEMPLATE = """\
[build-system]
requires = ["setuptools>=68.0", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "molecule-ai-workspace-runtime"
version = "{version}"
description = "Molecule AI workspace runtime — shared infrastructure for all agent adapters"
requires-python = ">=3.11"
license = {{text = "BSL-1.1"}}
readme = "README.md"
dependencies = [
    "a2a-sdk[http-server]>=1.0.0,<2.0",
    "httpx>=0.27.0",
    "uvicorn>=0.30.0",
    "starlette>=0.38.0",
    "websockets>=12.0",
    "pyyaml>=6.0",
    "langchain-core>=0.3.0",
    "opentelemetry-api>=1.24.0",
    "opentelemetry-sdk>=1.24.0",
    "opentelemetry-exporter-otlp-proto-http>=1.24.0",
    "temporalio>=1.7.0",
]

[project.scripts]
molecule-runtime = "molecule_runtime.main:main_sync"

[tool.setuptools.packages.find]
where = ["."]
include = ["molecule_runtime*"]

[tool.setuptools.package-data]
"molecule_runtime" = ["py.typed"]
"""


# README.md published to PyPI alongside the package (no substitutions).
# It deliberately points contributors back at the monorepo workspace/
# directory as the only editable source.
README_TEMPLATE = """\
# molecule-ai-workspace-runtime

Shared workspace runtime for [Molecule AI](https://github.com/Molecule-AI/molecule-core)
agent adapters. Installed by every workspace template image
(`workspace-template-claude-code`, `-langgraph`, `-hermes`, etc.) to provide
A2A delegation, heartbeat, memory, plugin loading, and skill management.

This package is **published from the molecule-core monorepo `workspace/`
directory** by the `publish-runtime` GitHub Actions workflow on every
`runtime-v*` tag push. **Do not edit this package directly** — edit
`workspace/` in the monorepo.

See [`docs/workspace-runtime-package.md`](https://github.com/Molecule-AI/molecule-core/blob/main/docs/workspace-runtime-package.md)
for the publish flow and architecture.
"""
|
||||
|
||||
|
||||
def main() -> int:
    """CLI entry point: materialize the molecule_runtime package tree.

    Copies workspace/ into <out>/molecule_runtime/, rewrites bare imports
    to the package-prefixed form, and emits pyproject.toml + README.md so
    the output directory is ready for `python -m build`.

    Returns a process exit code: 0 on success, 2 when --source is not a
    directory.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--version", required=True, help="Package version, e.g. 0.1.6")
    parser.add_argument("--out", required=True, type=Path, help="Build output directory (will be wiped)")
    parser.add_argument("--source", type=Path, default=Path(__file__).resolve().parent.parent / "workspace",
                        help="Path to monorepo workspace/ directory (default: ../workspace from this script)")
    args = parser.parse_args()

    source_dir = args.source.resolve()
    build_root = args.out.resolve()
    if not source_dir.is_dir():
        print(f"error: source not a directory: {source_dir}", file=sys.stderr)
        return 2

    package_dir = build_root / "molecule_runtime"
    print(f"[build] source: {source_dir}")
    print(f"[build] output: {build_root}")
    print(f"[build] package: {package_dir}")

    if build_root.exists():
        shutil.rmtree(build_root)
    build_root.mkdir(parents=True)

    copied = copy_tree_filtered(source_dir, package_dir)
    print(f"[build] copied {len(copied)} .py files")

    # Ensure top-level package marker exists. workspace/ doesn't have one
    # (it's not a package in monorepo), but the published artifact must.
    init_file = package_dir / "__init__.py"
    if not init_file.exists():
        init_file.write_text('"""Molecule AI workspace runtime."""\n')

    # Touch py.typed so type-checkers in adapter consumers see the package
    # as typed. Empty file is the convention.
    (package_dir / "py.typed").touch()

    # One rewrite pass over every copied .py plus the generated __init__.py.
    rewriter = build_import_rewriter()
    changed = 0
    for path in [*copied, init_file]:
        before = path.read_text()
        after = rewrite_imports(before, rewriter)
        if after != before:
            path.write_text(after)
            changed += 1
    print(f"[build] rewrote imports in {changed} files")

    # Emit pyproject.toml + README at build root.
    (build_root / "pyproject.toml").write_text(PYPROJECT_TEMPLATE.format(version=args.version))
    (build_root / "README.md").write_text(README_TEMPLATE)

    print("[build] done. To publish:")
    print(f" cd {build_root}")
    print(" python -m build")
    print(" python -m twine upload dist/*")
    return 0


if __name__ == "__main__":
    sys.exit(main())
|
||||
95
scripts/refresh-workspace-images.sh
Executable file
95
scripts/refresh-workspace-images.sh
Executable file
@ -0,0 +1,95 @@
|
||||
#!/usr/bin/env bash
|
||||
# refresh-workspace-images.sh — pull the latest workspace template images
|
||||
# from GHCR and recreate any running ws-* containers against the new digest.
|
||||
#
|
||||
# This is the local-dev / single-host equivalent of step 5 of the runtime CD
|
||||
# chain (see docs/workspace-runtime-package.md). On a SaaS deployment the
|
||||
# host's deploy pipeline does the pull on every release; this script is
|
||||
# what to run on a local docker-compose host after a runtime release lands.
|
||||
#
|
||||
# Usage:
|
||||
# bash scripts/refresh-workspace-images.sh # pull all 8 + recreate running ws-*
|
||||
# bash scripts/refresh-workspace-images.sh --runtime claude-code # pull just one template
|
||||
# bash scripts/refresh-workspace-images.sh --no-recreate # pull only, leave containers
|
||||
#
|
||||
# Behavior:
|
||||
# - Always pulls fresh; docker is a no-op if local matches remote, so
|
||||
# repeated runs are cheap.
|
||||
# - Recreate is "kill + remove + let the next canvas interaction re-
|
||||
# provision" — simpler than `docker stop / docker run` because the
|
||||
# platform owns the run flags. Workspaces re-register on next probe.
|
||||
# - If a container is mid-conversation, the kill cancels in-flight work.
|
||||
# Run during a quiet window OR add --no-recreate and recreate manually
|
||||
# via canvas Restart buttons.
|
||||
|
||||
set -euo pipefail

GREEN='\033[0;32m'
YELLOW='\033[0;33m'
RED='\033[0;31m'
NC='\033[0m'
log() { echo -e "${GREEN}[refresh]${NC} $1" >&2; }
warn() { echo -e "${YELLOW}[refresh]${NC} $1" >&2; }
err() { echo -e "${RED}[refresh]${NC} $1" >&2; }

ALL_RUNTIMES=(claude-code langgraph crewai autogen deepagents hermes gemini-cli openclaw)
RUNTIMES=("${ALL_RUNTIMES[@]}")
RECREATE=true

while [ $# -gt 0 ]; do
  case "$1" in
    --runtime) RUNTIMES=("$2"); shift 2;;
    --no-recreate) RECREATE=false; shift;;
    -h|--help) sed -n '2,30p' "$0"; exit 0;;
    *) err "unknown arg: $1"; exit 2;;
  esac
done

# 1. Pull fresh tags. Soft-fail per runtime — one missing image (e.g., a
#    template that hasn't been published yet) shouldn't abort the others.
log "pulling latest images for: ${RUNTIMES[*]}"
PULLED=()
FAILED=()
for rt in "${RUNTIMES[@]}"; do
  IMG="ghcr.io/molecule-ai/workspace-template-$rt:latest"
  if docker pull "$IMG" >/dev/null 2>&1; then
    log " ✓ $rt"
    PULLED+=("$rt")
  else
    warn " ✗ $rt (pull failed — image may not exist or auth missing)"
    FAILED+=("$rt")
  fi
done

if [ "$RECREATE" = "false" ]; then
  log "skip-recreate set — leaving containers untouched"
  log "done. pulled=${#PULLED[@]} failed=${#FAILED[@]}"
  exit 0
fi

# 2. Find ws-* containers whose image is one of the runtimes we pulled.
#    `docker ps -a` deliberately includes stopped containers — a stale
#    stopped container would otherwise come back later on the old image.
#    The ${arr[@]+...} guard keeps `set -u` happy when PULLED is empty:
#    bash < 4.4 (notably macOS /bin/bash 3.2) treats "${arr[@]}" on an
#    empty array as an unbound-variable error and would abort the script.
log "scanning ws-* containers for stale images..."
TO_RECREATE=()
for cn in $(docker ps -a --filter "name=ws-" --format "{{.Names}}"); do
  IMG=$(docker inspect "$cn" --format '{{.Config.Image}}' 2>/dev/null || echo "")
  for rt in ${PULLED[@]+"${PULLED[@]}"}; do
    if [[ "$IMG" == *"workspace-template-$rt"* ]]; then
      TO_RECREATE+=("$cn")
      break
    fi
  done
done

if [ "${#TO_RECREATE[@]}" -eq 0 ]; then
  log "no running ws-* containers using a refreshed image — nothing to recreate"
  exit 0
fi

# 3. Kill + remove. Canvas next-interaction will re-provision.
#    (TO_RECREATE is guaranteed non-empty here — checked above — so the
#    plain expansion is safe even on old bash.)
log "recreating ${#TO_RECREATE[@]} containers (canvas will re-provision on next interaction)"
for cn in "${TO_RECREATE[@]}"; do
  docker rm -f "$cn" >/dev/null 2>&1 && log " removed $cn" || warn " failed to remove $cn"
done

log "done. open the canvas and the workspaces will re-provision against the new image."
|
||||
93
tests/e2e/test_chat_attachments_e2e.sh
Executable file
93
tests/e2e/test_chat_attachments_e2e.sh
Executable file
@ -0,0 +1,93 @@
|
||||
#!/usr/bin/env bash
|
||||
# E2E test: chat file attachment round-trip
|
||||
#
|
||||
# Proves the full drag-drop → agent-reads → agent-returns-file → download
|
||||
# path against a live workspace. Runs against the local workspace-server
|
||||
# on :8080 with a hermes workspace already online. The test is provider-
|
||||
# agnostic as long as the agent has a valid API key — it only asserts
|
||||
# that attachments surface on both ends, not a specific reply shape.
|
||||
#
|
||||
# Usage: WSID=<workspace-id> tests/e2e/test_chat_attachments_e2e.sh
|
||||
# (pass WSID for an existing hermes workspace)
|
||||
#
|
||||
# Prereqs:
|
||||
# - workspace-server on http://localhost:8080
|
||||
# - the WSID workspace is online, runtime=hermes
|
||||
# - a working provider key (MINIMAX_API_KEY / ANTHROPIC_API_KEY / etc.)
|
||||
# - /workspace writable by the agent user (some templates ship it
|
||||
# root-owned; chmod 777 for the E2E or use a writable template)
|
||||
|
||||
set -euo pipefail

WSID="${WSID:?WSID=<workspace-id> required}"
BASE="${BASE:-http://localhost:8080}"

# Section banner on stdout so pass/fail output reads as a narrative.
log() { printf "\n=== %s ===\n" "$*"; }

log "Preflight: workspace online?"
STATUS=$(curl -s "$BASE/workspaces/$WSID" | python3 -c 'import json,sys;print(json.load(sys.stdin)["status"])')
[ "$STATUS" = "online" ] || { echo "workspace not online ($STATUS)"; exit 1; }

log "Step 1 — Upload a text file via /chat/uploads"
TEST_FILE=$(mktemp -t hermes-e2e-XXXXXX.txt)
echo "secret code: $(openssl rand -hex 4)-$(openssl rand -hex 4)" > "$TEST_FILE"
# The last whitespace-separated token of the file is the random code the
# agent must quote back. (awk reads the file directly — no cat pipeline.)
EXPECTED=$(awk '{print $NF}' "$TEST_FILE")
UPLOAD=$(curl -s -X POST "$BASE/workspaces/$WSID/chat/uploads" -F "files=@$TEST_FILE")
URI=$(echo "$UPLOAD" | python3 -c 'import json,sys;print(json.load(sys.stdin)["files"][0]["uri"])')
[ -n "$URI" ] || { echo "upload failed: $UPLOAD"; exit 1; }
echo "uploaded: $URI"

log "Step 2 — A2A message with file part; expect agent to quote the code"
# Build the JSON via a python helper so the URI value doesn't have to be
# shell-interpolated through a heredoc (the { } tokens in a JSON body
# collide with bash brace-expansion when quoted wrong). Note: `True` is
# Python — json.dumps serializes it to JSON `true`.
PAYLOAD=$(URI="$URI" python3 -c '
import json, os
uri = os.environ["URI"]
print(json.dumps({
 "jsonrpc":"2.0","id":"e2e-up","method":"message/send",
 "params":{"message":{"role":"user","messageId":"e2e-up","kind":"message","parts":[
 {"kind":"text","text":"Read the attached file and tell me the exact secret code."},
 {"kind":"file","file":{"name":"test.txt","mimeType":"text/plain","uri":uri}},
 ]},"configuration":{"acceptedOutputModes":["text/plain"],"blocking":True}}}))
')
REPLY=$(curl -s -X POST "$BASE/workspaces/$WSID/a2a" \
 -H 'Content-Type: application/json' \
 --max-time 120 \
 -d "$PAYLOAD")
# Concatenate every text part of the reply; the assertion below only needs
# the secret code to appear somewhere in it.
REPLY_TEXT=$(echo "$REPLY" | python3 -c 'import json,sys;d=json.load(sys.stdin);[print(p.get("text","")) for p in d["result"]["parts"] if p.get("kind")=="text"]')
echo "agent reply: $REPLY_TEXT"
if echo "$REPLY_TEXT" | grep -qF "$EXPECTED"; then
 echo "PASS: agent saw the attached file"
else
 echo "FAIL: agent reply missing expected code '$EXPECTED'"
 exit 1
fi

log "Step 3 — Seed a file inside /workspace and ask agent to reference it"
# Relies on /workspace being writable by the platform (we copy as root via
# docker exec, mimicking the path a real agent would use through its tools).
# Container name is matched on the first 12 chars of the workspace id.
CONTAINER=$(docker ps --format '{{.Names}}' | grep -E "^ws-${WSID:0:12}" | head -1)
[ -n "$CONTAINER" ] || { echo "container not found"; exit 1; }
docker exec "$CONTAINER" sh -c 'echo "E2E report body $(date -u +%s)" > /workspace/e2e-report.txt'

REPLY=$(curl -s -X POST "$BASE/workspaces/$WSID/a2a" \
 -H 'Content-Type: application/json' \
 --max-time 120 \
 -d '{"jsonrpc":"2.0","id":"e2e-down","method":"message/send","params":{"message":{"role":"user","messageId":"e2e-down","kind":"message","parts":[{"kind":"text","text":"There is a file at /workspace/e2e-report.txt. Mention its exact path in your reply so I can download it."}]},"configuration":{"acceptedOutputModes":["text/plain"],"blocking":true}}}')
# First file part of the reply is the attachment we expect the runtime to
# have surfaced for the mentioned /workspace path.
FILE_URI=$(echo "$REPLY" | python3 -c 'import json,sys,re;d=json.load(sys.stdin);[print(p["file"]["uri"]) for p in d["result"]["parts"] if p.get("kind")=="file"]' | head -1)
[ -n "$FILE_URI" ] || { echo "FAIL: agent reply had no file part"; echo "$REPLY"; exit 1; }
echo "agent attached: $FILE_URI"

log "Step 4 — Download via /chat/download"
# Strip the `workspace:` scheme prefix to get the container-relative path.
DL_PATH=${FILE_URI#workspace:}
BODY=$(curl -s "$BASE/workspaces/$WSID/chat/download?path=$DL_PATH")
echo "downloaded: $BODY"
if echo "$BODY" | grep -q "E2E report body"; then
 echo "PASS: downloaded the agent-returned file"
else
 echo "FAIL: download did not return expected body"
 exit 1
fi

log "ALL E2E CHECKS PASSED"
|
||||
149
tests/e2e/test_chat_attachments_multiruntime_e2e.sh
Executable file
149
tests/e2e/test_chat_attachments_multiruntime_e2e.sh
Executable file
@ -0,0 +1,149 @@
|
||||
#!/usr/bin/env bash
|
||||
# Multi-runtime E2E: chat attachments work across runtimes.
|
||||
#
|
||||
# The platform-level attachment helpers live in
|
||||
# molecule_runtime.executor_helpers. Every runtime's executor is
|
||||
# expected to call them. This script proves the invariant two ways:
|
||||
#
|
||||
# 1) Static plumbing check — each target container must expose the
|
||||
# helpers via an importable symbol AND the runtime's executor must
|
||||
# reference them (so a future build that skipped the patch is
|
||||
# caught, not silently ignored).
|
||||
#
|
||||
# 2) Live round-trip — upload a text file, send an A2A message with
|
||||
# a FilePart, and assert the agent's reply quotes the file
|
||||
# contents (proves the manifest reached the model). Skipped with
|
||||
# a PASS-NOTE when the runtime lacks valid provider credentials,
|
||||
# because a missing ANTHROPIC_API_KEY / CLAUDE_CODE_OAUTH_TOKEN
|
||||
# is infra, not platform plumbing.
|
||||
#
|
||||
# Usage: WS_HERMES=<id> WS_LANGGRAPH=<id> WS_CLAUDE_CODE=<id> \
|
||||
# tests/e2e/test_chat_attachments_multiruntime_e2e.sh
|
||||
|
||||
set -uo pipefail

# Platform base URL; override with BASE=... to target a remote deployment.
BASE="${BASE:-http://localhost:8080}"
# Running tally of failed runtime checks; non-zero => exit 1 at the bottom.
fails=0
|
||||
|
||||
# has_patch_in_container CONTAINER — static plumbing check, part 1.
# Proves the platform attachment helpers are importable inside the
# container by importing all four symbols from
# molecule_runtime.executor_helpers; a build that dropped the patch
# fails the import and this function returns non-zero.
has_patch_in_container() {
    local container="$1"
    # Import (not grep) the four helper symbols — if any is missing,
    # a future build dropped the patch and the python exits 1, which
    # becomes this function's return status via docker exec.
    docker exec "$container" python3 -c '
import sys
try:
    from molecule_runtime.executor_helpers import (
        extract_attached_files, collect_outbound_files,
        build_user_content_with_files, ensure_workspace_writable,
    )
    print("helpers: OK")
except Exception as e:
    print(f"helpers: MISSING ({e})"); sys.exit(1)
' 2>&1
}
|
||||
|
||||
# has_executor_patched CONTAINER RUNTIME — static plumbing check, part 2.
# Verifies the runtime's executor actually references the platform
# attachment helpers (importability alone isn't enough — the executor
# must call them):
#   hermes:      /app/executor.py must call build_user_content_with_files
#   langgraph:   molecule_runtime/a2a_executor.py must call extract_attached_files
#   claude-code: the monkey-patch must have installed
#                ClaudeSDKExecutor.execute as _execute_with_attachments
has_executor_patched() {
    local container="$1" runtime="$2"
    case "$runtime" in
    hermes)
        docker exec "$container" grep -q "build_user_content_with_files" /app/executor.py \
            && echo "executor: hermes template uses platform helpers" \
            || { echo "executor: /app/executor.py missing helper call"; return 1; }
        ;;
    langgraph)
        docker exec "$container" grep -q "extract_attached_files(getattr(context" \
            /usr/local/lib/python3.11/site-packages/molecule_runtime/a2a_executor.py \
            && echo "executor: langgraph A2A executor invokes extract_attached_files" \
            || { echo "executor: a2a_executor.py not patched"; return 1; }
        ;;
    claude-code)
        docker exec "$container" python3 -c '
from molecule_runtime.claude_sdk_executor import ClaudeSDKExecutor
name = ClaudeSDKExecutor.execute.__qualname__
assert name.endswith("_execute_with_attachments"), f"unpatched: {name}"
print(f"executor: claude-code monkey-patch active ({name})")
' 2>&1 || return 1
        ;;
    *)
        # Fail loud on an unknown runtime instead of silently passing:
        # the old case had no default branch, so a typo'd runtime name
        # fell through with status 0 and the check was vacuous.
        echo "executor: unknown runtime '$runtime' — no patch signature defined"
        return 1
        ;;
    esac
}
|
||||
|
||||
# round_trip LABEL WSID — live attachment round-trip for one workspace:
# upload a text file carrying a random secret, send it as an A2A
# FilePart via the platform proxy, and check the agent's reply quotes
# the secret (proving the attachment manifest reached the model).
# Missing-credential replies are tolerated as SKIP (infra, not
# platform plumbing); any other mismatch is INFO-only because the
# static plumbing checks already asserted the platform layer.
# Returns non-zero only when the upload itself fails.
round_trip() {
    local label="$1" wsid="$2"
    local test_file expected upload uri payload reply reply_text
    test_file=$(mktemp -t e2e-mr-XXXX.txt)
    expected="secret-$(openssl rand -hex 6)"
    echo "$expected" > "$test_file"
    upload=$(curl -s -X POST "$BASE/workspaces/$wsid/chat/uploads" -F "files=@$test_file")
    uri=$(echo "$upload" | python3 -c 'import json,sys;print(json.load(sys.stdin)["files"][0]["uri"])' 2>/dev/null)
    [ -z "$uri" ] && { echo "FAIL $label: upload returned no URI: $upload"; rm -f "$test_file"; return 1; }
    # Build the A2A payload in python so the URI is JSON-escaped
    # correctly (passed via env, not string interpolation).
    payload=$(URI="$uri" python3 -c '
import json, os
uri = os.environ["URI"]
print(json.dumps({
    "jsonrpc":"2.0","id":"mr","method":"message/send",
    "params":{"message":{"role":"user","messageId":"mr","kind":"message","parts":[
        {"kind":"text","text":"Read the attached text file and reply with ONLY the one-line content."},
        {"kind":"file","file":{"name":"probe.txt","mimeType":"text/plain","uri":uri}},
    ]},"configuration":{"acceptedOutputModes":["text/plain"],"blocking":True}}}))')

    # Hit the platform proxy, with generous timeout — some runtimes warm on first call
    reply=$(curl -s -X POST "$BASE/workspaces/$wsid/a2a" \
        -H 'Content-Type: application/json' --max-time 120 -d "$payload")
    # Strip control characters before parsing (some runtimes leak them
    # into streamed replies); parse failures degrade to a marker string
    # rather than killing the pipeline.
    reply_text=$(echo "$reply" | python3 -c '
import json, sys, re
try:
    data = re.sub(r"[\x00-\x08\x0b-\x1f]", " ", sys.stdin.read())
    d = json.loads(data)
    parts = d.get("result",{}).get("parts",[])
    print(" ".join(p.get("text","") for p in parts if p.get("kind")=="text"))
except Exception as exc:
    print(f"(parse failed: {exc})")
' 2>&1)
    rm -f "$test_file"

    if echo "$reply_text" | grep -qF "$expected"; then
        echo "PASS $label round-trip: agent quoted $expected"
        return 0
    fi
    # Credential-missing signatures we choose to tolerate (infra, not platform)
    if echo "$reply_text" | grep -qEi "could not resolve authentication|missing api|not logged in|hermes setup|no llm provider|401|\"type\": \"server_error\""; then
        echo "SKIP $label round-trip: agent lacks credentials (reply=$(echo "$reply_text" | head -c 120)...)"
        return 0
    fi
    echo "INFO $label round-trip: agent reply did not contain expected text"
    echo "  reply: $(echo "$reply_text" | head -c 200)"
    return 0 # Don't hard-fail; the plumbing check already asserted the platform layer
}
|
||||
|
||||
# check_runtime LABEL RUNTIME WSID — full verification ladder for one
# runtime: workspace online → container found → helpers importable →
# executor patched → live round-trip. Each rung increments the global
# `fails` counter on failure; a missing WSID degrades to SKIP.
check_runtime() {
    local label="$1" runtime="$2" wsid="$3"
    if [ -z "$wsid" ]; then
        echo "SKIP $label (no workspace id)"
        return
    fi
    printf "\n======================== %s (%s) ========================\n" "$label" "$wsid"

    local status
    status=$(curl -s "$BASE/workspaces/$wsid" | python3 -c 'import json,sys;print(json.load(sys.stdin)["status"])')
    if [ "$status" != "online" ]; then
        echo "FAIL $label: workspace status=$status"
        fails=$((fails + 1))
        return
    fi

    local container
    container=$(docker ps --format '{{.Names}}' | grep -E "^ws-${wsid:0:12}" | head -1)
    if [ -z "$container" ]; then
        echo "FAIL $label: container not found"
        fails=$((fails + 1))
        return
    fi

    if ! has_patch_in_container "$container"; then
        echo "FAIL $label: platform helpers missing"
        fails=$((fails + 1))
        return
    fi
    if ! has_executor_patched "$container" "$runtime"; then
        echo "FAIL $label: executor not patched"
        fails=$((fails + 1))
        return
    fi
    round_trip "$label" "$wsid" || { fails=$((fails + 1)); return; }
}
|
||||
|
||||
# Drive every runtime; a missing WS_* env var degrades to SKIP inside
# check_runtime rather than failing the whole suite.
check_runtime "hermes" "hermes" "${WS_HERMES:-}"
check_runtime "langgraph" "langgraph" "${WS_LANGGRAPH:-}"
check_runtime "claude-code" "claude-code" "${WS_CLAUDE_CODE:-}"

printf "\n=================================================\n"
if [ $fails -eq 0 ]; then
    echo "ALL RUNTIME E2E CHECKS PASSED"
    exit 0
fi
echo "FAIL: $fails runtime check(s) failed"
exit 1
|
||||
@ -195,14 +195,35 @@ TENANT_TOKEN=$(echo "$TENANT_TOKEN_RESP" | python3 -c "import json,sys; print(js
|
||||
ok "Tenant admin token retrieved (len=${#TENANT_TOKEN})"
|
||||
|
||||
# ─── 4. Wait for tenant TLS / DNS propagation ──────────────────────────
|
||||
# Kept below the 20-min provision envelope so a genuinely-stuck tenant
|
||||
# still fails loud at the earlier provision step rather than masquerading
|
||||
# as a TLS issue. CF DNS propagation + tunnel hostname registration +
|
||||
# ACME cert + edge cache run 5-7 min on a healthy day; +5 min headroom
|
||||
# over the previous 10-min cap covers the slower path observed in #2090.
|
||||
#
|
||||
# On timeout, dump DNS + curl -v + headers so the next failure identifies
|
||||
# the broken layer (DNS / TLS / HTTP). Authorization is redacted
|
||||
# defensively in case a future caller adds an auth header to this probe.
|
||||
log "4/11 Waiting for tenant TLS / DNS propagation..."
# 15-minute ceiling: CF DNS propagation + tunnel hostname registration +
# ACME cert + edge cache run 5-7 min on a healthy day; the headroom
# covers the slower path observed in #2090.
TLS_TIMEOUT_SEC=$((15 * 60))
TLS_DEADLINE=$(( $(date +%s) + TLS_TIMEOUT_SEC ))
# Reduce the tenant URL to a bare hostname for the DNS diagnostic:
# strip scheme, then path, then port.
TENANT_HOST="${TENANT_URL#http*://}"
TENANT_HOST="${TENANT_HOST%%/*}"
TENANT_HOST="${TENANT_HOST%%:*}"
while true; do
  if curl -sSfk --max-time 5 "$TENANT_URL/health" >/dev/null 2>&1; then
    break
  fi
  if [ "$(date +%s)" -gt "$TLS_DEADLINE" ]; then
    # Diagnostics MUST print before `fail` (which aborts the script) —
    # the pre-fix version called fail first, making this burst dead
    # code. Authorization/Cookie headers are redacted defensively.
    log "── DIAGNOSTIC BURST (TLS-readiness timeout) ──"
    log "DNS lookup ($TENANT_HOST):"
    getent hosts "$TENANT_HOST" 2>&1 || log " (no DNS resolution)"
    log "curl -v $TENANT_URL/health (last 40 lines):"
    curl -kv --max-time 10 "$TENANT_URL/health" 2>&1 \
      | sed -E 's/(Authorization|Cookie):.*/\1: [redacted]/i' \
      | tail -n 40 | sed 's/^/ /' || true
    log "── END DIAGNOSTIC ──"
    fail "Tenant URL never responded 2xx on /health within ${TLS_TIMEOUT_SEC}s"
  fi
  sleep 5
done
|
||||
@ -403,6 +424,13 @@ fi
|
||||
# Known-regression signatures, checked most-specific-first before the
# generic error catch-all further down.
if echo "$AGENT_TEXT" | grep -qF "Unknown provider"; then
  fail "A2A — REGRESSION: install.sh set PROVIDER to a value not in hermes's registry. Run 'hermes doctor' on the workspace to see valid values. Raw: $AGENT_TEXT"
fi
# "Invalid API key" — the comment block lists this as a CP #238 race
# (tenant auth chain) signal but the grep was missing. Caller-side
# 401's containing this exact phrase don't match the generic
# "error|exception" catch-all below, so they'd slip through.
if echo "$AGENT_TEXT" | grep -qF "Invalid API key"; then
  fail "A2A — REGRESSION: tenant auth chain returned 'Invalid API key'. Likely CP boot-event 401 race (CP #238) or stale OPENAI_API_KEY in the runtime env. Raw: $AGENT_TEXT"
fi
|
||||
# Generic catch-all — falls through if none of the known regressions hit.
|
||||
if echo "$AGENT_TEXT" | grep -qiE "error|exception"; then
|
||||
fail "A2A returned an error-shaped response: $AGENT_TEXT"
|
||||
|
||||
190
workspace-server/cmd/server/dotenv.go
Normal file
190
workspace-server/cmd/server/dotenv.go
Normal file
@ -0,0 +1,190 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"log"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// loadDotEnvIfPresent walks upward from CWD looking for a .env file and
|
||||
// merges its KEY=VALUE pairs into the process environment. Already-set
|
||||
// vars (e.g. from `docker run -e`, CI exports, or ad-hoc `KEY=val
|
||||
// ./binary`) win over file values so operators can override without
|
||||
// editing the file.
|
||||
//
|
||||
// Why walk upward: the binary may be launched from the monorepo root,
|
||||
// the workspace-server subdir, or anywhere else the operator finds
|
||||
// convenient. Walking upward from CWD finds the canonical .env
|
||||
// (gitignored, lives at the monorepo root) regardless of cwd, so a
|
||||
// fresh `go build -o /tmp/molecule-server ./cmd/server && /tmp/molecule-server`
|
||||
// from any subdir picks up the same MOLECULE_ENV / DATABASE_URL / etc.
|
||||
// the operator already has — without sourcing or `set -a`.
|
||||
//
|
||||
// Why no godotenv dep: the format we use is simple — KEY=VALUE with
|
||||
// optional `#` comments and no interpolation — so a tiny in-tree parser
|
||||
// is auditable, has no supply-chain surface, and avoids drift across
|
||||
// repos where some teams configure godotenv differently.
|
||||
//
|
||||
// Why it's safe in production: the Dockerfile does not COPY .env into
|
||||
// the image and `.env` is gitignored, so production containers have no
|
||||
// .env on disk to load. If an operator goes out of their way to put one
|
||||
// there, the explicit-env-wins rule above means container env still
|
||||
// dominates.
|
||||
func loadDotEnvIfPresent() {
|
||||
path, ok := findDotEnv()
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
f, err := os.Open(path)
|
||||
if err != nil {
|
||||
log.Printf(".env: open %s: %v (skipping)", path, err)
|
||||
return
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
loaded := 0
|
||||
skipped := 0
|
||||
scanner := bufio.NewScanner(f)
|
||||
for scanner.Scan() {
|
||||
k, v, ok := parseDotEnvLine(scanner.Text())
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
// Existing env wins. NOTE: an explicitly-set empty string
|
||||
// (`KEY=` exported from a parent shell) counts as "set" — we
|
||||
// keep the empty value rather than backfilling from the file.
|
||||
// Matches Node's `process.env[k] !== undefined` check in the
|
||||
// canvas's next.config.ts loader so both processes treat the
|
||||
// same input identically. Operators who want the file value
|
||||
// to win must `unset KEY` in the launching shell.
|
||||
if _, exists := os.LookupEnv(k); exists {
|
||||
skipped++
|
||||
continue
|
||||
}
|
||||
if err := os.Setenv(k, v); err != nil {
|
||||
log.Printf(".env: set %s: %v", k, err)
|
||||
continue
|
||||
}
|
||||
loaded++
|
||||
}
|
||||
if err := scanner.Err(); err != nil {
|
||||
log.Printf(".env: scan %s: %v", path, err)
|
||||
}
|
||||
log.Printf(".env: %s — loaded %d, %d already set in env", path, loaded, skipped)
|
||||
}
|
||||
|
||||
// findDotEnv returns the path of the nearest .env file walking upward
|
||||
// from CWD. Capped at 6 levels so a deeply-nested launch dir doesn't
|
||||
// scan the entire filesystem.
|
||||
//
|
||||
// Sentinel gate: only accept a .env that sits next to `workspace-server/`
|
||||
// (the monorepo marker). Without it, a developer running the binary from
|
||||
// `~/Documents/other-project/` would walk up to `~/.env` and load
|
||||
// arbitrary variables — a real foot-gun on shared dev machines and a
|
||||
// possible information-leak vector on bare-metal deploys. Skipping the
|
||||
// match falls through to "no .env found" which is identical to today's
|
||||
// pre-fix behavior (the operator must export env explicitly).
|
||||
func findDotEnv() (string, bool) {
|
||||
dir, err := os.Getwd()
|
||||
if err != nil {
|
||||
return "", false
|
||||
}
|
||||
for i := 0; i < 6; i++ {
|
||||
p := filepath.Join(dir, ".env")
|
||||
if st, err := os.Stat(p); err == nil && !st.IsDir() {
|
||||
if isMonorepoRoot(dir) {
|
||||
return p, true
|
||||
}
|
||||
// .env exists here but the directory isn't the monorepo
|
||||
// root — keep walking. Loading it could clobber
|
||||
// environment with values from an unrelated project.
|
||||
}
|
||||
parent := filepath.Dir(dir)
|
||||
if parent == dir {
|
||||
break
|
||||
}
|
||||
dir = parent
|
||||
}
|
||||
return "", false
|
||||
}
|
||||
|
||||
// isMonorepoRoot returns true if `dir` looks like the molecule-core
|
||||
// monorepo root — the directory that owns the .env we want to load.
|
||||
// The marker is `workspace-server/go.mod`, which is the canonical
|
||||
// in-tree go module and exists only in this monorepo. A simple
|
||||
// `workspace-server/` directory check would false-positive on a fork
|
||||
// that renamed the dir; the go.mod check is more precise.
|
||||
func isMonorepoRoot(dir string) bool {
|
||||
st, err := os.Stat(filepath.Join(dir, "workspace-server", "go.mod"))
|
||||
return err == nil && !st.IsDir()
|
||||
}
|
||||
|
||||
// parseDotEnvLine parses one .env line into (key, value, true), or
// returns ok=false for blanks, full-line comments, and malformed
// input. Supported forms:
//   - a leading `export ` prefix (literal space) is stripped, so
//     shell-friendly files written for `source .env` or direnv work
//     without modification
//   - a UTF-8 BOM (Windows editors) is stripped before parsing
//   - after a bare value, `#` starts an inline comment iff it begins
//     the value or follows whitespace
//   - one matched pair of surrounding `"` or `'` quotes is stripped
//     from the value; inside quotes, `#` is literal, never a comment
func parseDotEnvLine(line string) (string, string, bool) {
	// bufio.Scanner does not filter a UTF-8 BOM, so the first line of
	// a Windows-edited .env would otherwise yield a key prefixed with
	// U+FEFF that os.Setenv silently accepts.
	line = strings.TrimSpace(strings.TrimPrefix(line, "\ufeff"))
	if line == "" || strings.HasPrefix(line, "#") {
		return "", "", false
	}
	// `export FOO=bar` → `FOO=bar`. Only a literal `export ` (space,
	// never tab) is recognized, matching the TS mirror in
	// canvas/next.config.ts — shells emit a space here; tabs appear
	// only in hand-mangled files and are intentionally rejected.
	line = strings.TrimLeft(strings.TrimPrefix(line, "export "), " \t")

	key, rawVal, found := strings.Cut(line, "=")
	if !found || key == "" {
		return "", "", false
	}
	key = strings.TrimSpace(key)
	// Trim leading whitespace so a quoted value's opening quote lands
	// at index 0, and so `KEY=   # comment` is classified as an empty
	// value followed by a comment rather than a value of "# comment".
	val := strings.TrimLeft(rawVal, " \t")

	// Quoted value: take the contents between one matched pair of
	// quotes verbatim — no inline-comment splitting inside quotes, so
	// `KEY="value # not a comment"` keeps its `#`. An unterminated
	// quote falls through to bare-value handling, where the opening
	// quote is just a literal character.
	if len(val) >= 2 && (val[0] == '"' || val[0] == '\'') {
		if end := strings.IndexByte(val[1:], val[0]); end >= 0 {
			return key, val[1 : 1+end], true
		}
	}
	// Bare value: a `#` is a comment marker iff it starts the trimmed
	// value or is preceded by whitespace; `token#fragment` keeps the
	// `#` because the preceding byte is part of the value.
	for i := 0; i < len(val); i++ {
		if val[i] != '#' {
			continue
		}
		if i == 0 || val[i-1] == ' ' || val[i-1] == '\t' {
			val = val[:i]
			break
		}
	}
	return key, strings.TrimSpace(val), true
}
|
||||
211
workspace-server/cmd/server/dotenv_test.go
Normal file
211
workspace-server/cmd/server/dotenv_test.go
Normal file
@ -0,0 +1,211 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// TestParseDotEnvLine is a table-driven sweep of the .env line grammar;
// each case's `comment` doubles as the subtest name.
func TestParseDotEnvLine(t *testing.T) {
	cases := []struct {
		in      string
		k, v    string
		ok      bool
		comment string
	}{
		{in: "", ok: false, comment: "empty line"},
		{in: "   ", ok: false, comment: "whitespace-only"},
		{in: "# top-level comment", ok: false, comment: "full-line comment"},
		{in: "  # indented comment", ok: false, comment: "indented full-line comment"},
		{in: "FOO", ok: false, comment: "no equals"},
		{in: "=BAR", ok: false, comment: "missing key"},

		{in: "FOO=bar", k: "FOO", v: "bar", ok: true, comment: "plain"},
		{in: "  FOO=bar", k: "FOO", v: "bar", ok: true, comment: "leading whitespace"},
		{in: "FOO=bar  ", k: "FOO", v: "bar", ok: true, comment: "trailing whitespace stripped"},
		{in: "FOO =bar", k: "FOO", v: "bar", ok: true, comment: "whitespace before equals"},

		{in: "FOO=bar # comment", k: "FOO", v: "bar", ok: true, comment: "inline space-hash comment"},
		{in: "FOO=bar\t# comment", k: "FOO", v: "bar", ok: true, comment: "inline tab-hash comment"},
		{in: "FOO=bar   # lots of spaces", k: "FOO", v: "bar", ok: true, comment: "multiple spaces before hash"},

		{in: "FOO=bar#nocomment", k: "FOO", v: "bar#nocomment", ok: true, comment: "bare hash inside value preserved"},
		{in: "URL=postgres://u:p@h:5432/db?sslmode=disable", k: "URL", v: "postgres://u:p@h:5432/db?sslmode=disable", ok: true, comment: "url with embedded equals"},
		{in: "TOKEN=eyJhbGciOiJIUzI1NiJ9.payload.sig=", k: "TOKEN", v: "eyJhbGciOiJIUzI1NiJ9.payload.sig=", ok: true, comment: "base64 padding preserved"},

		{in: "FOO=", k: "FOO", v: "", ok: true, comment: "empty value"},
		{in: "ADMIN_TOKEN=", k: "ADMIN_TOKEN", v: "", ok: true, comment: "empty value (production gate sentinel)"},

		// Regression: the repo's own .env contains lines like
		// `CONFIGS_DIR=       # Path to ...` where the value
		// is empty + an inline comment. Pre-fix parser stripped leading
		// whitespace BEFORE detecting the comment, leaving `#` at v[0]
		// with nothing preceding it, so the inline-comment check missed
		// it and the comment text was returned as the value. Server
		// then tried to use the comment as a directory path and template
		// loading silently failed (GET /templates returned []).
		{in: "CONFIGS_DIR=       # Path to /var/foo (auto-discovered if empty)", k: "CONFIGS_DIR", v: "", ok: true, comment: "empty value with leading whitespace + inline comment"},
		{in: "FOO=   # comment", k: "FOO", v: "", ok: true, comment: "spaces-only value with inline comment"},
		{in: "FOO=\t# comment", k: "FOO", v: "", ok: true, comment: "tab-only value with inline comment"},

		// `export` prefix: shell-friendly .env files (direnv, .envrc-style)
		// — the prefix must be stripped, NOT folded into the key.
		{in: "export FOO=bar", k: "FOO", v: "bar", ok: true, comment: "export prefix stripped"},
		{in: "  export FOO=bar", k: "FOO", v: "bar", ok: true, comment: "leading whitespace + export"},
		{in: "export DATABASE_URL=postgres://u:p@h/db", k: "DATABASE_URL", v: "postgres://u:p@h/db", ok: true, comment: "export with URL value"},

		// Quoted values: one matched pair of surrounding quotes is
		// stripped; embedded `#` survives because it isn't an inline
		// comment inside a quote.
		{in: `FOO="hello world"`, k: "FOO", v: "hello world", ok: true, comment: "double-quoted value"},
		{in: `FOO='hello world'`, k: "FOO", v: "hello world", ok: true, comment: "single-quoted value"},
		{in: `FOO="value # not a comment"`, k: "FOO", v: "value # not a comment", ok: true, comment: "hash inside quotes is part of value"},
		{in: `FOO= "padded"`, k: "FOO", v: "padded", ok: true, comment: "whitespace before opening quote"},
		{in: `FOO="unterminated`, k: "FOO", v: `"unterminated`, ok: true, comment: "unterminated quote stays as bare value"},

		// CRLF endings: bufio.Scanner strips \n; \r is left and stripped
		// by the value-side TrimSpace. Locking this in so a future
		// refactor doesn't accidentally feed \r into os.Setenv.
		{in: "FOO=bar\r", k: "FOO", v: "bar", ok: true, comment: "CRLF trailing carriage return stripped"},

		// UTF-8 BOM at file start: a Windows-edited .env begins with
		// \xEF\xBB\xBF; without explicit stripping the first key would
		// be "\ufeffFOO".
		{in: "\ufeffFOO=bar", k: "FOO", v: "bar", ok: true, comment: "UTF-8 BOM stripped"},
	}

	for _, tc := range cases {
		t.Run(tc.comment, func(t *testing.T) {
			k, v, ok := parseDotEnvLine(tc.in)
			if ok != tc.ok {
				t.Fatalf("ok = %v, want %v (input=%q)", ok, tc.ok, tc.in)
			}
			// Rejected lines carry no key/value contract — stop here.
			if !tc.ok {
				return
			}
			if k != tc.k || v != tc.v {
				t.Fatalf("got (%q, %q), want (%q, %q)", k, v, tc.k, tc.v)
			}
		})
	}
}
|
||||
|
||||
// makeFakeMonorepo builds a temp directory that passes isMonorepoRoot()
// — it contains workspace-server/go.mod — and writes a .env file with
// the given body at its top level. The caller is expected to chdir into
// the returned directory before exercising the loader.
func makeFakeMonorepo(t *testing.T, envBody string) string {
	t.Helper()
	root := t.TempDir()
	ws := filepath.Join(root, "workspace-server")
	if err := os.MkdirAll(ws, 0o755); err != nil {
		t.Fatal(err)
	}
	// The go.mod marker is what isMonorepoRoot keys on.
	if err := os.WriteFile(filepath.Join(ws, "go.mod"), []byte("module fake\n"), 0o644); err != nil {
		t.Fatal(err)
	}
	if err := os.WriteFile(filepath.Join(root, ".env"), []byte(envBody), 0o644); err != nil {
		t.Fatalf("write .env: %v", err)
	}
	return root
}
|
||||
|
||||
func TestLoadDotEnvIfPresent_PreservesExisting(t *testing.T) {
|
||||
dir := makeFakeMonorepo(t, "DOTENV_TEST_NEW=from_file\nDOTENV_TEST_EXISTING=from_file\n")
|
||||
|
||||
// Pre-set one of the keys — file value must NOT clobber it.
|
||||
t.Setenv("DOTENV_TEST_EXISTING", "from_real_env")
|
||||
// Ensure the other key starts unset.
|
||||
os.Unsetenv("DOTENV_TEST_NEW")
|
||||
t.Cleanup(func() { os.Unsetenv("DOTENV_TEST_NEW") })
|
||||
|
||||
// Run from the temp dir so findDotEnv picks our fixture.
|
||||
prev, err := os.Getwd()
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.Chdir(dir); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
t.Cleanup(func() { _ = os.Chdir(prev) })
|
||||
|
||||
loadDotEnvIfPresent()
|
||||
|
||||
if got := os.Getenv("DOTENV_TEST_NEW"); got != "from_file" {
|
||||
t.Errorf("DOTENV_TEST_NEW = %q, want %q", got, "from_file")
|
||||
}
|
||||
if got := os.Getenv("DOTENV_TEST_EXISTING"); got != "from_real_env" {
|
||||
t.Errorf("existing env clobbered: got %q, want %q", got, "from_real_env")
|
||||
}
|
||||
}
|
||||
|
||||
func TestLoadDotEnvIfPresent_NoFile_NoOp(t *testing.T) {
|
||||
dir := t.TempDir() // empty — no .env at this level
|
||||
prev, err := os.Getwd()
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.Chdir(dir); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
t.Cleanup(func() { _ = os.Chdir(prev) })
|
||||
|
||||
// Should not panic, log loud errors, or set anything. Best-effort
|
||||
// silent miss is the contract.
|
||||
loadDotEnvIfPresent()
|
||||
}
|
||||
|
||||
func TestFindDotEnv_WalksUpward(t *testing.T) {
|
||||
root := makeFakeMonorepo(t, "X=1\n")
|
||||
nested := filepath.Join(root, "a", "b", "c")
|
||||
if err := os.MkdirAll(nested, 0o755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
prev, err := os.Getwd()
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.Chdir(nested); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
t.Cleanup(func() { _ = os.Chdir(prev) })
|
||||
|
||||
got, ok := findDotEnv()
|
||||
if !ok {
|
||||
t.Fatal("expected to find .env walking upward")
|
||||
}
|
||||
want := filepath.Join(root, ".env")
|
||||
// macOS resolves /var → /private/var on TempDir, so compare via
|
||||
// EvalSymlinks for both sides to dodge that.
|
||||
gotR, _ := filepath.EvalSymlinks(got)
|
||||
wantR, _ := filepath.EvalSymlinks(want)
|
||||
if gotR != wantR {
|
||||
t.Errorf("findDotEnv() = %q, want %q", got, want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFindDotEnv_RejectsUnrelatedDotEnv(t *testing.T) {
|
||||
// Simulates a developer running the binary from inside an
|
||||
// unrelated project tree that happens to have its own .env (or
|
||||
// from $HOME with a personal ~/.env). Without the monorepo
|
||||
// sentinel, findDotEnv would happily load it and clobber env
|
||||
// with arbitrary values — a real foot-gun this regression test
|
||||
// guards against.
|
||||
dir := t.TempDir()
|
||||
if err := os.WriteFile(filepath.Join(dir, ".env"), []byte("LEAKY=value\n"), 0o644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
prev, err := os.Getwd()
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.Chdir(dir); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
t.Cleanup(func() { _ = os.Chdir(prev) })
|
||||
|
||||
if got, ok := findDotEnv(); ok {
|
||||
t.Errorf("findDotEnv() = %q, ok=true; want ok=false (no workspace-server sibling)", got)
|
||||
}
|
||||
}
|
||||
@ -33,6 +33,14 @@ import (
|
||||
)
|
||||
|
||||
func main() {
|
||||
// .env auto-load: in dev, the operator keeps MOLECULE_ENV /
|
||||
// DATABASE_URL / etc. in the monorepo's .env file. Loading it here
|
||||
// — before any code reads env — means a fresh `/tmp/molecule-server`
|
||||
// run picks up dev config without `set -a && source .env`. No-op
|
||||
// in production (Docker image doesn't ship a .env, and existing env
|
||||
// always wins over file values, so container env stays dominant).
|
||||
loadDotEnvIfPresent()
|
||||
|
||||
// CP self-refresh: pull any operator-rotated config (e.g. a new
|
||||
// MOLECULE_CP_SHARED_SECRET) before any other code reads env.
|
||||
// Best-effort — if the CP is unreachable we keep booting with the
|
||||
@ -221,6 +229,18 @@ func main() {
|
||||
})
|
||||
}
|
||||
|
||||
// Orphan-container reconcile sweep — finds running containers
|
||||
// whose workspace row is already status='removed' and stops
|
||||
// them. Defence in depth on top of the inline cleanup in
|
||||
// handlers/workspace_crud.go: any Docker hiccup that left a
|
||||
// container alive after the user clicked delete heals on the
|
||||
// next sweep instead of leaking forever.
|
||||
if prov != nil {
|
||||
go supervised.RunWithRecover(ctx, "orphan-sweeper", func(c context.Context) {
|
||||
registry.StartOrphanSweeper(c, prov)
|
||||
})
|
||||
}
|
||||
|
||||
// Provision-timeout sweep — flips workspaces that have been stuck in
|
||||
// status='provisioning' past the timeout window to 'failed' and emits
|
||||
// WORKSPACE_PROVISION_TIMEOUT. Without this the UI banner is cosmetic
|
||||
|
||||
@ -15,6 +15,29 @@ import (
|
||||
|
||||
const broadcastChannel = "events:broadcast"
|
||||
|
||||
// EventEmitter is the contract handler code needs from a broadcaster.
// Defining it here (at the consumer) lets tests substitute a
// capture-only stub instead of standing up the full Redis + WebSocket
// hub topology that the concrete *Broadcaster builds (and that
// previously blocked TestProvisionWorkspace_* regression tests on
// issue #1814).
//
// Includes BroadcastOnly because the activity-log + A2A-response paths
// inside the handler package fan out via that method — narrowing
// further would force production callers back to the concrete type.
//
// *Broadcaster satisfies this interface trivially. Production code that
// needs the wider surface (SubscribeSSE, Subscribe) keeps using the
// concrete *Broadcaster type — sse.go + cmd/server/main.go are the
// only such call sites today.
type EventEmitter interface {
	// RecordAndBroadcast persists the event and fans it out to
	// subscribers; the error reflects the persistence step.
	RecordAndBroadcast(ctx context.Context, eventType string, workspaceID string, payload interface{}) error
	// BroadcastOnly fans out without persisting; fire-and-forget.
	BroadcastOnly(workspaceID string, eventType string, payload interface{})
}

// Compile-time assertion: a renamed/reshaped Broadcaster method that
// silently broke this interface would fail to build here.
var _ EventEmitter = (*Broadcaster)(nil)
|
||||
|
||||
// sseSubscription is a single in-process SSE subscriber.
|
||||
// deliverToSSE writes to ch; StreamEvents reads from it.
|
||||
type sseSubscription struct {
|
||||
|
||||
@ -20,6 +20,7 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/events"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/provisioner"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/registry"
|
||||
"github.com/gin-gonic/gin"
|
||||
@ -120,18 +121,26 @@ func isUpstreamBusyError(err error) bool {
|
||||
if err == nil {
|
||||
return false
|
||||
}
|
||||
// Typed sentinels propagate cleanly through *url.Error.Unwrap
|
||||
// since Go 1.13, so errors.Is is the primary check for both
|
||||
// DeadlineExceeded and Canceled. The substring fallbacks below
|
||||
// stay only for shapes net/http does NOT type — bare "EOF" /
|
||||
// "connection reset" can arrive as plain *net.OpError with no
|
||||
// errors.Is hook to the stdlib sentinels.
|
||||
if errors.Is(err, context.DeadlineExceeded) {
|
||||
return true
|
||||
}
|
||||
// applyIdleTimeout uses context.WithCancel; surfaces here as
|
||||
// Canceled, distinct from DeadlineExceeded but the same "upstream
|
||||
// busy" class — caller produces a 503 + Retry-After.
|
||||
if errors.Is(err, context.Canceled) {
|
||||
return true
|
||||
}
|
||||
if errors.Is(err, io.EOF) || errors.Is(err, io.ErrUnexpectedEOF) {
|
||||
return true
|
||||
}
|
||||
// url.Error wraps "read tcp … EOF" and "Post …: context deadline
|
||||
// exceeded" strings from the stdlib HTTP client without typing the
|
||||
// inner cause. Fall back to substring match for those.
|
||||
msg := err.Error()
|
||||
return strings.Contains(msg, "context deadline exceeded") ||
|
||||
strings.Contains(msg, "EOF") ||
|
||||
return strings.Contains(msg, "EOF") ||
|
||||
strings.Contains(msg, "connection reset")
|
||||
}
|
||||
|
||||
@ -286,7 +295,7 @@ func (h *WorkspaceHandler) proxyA2ARequest(ctx context.Context, workspaceID stri
|
||||
body = normalizedBody
|
||||
|
||||
startTime := time.Now()
|
||||
resp, cancelFwd, err := h.dispatchA2A(ctx, agentURL, body, callerID)
|
||||
resp, cancelFwd, err := h.dispatchA2A(ctx, workspaceID, agentURL, body, callerID)
|
||||
if cancelFwd != nil {
|
||||
defer cancelFwd()
|
||||
}
|
||||
@ -478,25 +487,80 @@ func normalizeA2APayload(body []byte) ([]byte, string, *proxyA2AError) {
|
||||
return marshaledBody, a2aMethod, nil
|
||||
}
|
||||
|
||||
// idleTimeoutDuration is the per-dispatch silence window: if the
|
||||
// platform's broadcaster emits no events for this workspace for the
|
||||
// full duration, the dispatch ctx is cancelled. Resets on every
|
||||
// ACTIVITY_LOGGED / TASK_UPDATED / A2A_RESPONSE event for the
|
||||
// workspace, so a chat that's actively reporting tool calls or
|
||||
// streaming status updates never trips it. Picked to be longer than
|
||||
// any reasonable single-tool-use cadence (Claude Code's slowest
|
||||
// observed silence between tools is ~30s) but short enough that a
|
||||
// truly wedged runtime fails in 1 minute, not 5.
|
||||
const idleTimeoutDuration = 60 * time.Second
|
||||
|
||||
// dispatchA2A POSTs `body` to `agentURL`. Uses WithoutCancel so delegation
|
||||
// chains survive client disconnect (browser tab close). Default timeouts:
|
||||
// canvas (callerID == "") = 5 min, agent-to-agent = 30 min. Callers can
|
||||
// override via the X-Timeout header (applied to ctx upstream in ProxyA2A).
|
||||
func (h *WorkspaceHandler) dispatchA2A(ctx context.Context, agentURL string, body []byte, callerID string) (*http.Response, context.CancelFunc, error) {
|
||||
// chains survive client disconnect (browser tab close). Two layers of
|
||||
// timeout per dispatch:
|
||||
//
|
||||
// - Idle timeout (always applied): cancels the dispatch when no
|
||||
// broadcaster events for the workspace fire for
|
||||
// idleTimeoutDuration. Any progress event resets the clock — so
|
||||
// a long but actively-streaming reply runs forever, while a
|
||||
// wedged runtime fails fast.
|
||||
// - Absolute ceiling (agent-to-agent only): 30 min cap as a
|
||||
// defence against runaway delegation loops. Canvas dispatches
|
||||
// have no absolute ceiling — the user can wait as long as they
|
||||
// want, the idle timer is the only hangup signal.
|
||||
//
|
||||
// Either layer is overridable by the X-Timeout header upstream in
|
||||
// ProxyA2A; X-Timeout: 0 explicitly disables the absolute ceiling.
|
||||
func (h *WorkspaceHandler) dispatchA2A(ctx context.Context, workspaceID, agentURL string, body []byte, callerID string) (*http.Response, context.CancelFunc, error) {
|
||||
// #1483 SSRF defense-in-depth: the primary call path through
|
||||
// proxyA2ARequest → resolveAgentURL already validates via isSafeURL
|
||||
// (a2a_proxy.go:424), but adding the check here closes the gap for
|
||||
// any future code path that calls dispatchA2A directly without
|
||||
// going through resolveAgentURL. Wrapping as proxyDispatchBuildError
|
||||
// keeps the caller's error-classification path unchanged — the same
|
||||
// shape it already produces a 500 for.
|
||||
if err := isSafeURL(agentURL); err != nil {
|
||||
return nil, nil, &proxyDispatchBuildError{err: err}
|
||||
}
|
||||
forwardCtx := context.WithoutCancel(ctx)
|
||||
var cancel context.CancelFunc
|
||||
var ceilingCancel context.CancelFunc
|
||||
if _, hasDeadline := ctx.Deadline(); !hasDeadline {
|
||||
if callerID == "" {
|
||||
forwardCtx, cancel = context.WithTimeout(forwardCtx, 5*time.Minute)
|
||||
} else {
|
||||
forwardCtx, cancel = context.WithTimeout(forwardCtx, 30*time.Minute)
|
||||
if callerID != "" {
|
||||
forwardCtx, ceilingCancel = context.WithTimeout(forwardCtx, 30*time.Minute)
|
||||
}
|
||||
// callerID == "" (canvas): no absolute ceiling. The idle
|
||||
// timeout below is the only deadline.
|
||||
}
|
||||
// Idle timeout — cancels the dispatch ctx after
|
||||
// idleTimeoutDuration of broadcaster silence for this workspace.
|
||||
// Always applied (canvas + agent-to-agent both benefit; the
|
||||
// ceiling above is a separate runaway-loop cap that only fires
|
||||
// for agent traffic). Combines with the ceiling cancel into a
|
||||
// single returned cancel func that the caller defers.
|
||||
// applyIdleTimeout needs SubscribeSSE which only lives on the
|
||||
// concrete *Broadcaster, not on the EventEmitter interface the
|
||||
// handler now stores. Type-assert + fall through to a no-op idle
|
||||
// timer if the broadcaster doesn't support subscriptions (the
|
||||
// EventEmitter mock used by some tests, e.g.). Production wires
|
||||
// the concrete *Broadcaster, so the assertion always succeeds in
|
||||
// real deploys.
|
||||
var b *events.Broadcaster
|
||||
if concrete, ok := h.broadcaster.(*events.Broadcaster); ok {
|
||||
b = concrete
|
||||
}
|
||||
forwardCtx, idleCancel := applyIdleTimeout(forwardCtx, b, workspaceID, idleTimeoutDuration)
|
||||
cancel := func() {
|
||||
idleCancel()
|
||||
if ceilingCancel != nil {
|
||||
ceilingCancel()
|
||||
}
|
||||
}
|
||||
req, err := http.NewRequestWithContext(forwardCtx, "POST", agentURL, bytes.NewReader(body))
|
||||
if err != nil {
|
||||
if cancel != nil {
|
||||
cancel()
|
||||
}
|
||||
cancel()
|
||||
// Wrap the construction failure so the caller can distinguish it
|
||||
// from an upstream Do() error and produce the correct 500 response.
|
||||
return nil, nil, &proxyDispatchBuildError{err: err}
|
||||
@ -505,3 +569,52 @@ func (h *WorkspaceHandler) dispatchA2A(ctx context.Context, agentURL string, bod
|
||||
resp, doErr := a2aClient.Do(req)
|
||||
return resp, cancel, doErr
|
||||
}
|
||||
|
||||
// applyIdleTimeout returns a child ctx that gets cancelled when no
|
||||
// broadcaster events for `workspaceID` arrive for `idle` duration.
|
||||
// Any incoming event resets the clock. The returned cancel func
|
||||
// MUST be called to clean up the goroutine + subscription.
|
||||
//
|
||||
// nil broadcaster or non-positive idle returns the parent ctx
|
||||
// unchanged (and a no-op cancel) so test paths that don't wire a
|
||||
// broadcaster keep working.
|
||||
func applyIdleTimeout(parent context.Context, b *events.Broadcaster, workspaceID string, idle time.Duration) (context.Context, context.CancelFunc) {
|
||||
if b == nil || idle <= 0 || workspaceID == "" {
|
||||
return parent, func() {}
|
||||
}
|
||||
ctx, cancel := context.WithCancel(parent)
|
||||
sub, unsub := b.SubscribeSSE(workspaceID)
|
||||
go func() {
|
||||
defer unsub()
|
||||
timer := time.NewTimer(idle)
|
||||
defer timer.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case _, ok := <-sub:
|
||||
if !ok {
|
||||
// Subscription channel closed — fall back to
|
||||
// pure-timer mode. Don't cancel: another caller
|
||||
// may have closed our sub but the request itself
|
||||
// is still in flight. Let the timer or the
|
||||
// caller's defer drive cleanup.
|
||||
continue
|
||||
}
|
||||
// Stop+drain pattern so a fired-but-unread timer
|
||||
// doesn't double-cancel after the Reset.
|
||||
if !timer.Stop() {
|
||||
select {
|
||||
case <-timer.C:
|
||||
default:
|
||||
}
|
||||
}
|
||||
timer.Reset(idle)
|
||||
case <-timer.C:
|
||||
cancel()
|
||||
return
|
||||
}
|
||||
}
|
||||
}()
|
||||
return ctx, cancel
|
||||
}
|
||||
|
||||
@ -5,6 +5,7 @@ import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
@ -600,9 +601,21 @@ func TestIsUpstreamBusyError(t *testing.T) {
|
||||
}{
|
||||
{"nil", nil, false},
|
||||
{"context.DeadlineExceeded", context.DeadlineExceeded, true},
|
||||
// applyIdleTimeout cancels its child ctx via context.WithCancel
|
||||
// when the broadcaster silence window elapses — surfaces here
|
||||
// as context.Canceled. Same "upstream busy" classification.
|
||||
{"context.Canceled", context.Canceled, true},
|
||||
{"wrapped context.Canceled", fmt.Errorf("dispatch wrapped: %w", context.Canceled), true},
|
||||
{"io.EOF", io.EOF, true},
|
||||
{"io.ErrUnexpectedEOF", io.ErrUnexpectedEOF, true},
|
||||
{"wrapped context deadline string", fmt.Errorf(`Post "http://ws-foo:8000": context deadline exceeded`), true},
|
||||
// Real net/http wraps context.DeadlineExceeded via *url.Error.Unwrap,
|
||||
// so errors.Is(err, context.DeadlineExceeded) catches it. The
|
||||
// pre-892de784 substring "context deadline exceeded" fallback
|
||||
// also accepted a string-only error like
|
||||
// `fmt.Errorf("Post: context deadline exceeded")`; that fallback
|
||||
// was dropped because errors.Is handles the real shape and the
|
||||
// substring was indistinguishable from a user-content match.
|
||||
{"wrapped context deadline (errors.Is path)", fmt.Errorf("Post: %w", context.DeadlineExceeded), true},
|
||||
{"wrapped EOF string", fmt.Errorf(`Post "http://ws-foo:8000": EOF`), true},
|
||||
{"connection reset", fmt.Errorf("read tcp 127.0.0.1:8080->127.0.0.1:12345: connection reset by peer"), true},
|
||||
{"generic dns error", fmt.Errorf("no such host"), false},
|
||||
@ -1074,7 +1087,7 @@ func TestDispatchA2A_BuildRequestError(t *testing.T) {
|
||||
handler := NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir())
|
||||
|
||||
// Malformed URL causes http.NewRequestWithContext to fail.
|
||||
_, cancel, err := handler.dispatchA2A(context.Background(), "http://%%badhost", []byte("{}"), "")
|
||||
_, cancel, err := handler.dispatchA2A(context.Background(), "ws-target", "http://%%badhost", []byte("{}"), "")
|
||||
if cancel != nil {
|
||||
cancel()
|
||||
}
|
||||
@ -1097,13 +1110,13 @@ func TestDispatchA2A_CanvasTimeout(t *testing.T) {
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
resp, cancel, err := handler.dispatchA2A(context.Background(), srv.URL, []byte(`{}`), "")
|
||||
resp, cancel, err := handler.dispatchA2A(context.Background(), "ws-target", srv.URL, []byte(`{}`), "")
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if cancel == nil {
|
||||
t.Fatal("canvas caller (empty callerID) must set a timeout + return cancel")
|
||||
t.Fatal("canvas caller must return a cancel func (idle-timeout cleanup)")
|
||||
}
|
||||
cancel() // restore
|
||||
}
|
||||
@ -1118,20 +1131,23 @@ func TestDispatchA2A_AgentTimeout(t *testing.T) {
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
resp, cancel, err := handler.dispatchA2A(context.Background(), srv.URL, []byte(`{}`), "ws-caller")
|
||||
resp, cancel, err := handler.dispatchA2A(context.Background(), "ws-target", srv.URL, []byte(`{}`), "ws-caller")
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if cancel == nil {
|
||||
t.Fatal("agent-to-agent caller must set a timeout + return cancel")
|
||||
t.Fatal("agent-to-agent caller must return a cancel func (idle + ceiling cleanup)")
|
||||
}
|
||||
cancel()
|
||||
}
|
||||
|
||||
func TestDispatchA2A_ContextDeadline_NoCancelAdded(t *testing.T) {
|
||||
// When ctx already has a deadline, dispatchA2A must NOT layer its own
|
||||
// timeout (cancel should be nil).
|
||||
func TestDispatchA2A_ContextDeadline_NoExtraCeiling(t *testing.T) {
|
||||
// When ctx already has a deadline, dispatchA2A must not layer
|
||||
// its own absolute ceiling on top — the caller's deadline wins.
|
||||
// The idle-timer cleanup still produces a non-nil cancel func
|
||||
// (introduced by the always-on idle timeout) but the cancel func
|
||||
// is safe to call repeatedly and from a deferred path.
|
||||
setupTestDB(t)
|
||||
setupTestRedis(t)
|
||||
handler := NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir())
|
||||
@ -1144,17 +1160,134 @@ func TestDispatchA2A_ContextDeadline_NoCancelAdded(t *testing.T) {
|
||||
ctx, ctxCancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
defer ctxCancel()
|
||||
|
||||
resp, cancel, err := handler.dispatchA2A(ctx, srv.URL, []byte(`{}`), "")
|
||||
resp, cancel, err := handler.dispatchA2A(ctx, "ws-target", srv.URL, []byte(`{}`), "")
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if cancel != nil {
|
||||
t.Error("cancel should be nil when ctx already has a deadline")
|
||||
cancel()
|
||||
if cancel == nil {
|
||||
t.Error("cancel must be non-nil (idle-timer cleanup)")
|
||||
}
|
||||
}
|
||||
|
||||
// --- applyIdleTimeout ---
|
||||
|
||||
// TestApplyIdleTimeout_FiresOnSilence verifies the helper cancels its
|
||||
// child ctx when no broadcaster events arrive for `idle` duration.
|
||||
// Uses a short idle window (60ms) so the test runs fast.
|
||||
func TestApplyIdleTimeout_FiresOnSilence(t *testing.T) {
|
||||
setupTestDB(t)
|
||||
setupTestRedis(t)
|
||||
b := newTestBroadcaster()
|
||||
|
||||
parent, parentCancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
defer parentCancel()
|
||||
|
||||
idleCtx, idleCancel := applyIdleTimeout(parent, b, "ws-silent", 60*time.Millisecond)
|
||||
defer idleCancel()
|
||||
|
||||
select {
|
||||
case <-idleCtx.Done():
|
||||
// expected — no events ever arrived for ws-silent
|
||||
case <-time.After(2 * time.Second):
|
||||
t.Fatal("idleCtx never cancelled despite no events")
|
||||
}
|
||||
if !errors.Is(idleCtx.Err(), context.Canceled) {
|
||||
t.Errorf("idleCtx err = %v, want context.Canceled", idleCtx.Err())
|
||||
}
|
||||
}
|
||||
|
||||
// TestApplyIdleTimeout_ResetsOnEvent verifies that a broadcaster event
|
||||
// for the workspace resets the timer. Sends one event mid-window and
|
||||
// confirms ctx is still alive after the original deadline would have
|
||||
// fired, but cancelled after a second silence window elapses.
|
||||
func TestApplyIdleTimeout_ResetsOnEvent(t *testing.T) {
|
||||
setupTestDB(t)
|
||||
setupTestRedis(t)
|
||||
b := newTestBroadcaster()
|
||||
|
||||
parent, parentCancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
defer parentCancel()
|
||||
|
||||
idle := 80 * time.Millisecond
|
||||
idleCtx, idleCancel := applyIdleTimeout(parent, b, "ws-active", idle)
|
||||
defer idleCancel()
|
||||
|
||||
// Send a progress event halfway through the window — should
|
||||
// extend the deadline by another `idle`.
|
||||
time.Sleep(idle / 2)
|
||||
b.BroadcastOnly("ws-active", "ACTIVITY_LOGGED", map[string]interface{}{"activity_type": "agent_log"})
|
||||
|
||||
// At t = idle (original deadline), ctx must still be alive
|
||||
// because the event reset the clock.
|
||||
select {
|
||||
case <-idleCtx.Done():
|
||||
t.Fatal("idleCtx cancelled despite mid-window event resetting the timer")
|
||||
case <-time.After(idle - (idle / 2) + 10*time.Millisecond):
|
||||
// ok — past the original deadline, still alive
|
||||
}
|
||||
|
||||
// Now wait for the second silence window to actually fire.
|
||||
select {
|
||||
case <-idleCtx.Done():
|
||||
// expected
|
||||
case <-time.After(idle + 200*time.Millisecond):
|
||||
t.Fatal("idleCtx never cancelled after the second silence window")
|
||||
}
|
||||
}
|
||||
|
||||
// TestApplyIdleTimeout_NilBroadcasterDegradesGracefully — nil
|
||||
// broadcaster (some test paths) returns the parent ctx unchanged.
|
||||
func TestApplyIdleTimeout_NilBroadcasterDegradesGracefully(t *testing.T) {
|
||||
parent := context.Background()
|
||||
idleCtx, cancel := applyIdleTimeout(parent, nil, "ws-x", 50*time.Millisecond)
|
||||
defer cancel()
|
||||
if idleCtx != parent {
|
||||
t.Error("nil broadcaster must return the parent ctx unchanged")
|
||||
}
|
||||
// And calling cancel must be safe.
|
||||
cancel()
|
||||
}
|
||||
|
||||
// TestDispatchA2A_RejectsUnsafeURL is the #1483 defense-in-depth
|
||||
// regression. setupTestDB disables SSRF for normal tests so existing
|
||||
// dispatchA2A unit tests can hit httptest.NewServer (loopback) — we
|
||||
// re-enable it here to verify the new in-function isSafeURL guard.
|
||||
// Production callers go through resolveAgentURL which already
|
||||
// validates; this test pins that dispatchA2A is now safe even when
|
||||
// called directly by a future caller that skips resolveAgentURL.
|
||||
//
|
||||
// Note: dispatchA2A's signature includes workspaceID (added by the
|
||||
// idle-timeout work) so this test passes a stub value — the SSRF check
|
||||
// fires before workspaceID is referenced.
|
||||
func TestDispatchA2A_RejectsUnsafeURL(t *testing.T) {
|
||||
setupTestDB(t)
|
||||
setupTestRedis(t)
|
||||
restoreSSRF := setSSRFCheckForTest(true)
|
||||
t.Cleanup(restoreSSRF)
|
||||
handler := NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir())
|
||||
|
||||
// Cloud metadata IP — must be rejected before any HTTP call goes out.
|
||||
_, cancel, err := handler.dispatchA2A(
|
||||
context.Background(),
|
||||
"ws-target",
|
||||
"http://169.254.169.254/latest/meta-data/",
|
||||
[]byte(`{}`),
|
||||
"",
|
||||
)
|
||||
if cancel != nil {
|
||||
cancel()
|
||||
t.Error("cancel must be nil when the URL is rejected pre-request")
|
||||
}
|
||||
if err == nil {
|
||||
t.Fatal("expected SSRF rejection error, got nil")
|
||||
}
|
||||
if _, ok := err.(*proxyDispatchBuildError); !ok {
|
||||
t.Errorf("expected *proxyDispatchBuildError (caller maps to 500), got %T: %v", err, err)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// --- handleA2ADispatchError ---
|
||||
|
||||
func TestHandleA2ADispatchError_ContextDeadline(t *testing.T) {
|
||||
|
||||
@ -288,7 +288,7 @@ func (h *WorkspaceHandler) DrainQueueForWorkspace(ctx context.Context, workspace
|
||||
}
|
||||
// logActivity=false: the original EnqueueA2A callsite already logged
|
||||
// the dispatch attempt; re-logging here would double-count events.
|
||||
status, _, proxyErr := h.proxyA2ARequest(ctx, workspaceID, item.Body, callerID, false)
|
||||
status, respBody, proxyErr := h.proxyA2ARequest(ctx, workspaceID, item.Body, callerID, false)
|
||||
|
||||
// 202 Accepted = the dispatch was itself queued again (target still busy).
|
||||
// That's not a failure — the queued item just stays queued naturally on
|
||||
@ -321,4 +321,89 @@ func (h *WorkspaceHandler) DrainQueueForWorkspace(ctx context.Context, workspace
|
||||
MarkQueueItemCompleted(ctx, item.ID)
|
||||
log.Printf("A2AQueue drain: dispatched %s to workspace %s (attempt=%d)",
|
||||
item.ID, workspaceID, item.Attempts)
|
||||
|
||||
// Stitch the response back to the originating delegation row, if this
|
||||
// queue item was a delegation. Without this, check_task_status would
|
||||
// see status='queued' (set by the executeDelegation queued-branch) and
|
||||
// the LLM would think the work was never done. We embed delegation_id
|
||||
// in params.message.metadata at Delegate-handler time; pull it out
|
||||
// here and UPDATE the delegate_result row so the original caller can
|
||||
// observe the real reply.
|
||||
if delegationID := extractDelegationIDFromBody(item.Body); delegationID != "" {
|
||||
h.stitchDrainResponseToDelegation(ctx, callerID, item.WorkspaceID, delegationID, respBody)
|
||||
}
|
||||
}
|
||||
|
||||
// extractDelegationIDFromBody pulls params.message.metadata.delegation_id
|
||||
// out of an A2A JSON-RPC body. Empty string when absent — drain treats
|
||||
// that as "this queue item didn't originate from /workspaces/:id/delegate"
|
||||
// and skips the stitch, so non-delegation queue uses (cross-workspace
|
||||
// peer-direct A2A) aren't affected.
|
||||
func extractDelegationIDFromBody(body []byte) string {
|
||||
var envelope struct {
|
||||
Params struct {
|
||||
Message struct {
|
||||
Metadata struct {
|
||||
DelegationID string `json:"delegation_id"`
|
||||
} `json:"metadata"`
|
||||
} `json:"message"`
|
||||
} `json:"params"`
|
||||
}
|
||||
if err := json.Unmarshal(body, &envelope); err != nil {
|
||||
return ""
|
||||
}
|
||||
return envelope.Params.Message.Metadata.DelegationID
|
||||
}
|
||||
|
||||
// stitchDrainResponseToDelegation writes the drained response into the
|
||||
// delegation's existing delegate_result row (created with status='queued'
|
||||
// by executeDelegation when the proxy first returned queued). This is the
|
||||
// other half of the loop that closes "queued → completed" so the LLM's
|
||||
// check_task_status reflects ground truth.
|
||||
//
|
||||
// Errors are logged-only — drain is fire-and-forget from Heartbeat, and a
|
||||
// stitch failure shouldn't block other queued items. The delegation will
|
||||
// just remain stuck at 'queued' in this case, which is the pre-fix
|
||||
// behaviour (no regression vs. shipping nothing).
|
||||
func (h *WorkspaceHandler) stitchDrainResponseToDelegation(ctx context.Context, sourceID, targetID, delegationID string, respBody []byte) {
|
||||
if sourceID == "" || delegationID == "" {
|
||||
return
|
||||
}
|
||||
responseText := extractResponseText(respBody)
|
||||
respJSON, _ := json.Marshal(map[string]interface{}{
|
||||
"text": responseText,
|
||||
"delegation_id": delegationID,
|
||||
})
|
||||
res, err := db.DB.ExecContext(ctx, `
|
||||
UPDATE activity_logs
|
||||
SET status = 'completed',
|
||||
summary = $1,
|
||||
response_body = $2::jsonb
|
||||
WHERE workspace_id = $3
|
||||
AND method = 'delegate_result'
|
||||
AND target_id = $4
|
||||
AND response_body->>'delegation_id' = $5
|
||||
`, "Delegation completed ("+truncate(responseText, 80)+")", string(respJSON),
|
||||
sourceID, targetID, delegationID)
|
||||
if err != nil {
|
||||
log.Printf("A2AQueue drain stitch: update failed for delegation %s: %v", delegationID, err)
|
||||
return
|
||||
}
|
||||
if rows, _ := res.RowsAffected(); rows == 0 {
|
||||
log.Printf("A2AQueue drain stitch: no delegate_result row for delegation %s (queued-row may not exist yet)", delegationID)
|
||||
return
|
||||
}
|
||||
log.Printf("A2AQueue drain stitch: delegation %s queued → completed (%d chars)", delegationID, len(responseText))
|
||||
|
||||
// Broadcast DELEGATION_COMPLETE so the canvas chat feed flips the
|
||||
// "⏸ queued" line to "✓ completed" in real time. Without this the
|
||||
// transition only surfaces after the user reloads or polls activity.
|
||||
if h.broadcaster != nil {
|
||||
h.broadcaster.RecordAndBroadcast(ctx, "DELEGATION_COMPLETE", sourceID, map[string]interface{}{
|
||||
"delegation_id": delegationID,
|
||||
"target_id": targetID,
|
||||
"response_preview": truncate(responseText, 200),
|
||||
"via": "queue_drain",
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@ -80,6 +80,39 @@ func TestExtractIdempotencyKey_emptyOnMissing(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestExtractDelegationIDFromBody(t *testing.T) {
|
||||
cases := []struct {
|
||||
name string
|
||||
body string
|
||||
want string
|
||||
}{
|
||||
{
|
||||
name: "delegation body — metadata.delegation_id present",
|
||||
body: `{"method":"message/send","params":{"message":{"role":"user","messageId":"abc-123","metadata":{"delegation_id":"abc-123"},"parts":[{"type":"text","text":"hi"}]}}}`,
|
||||
want: "abc-123",
|
||||
},
|
||||
{
|
||||
name: "non-delegation body — no metadata (peer-direct A2A)",
|
||||
body: `{"method":"message/send","params":{"message":{"role":"user","messageId":"m-1","parts":[{"type":"text","text":"hi"}]}}}`,
|
||||
want: "",
|
||||
},
|
||||
{
|
||||
name: "metadata present but no delegation_id key",
|
||||
body: `{"params":{"message":{"metadata":{"trace_id":"t-1"}}}}`,
|
||||
want: "",
|
||||
},
|
||||
{"malformed JSON", `not json`, ""},
|
||||
{"empty body", ``, ""},
|
||||
}
|
||||
for _, tc := range cases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
if got := extractDelegationIDFromBody([]byte(tc.body)); got != tc.want {
|
||||
t.Errorf("extractDelegationIDFromBody = %q, want %q", got, tc.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// ──────────────────────────────────────────────────────────────────────────────
|
||||
// DrainQueueForWorkspace — nil-safe error extraction regression tests
|
||||
//
|
||||
|
||||
@ -286,6 +286,37 @@ func (h *ActivityHandler) Notify(c *gin.Context) {
|
||||
"name": wsName,
|
||||
})
|
||||
|
||||
// Persist to activity_logs so the chat history loader restores this
|
||||
// message after a page reload. Pre-fix, send_message_to_user pushes
|
||||
// were broadcast-only — survived the WebSocket session but vanished
|
||||
// when the user refreshed because nothing wrote them to the DB.
|
||||
//
|
||||
// Shape chosen to match the existing loader query
|
||||
// (`type=a2a_receive&source=canvas`):
|
||||
// - activity_type='a2a_receive' so it joins the same query path
|
||||
// - source_id=NULL so the canvas-source filter accepts it
|
||||
// - method='notify' to distinguish from real A2A receives in audits
|
||||
// - request_body=NULL so the loader doesn't append a duplicate
|
||||
// "user message" bubble for it
|
||||
// - response_body={"result": "<text>"} matches extractResponseText's
|
||||
// simplest branch ({result: string} → take verbatim)
|
||||
//
|
||||
// Errors are logged-only — broadcast already succeeded, the user
|
||||
// sees the message; persistence failure just means the message
|
||||
// won't survive reload (pre-fix behavior). Don't fail the whole
|
||||
// notify on a DB hiccup.
|
||||
respJSON, _ := json.Marshal(map[string]interface{}{"result": body.Message})
|
||||
preview := body.Message
|
||||
if len(preview) > 80 {
|
||||
preview = preview[:80] + "…"
|
||||
}
|
||||
if _, err := db.DB.ExecContext(c.Request.Context(), `
|
||||
INSERT INTO activity_logs (workspace_id, activity_type, method, summary, response_body, status)
|
||||
VALUES ($1, 'a2a_receive', 'notify', $2, $3::jsonb, 'ok')
|
||||
`, workspaceID, "Agent message: "+preview, string(respJSON)); err != nil {
|
||||
log.Printf("Notify: failed to persist message for %s: %v", workspaceID, err)
|
||||
}
|
||||
|
||||
c.JSON(http.StatusOK, gin.H{"status": "sent"})
|
||||
}
|
||||
|
||||
@ -373,7 +404,9 @@ func (h *ActivityHandler) Report(c *gin.Context) {
|
||||
}
|
||||
|
||||
// LogActivity inserts an activity log and optionally broadcasts via WebSocket.
|
||||
func LogActivity(ctx context.Context, broadcaster *events.Broadcaster, params ActivityParams) {
|
||||
// Takes events.EventEmitter (#1814) so callers passing a stub broadcaster
|
||||
// in tests no longer need to construct the full *events.Broadcaster.
|
||||
func LogActivity(ctx context.Context, broadcaster events.EventEmitter, params ActivityParams) {
|
||||
reqJSON, reqErr := json.Marshal(params.RequestBody)
|
||||
if reqErr != nil {
|
||||
log.Printf("LogActivity: failed to marshal request_body for %s: %v", params.WorkspaceID, reqErr)
|
||||
|
||||
@ -217,6 +217,86 @@ func TestActivityReport_RejectsUnknownType(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestNotify_PersistsToActivityLogsForReloadRecovery(t *testing.T) {
|
||||
// Regression guard for the "responses gone on reload" bug. send_message_to_user
|
||||
// pushes (which route through Notify) used to be broadcast-only — they
|
||||
// rendered in the canvas but vanished on page reload because nothing
|
||||
// wrote them to activity_logs. The chat history loader queries
|
||||
// `type=a2a_receive&source=canvas`, so the persisted row must:
|
||||
// - Use activity_type='a2a_receive' (loader's filter)
|
||||
// - Have source_id NULL (canvas-source filter)
|
||||
// - Carry the message text in response_body so extractResponseText
|
||||
// can reconstruct the agent reply on reload
|
||||
mockDB, mock, _ := sqlmock.New()
|
||||
defer mockDB.Close()
|
||||
db.DB = mockDB
|
||||
|
||||
// Workspace existence check
|
||||
mock.ExpectQuery(`SELECT name FROM workspaces`).
|
||||
WithArgs("ws-notify").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"name"}).AddRow("DD"))
|
||||
|
||||
// Persistence INSERT — verify shape
|
||||
mock.ExpectExec(`INSERT INTO activity_logs`).
|
||||
WithArgs(
|
||||
"ws-notify",
|
||||
sqlmock.AnyArg(), // summary
|
||||
sqlmock.AnyArg(), // response_body JSON
|
||||
).
|
||||
WillReturnResult(sqlmock.NewResult(1, 1))
|
||||
|
||||
broadcaster := newTestBroadcaster()
|
||||
handler := NewActivityHandler(broadcaster)
|
||||
|
||||
gin.SetMode(gin.TestMode)
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
c.Params = gin.Params{{Key: "id", Value: "ws-notify"}}
|
||||
body := `{"message":"agent reply that arrived after the sync POST timed out"}`
|
||||
c.Request = httptest.NewRequest("POST", "/workspaces/ws-notify/notify", strings.NewReader(body))
|
||||
c.Request.Header.Set("Content-Type", "application/json")
|
||||
handler.Notify(c)
|
||||
|
||||
if w.Code != http.StatusOK {
|
||||
t.Errorf("expected 200, got %d: %s", w.Code, w.Body.String())
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Errorf("DB expectations not met: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNotify_DBFailure_StillBroadcastsAnd200(t *testing.T) {
|
||||
// Persistence is best-effort — a DB hiccup must NOT block the
|
||||
// WebSocket push (which the user is already seeing in their open
|
||||
// canvas). Pre-fix the WS push always succeeded; we don't want
|
||||
// the new persistence step to regress that path.
|
||||
mockDB, mock, _ := sqlmock.New()
|
||||
defer mockDB.Close()
|
||||
db.DB = mockDB
|
||||
|
||||
mock.ExpectQuery(`SELECT name FROM workspaces`).
|
||||
WithArgs("ws-x").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"name"}).AddRow("DD"))
|
||||
mock.ExpectExec(`INSERT INTO activity_logs`).
|
||||
WillReturnError(fmt.Errorf("simulated db hiccup"))
|
||||
|
||||
broadcaster := newTestBroadcaster()
|
||||
handler := NewActivityHandler(broadcaster)
|
||||
|
||||
gin.SetMode(gin.TestMode)
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
c.Params = gin.Params{{Key: "id", Value: "ws-x"}}
|
||||
body := `{"message":"hi"}`
|
||||
c.Request = httptest.NewRequest("POST", "/workspaces/ws-x/notify", strings.NewReader(body))
|
||||
c.Request.Header.Set("Content-Type", "application/json")
|
||||
handler.Notify(c)
|
||||
|
||||
if w.Code != http.StatusOK {
|
||||
t.Errorf("DB failure must not break the response; got %d", w.Code)
|
||||
}
|
||||
}
|
||||
|
||||
// ==================== Direct unit tests for SessionSearch helpers ====================
|
||||
|
||||
// --- parseSessionSearchParams ---
|
||||
|
||||
227
workspace-server/internal/handlers/admin_workspace_images.go
Normal file
227
workspace-server/internal/handlers/admin_workspace_images.go
Normal file
@ -0,0 +1,227 @@
|
||||
package handlers
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/base64"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"log"
|
||||
"net/http"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/docker/docker/api/types/container"
|
||||
"github.com/docker/docker/api/types/filters"
|
||||
dockerimage "github.com/docker/docker/api/types/image"
|
||||
dockerclient "github.com/docker/docker/client"
|
||||
"github.com/gin-gonic/gin"
|
||||
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/provisioner"
|
||||
)
|
||||
|
||||
// AdminWorkspaceImagesHandler serves POST /admin/workspace-images/refresh — the
// production-side end of the runtime CD chain. Operators (or post-publish
// automation) hit this to (1) pull the latest workspace template images from
// GHCR via the Docker SDK and (2) recreate any running ws-* containers so
// they adopt the new image. Without this, a freshly-published runtime sits
// in the registry but containers keep running the old image until the next
// manual restart.
//
// On a SaaS deployment the deploy pipeline already pulls on every release,
// so the pull step is a no-op there; the recreate step is still the way to
// make running workspaces adopt the new image without a full host restart.
//
//	POST /admin/workspace-images/refresh
//
//	?runtime=claude-code  (optional; default = all 8 templates)
//	&recreate=true|false  (default true; false = pull only)
//
// Returns JSON {pulled: [...], failed: [...], recreated: [...]}
type AdminWorkspaceImagesHandler struct {
	// docker is the SDK client used for pull/list/inspect/remove.
	docker *dockerclient.Client
}

// NewAdminWorkspaceImagesHandler returns a handler backed by the given
// Docker SDK client.
func NewAdminWorkspaceImagesHandler(docker *dockerclient.Client) *AdminWorkspaceImagesHandler {
	return &AdminWorkspaceImagesHandler{docker: docker}
}

// allRuntimes is the canonical list mirroring docs/workspace-runtime-package.md.
// Update both when a new template is added. Used both as the default pull
// set and to validate the ?runtime query parameter.
var allRuntimes = []string{
	"claude-code", "langgraph", "crewai", "autogen",
	"deepagents", "hermes", "gemini-cli", "openclaw",
}

// refreshResult is the JSON body returned by Refresh. Slices are
// initialized non-nil so they encode as [] rather than null.
type refreshResult struct {
	Pulled    []string `json:"pulled"`    // runtimes whose image pull completed
	Failed    []string `json:"failed"`    // runtimes whose pull or drain failed
	Recreated []string `json:"recreated"` // container names force-removed for re-provision
}
|
||||
|
||||
// ghcrAuthHeader returns the base64-encoded JSON auth payload Docker's
|
||||
// ImagePull expects in PullOptions.RegistryAuth, or empty string when no
|
||||
// GHCR_USER/GHCR_TOKEN env is set (lets public images pull through).
|
||||
//
|
||||
// The Docker SDK doesn't read ~/.docker/config.json — every authenticated
|
||||
// pull needs an explicit RegistryAuth string. Format per the Docker
|
||||
// engine API: {"username":"…","password":"…","serveraddress":"ghcr.io"}
|
||||
// → base64-encoded JSON with no trailing padding stripped (engine handles
|
||||
// either form).
|
||||
func ghcrAuthHeader() string {
|
||||
user := strings.TrimSpace(os.Getenv("GHCR_USER"))
|
||||
token := strings.TrimSpace(os.Getenv("GHCR_TOKEN"))
|
||||
if user == "" || token == "" {
|
||||
return ""
|
||||
}
|
||||
payload := map[string]string{
|
||||
"username": user,
|
||||
"password": token,
|
||||
"serveraddress": "ghcr.io",
|
||||
}
|
||||
js, err := json.Marshal(payload)
|
||||
if err != nil {
|
||||
// Should be unreachable for a static map[string]string. Log so a
|
||||
// future contributor adding a non-marshallable field notices.
|
||||
log.Printf("workspace-images: failed to marshal GHCR auth: %v", err)
|
||||
return ""
|
||||
}
|
||||
return base64.URLEncoding.EncodeToString(js)
|
||||
}
|
||||
|
||||
// Refresh handles POST /admin/workspace-images/refresh: pulls template
// images (optionally one runtime via ?runtime=), then — unless
// ?recreate=false — force-removes running ws-* containers whose image
// matches a successfully pulled runtime so they re-provision on next use.
// Responds 200 with a refreshResult, 400 on unknown runtime, 500 when
// the container listing fails (pull results included as partial_result).
func (h *AdminWorkspaceImagesHandler) Refresh(c *gin.Context) {
	runtimes := allRuntimes
	if r := c.Query("runtime"); r != "" {
		// Accept a single runtime; reject anything not in the canonical
		// list so a typo doesn't silently no-op.
		found := false
		for _, known := range allRuntimes {
			if known == r {
				found = true
				break
			}
		}
		if !found {
			c.JSON(http.StatusBadRequest, gin.H{
				"error":          fmt.Sprintf("unknown runtime: %s", r),
				"known_runtimes": allRuntimes,
			})
			return
		}
		runtimes = []string{r}
	}
	recreate := c.DefaultQuery("recreate", "true") == "true"

	// Non-nil slices so the JSON result encodes [] instead of null.
	res := refreshResult{Pulled: []string{}, Failed: []string{}, Recreated: []string{}}
	auth := ghcrAuthHeader()

	// 1. Pull each template image via the Docker SDK. Soft-fail
	//    per-runtime so one missing image (e.g. unpublished template)
	//    doesn't abort the others. Each pull's progress stream is
	//    drained to completion — the engine treats early-close as
	//    "abandon", leaving partial layers around with no reference.
	pullCtx, cancel := context.WithTimeout(c.Request.Context(), 5*time.Minute)
	defer cancel()
	for _, rt := range runtimes {
		image := fmt.Sprintf("ghcr.io/molecule-ai/workspace-template-%s:latest", rt)
		opts := dockerimage.PullOptions{Platform: provisioner.DefaultImagePlatform()}
		if auth != "" {
			opts.RegistryAuth = auth
		}
		rc, err := h.docker.ImagePull(pullCtx, image, opts)
		if err != nil {
			log.Printf("workspace-images/refresh: pull %s failed: %v", rt, err)
			res.Failed = append(res.Failed, rt)
			continue
		}
		// Drain to completion. Progress payload is discarded because no
		// caller renders it; the platform log already records
		// pulled/failed per runtime. If a future caller wants live
		// progress, decode the JSON-line stream into events here.
		if _, err := io.Copy(io.Discard, rc); err != nil {
			rc.Close()
			log.Printf("workspace-images/refresh: drain %s failed: %v", rt, err)
			res.Failed = append(res.Failed, rt)
			continue
		}
		rc.Close()
		res.Pulled = append(res.Pulled, rt)
	}

	if !recreate {
		// Pull-only mode: report what happened and stop.
		c.JSON(http.StatusOK, res)
		return
	}

	// 2. Find ws-* containers running an image we just pulled. Recreate
	//    them — kill+remove and let the platform's normal provisioning
	//    flow re-create on next canvas interaction.
	listCtx, listCancel := context.WithTimeout(c.Request.Context(), 30*time.Second)
	defer listCancel()
	containers, err := h.docker.ContainerList(listCtx, container.ListOptions{
		All:     true,
		Filters: filters.NewArgs(filters.Arg("name", "ws-")),
	})
	if err != nil {
		log.Printf("workspace-images/refresh: container list failed: %v", err)
		c.JSON(http.StatusInternalServerError, gin.H{"error": "container list failed", "partial_result": res})
		return
	}

	pulledSet := map[string]struct{}{}
	for _, rt := range res.Pulled {
		pulledSet[rt] = struct{}{}
	}
	for _, ctr := range containers {
		// ContainerList's ctr.Image is the *resolved digest* (sha256:…),
		// not the human-readable tag. ContainerInspect yields the
		// original Config.Image (e.g. "ghcr.io/molecule-ai/workspace-
		// template-claude-code:latest") so we can match against the
		// pulled-runtime set. The cost is one extra round-trip per ws-*
		// container — typically at most 8, well below any UX threshold.
		inspectCtx, inspectCancel := context.WithTimeout(c.Request.Context(), 10*time.Second)
		full, err := h.docker.ContainerInspect(inspectCtx, ctr.ID)
		inspectCancel()
		if err != nil {
			log.Printf("workspace-images/refresh: inspect %s failed: %v", ctr.ID[:12], err)
			continue
		}
		imageRef := ""
		if full.Config != nil {
			imageRef = full.Config.Image
		}
		matched := ""
		for rt := range pulledSet {
			if strings.Contains(imageRef, "workspace-template-"+rt) {
				matched = rt
				break
			}
		}
		if matched == "" {
			continue
		}
		// NOTE(review): assumes ctr.Names is non-empty for listed
		// containers — confirm the SDK guarantees this; an empty slice
		// would panic here.
		name := strings.TrimPrefix(ctr.Names[0], "/")
		// Remove with force — the workspace will re-provision on the
		// next canvas interaction. This drops in-flight conversations on
		// the removed container; documented via the response so callers
		// can schedule the refresh during a quiet window.
		rmCtx, rmCancel := context.WithTimeout(c.Request.Context(), 30*time.Second)
		err = h.docker.ContainerRemove(rmCtx, ctr.ID, container.RemoveOptions{Force: true})
		rmCancel()
		if err != nil {
			log.Printf("workspace-images/refresh: remove %s failed: %v", name, err)
			continue
		}
		res.Recreated = append(res.Recreated, name)
	}

	authStatus := "no GHCR auth (public images only)"
	if auth != "" {
		authStatus = "GHCR_USER/GHCR_TOKEN auth"
	}
	log.Printf("workspace-images/refresh: pulled=%d failed=%d recreated=%d (%s)",
		len(res.Pulled), len(res.Failed), len(res.Recreated), authStatus)
	c.JSON(http.StatusOK, res)
}
|
||||
@ -0,0 +1,73 @@
|
||||
package handlers
|
||||
|
||||
import (
|
||||
"encoding/base64"
|
||||
"encoding/json"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestGHCRAuthHeader_NoEnvReturnsEmpty(t *testing.T) {
|
||||
t.Setenv("GHCR_USER", "")
|
||||
t.Setenv("GHCR_TOKEN", "")
|
||||
if got := ghcrAuthHeader(); got != "" {
|
||||
t.Errorf("expected empty (no auth → public-only), got %q", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestGHCRAuthHeader_PartialEnvReturnsEmpty(t *testing.T) {
|
||||
// Both must be set — defensive against half-configured env.
|
||||
t.Setenv("GHCR_USER", "alice")
|
||||
t.Setenv("GHCR_TOKEN", "")
|
||||
if got := ghcrAuthHeader(); got != "" {
|
||||
t.Errorf("user-only env should disable auth, got %q", got)
|
||||
}
|
||||
t.Setenv("GHCR_USER", "")
|
||||
t.Setenv("GHCR_TOKEN", "fake-tok-xxx")
|
||||
if got := ghcrAuthHeader(); got != "" {
|
||||
t.Errorf("token-only env should disable auth, got %q", got)
|
||||
}
|
||||
}
|
||||
|
||||
// Happy path: with both credentials set, the header must round-trip
// through base64url + JSON back to the exact credential triple the
// Docker engine API expects.
func TestGHCRAuthHeader_EncodesDockerEnginePayload(t *testing.T) {
	t.Setenv("GHCR_USER", "alice")
	t.Setenv("GHCR_TOKEN", "fake-tok-value")
	got := ghcrAuthHeader()
	if got == "" {
		t.Fatal("expected non-empty auth header")
	}
	// The encoding must be base64url (what the engine's X-Registry-Auth
	// parsing accepts), not plain StdEncoding.
	raw, err := base64.URLEncoding.DecodeString(got)
	if err != nil {
		t.Fatalf("auth header is not valid base64-url: %v", err)
	}
	var payload map[string]string
	if err := json.Unmarshal(raw, &payload); err != nil {
		t.Fatalf("decoded auth is not valid JSON: %v (raw=%s)", err, raw)
	}
	if payload["username"] != "alice" {
		t.Errorf("username: got %q, want alice", payload["username"])
	}
	if payload["password"] != "fake-tok-value" {
		t.Errorf("password: got %q, want fake-tok-value", payload["password"])
	}
	if payload["serveraddress"] != "ghcr.io" {
		t.Errorf("serveraddress: got %q, want ghcr.io", payload["serveraddress"])
	}
}
|
||||
|
||||
func TestGHCRAuthHeader_TrimsWhitespace(t *testing.T) {
|
||||
// .env lines often have trailing newlines or accidental spaces. Without
|
||||
// trimming, a stray space would produce an auth payload the engine
|
||||
// rejects with a confusing 401.
|
||||
t.Setenv("GHCR_USER", " alice ")
|
||||
t.Setenv("GHCR_TOKEN", "\tfake-tok-value\n")
|
||||
got := ghcrAuthHeader()
|
||||
raw, _ := base64.URLEncoding.DecodeString(got)
|
||||
var payload map[string]string
|
||||
_ = json.Unmarshal(raw, &payload)
|
||||
if payload["username"] != "alice" {
|
||||
t.Errorf("username not trimmed: got %q", payload["username"])
|
||||
}
|
||||
if payload["password"] != "fake-tok-value" {
|
||||
t.Errorf("password not trimmed: got %q", payload["password"])
|
||||
}
|
||||
}
|
||||
415
workspace-server/internal/handlers/chat_files.go
Normal file
415
workspace-server/internal/handlers/chat_files.go
Normal file
@ -0,0 +1,415 @@
|
||||
package handlers
|
||||
|
||||
// chat_files.go — file upload/download for workspace chat.
|
||||
//
|
||||
// Split from templates.go because these endpoints have a different
|
||||
// security model (no /configs write, no template fallback) and a
|
||||
// different wire format (multipart in, binary-stream out). Template
|
||||
// files are agent workspace configuration; chat files are user-agent
|
||||
// conversation payloads.
|
||||
|
||||
import (
|
||||
"archive/tar"
|
||||
"bytes"
|
||||
"context"
|
||||
"crypto/rand"
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
"io"
|
||||
"log"
|
||||
"mime"
|
||||
"mime/multipart"
|
||||
"net/http"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"strings"
|
||||
|
||||
"github.com/docker/docker/api/types/container"
|
||||
"github.com/gin-gonic/gin"
|
||||
)
|
||||
|
||||
// ChatFilesHandler serves file upload + download for chat. It
// composes the existing TemplatesHandler's Docker plumbing
// (findContainer, execInContainer, copyFilesToContainer) rather than
// duplicating them, so a bug fix in the Docker layer propagates to
// both endpoints.
type ChatFilesHandler struct {
	// templates supplies the shared container-lookup / exec / copy helpers.
	templates *TemplatesHandler
}

// NewChatFilesHandler returns a handler that reuses t's Docker plumbing.
func NewChatFilesHandler(t *TemplatesHandler) *ChatFilesHandler {
	return &ChatFilesHandler{templates: t}
}

// chatUploadMaxBytes caps the full multipart request body so a
// malicious / runaway client can't OOM the server. 50 MB covers most
// documents + a handful of images per message; larger artefacts
// should go through git/S3 rather than chat.
const chatUploadMaxBytes = 50 * 1024 * 1024

// chatUploadMaxFileBytes caps individual files in a multi-file upload.
// Keeping the per-file cap below the total lets a user send, say, a
// 5 MB PDF + 10 screenshots without tripping the batch limit on any
// single attachment.
const chatUploadMaxFileBytes = 25 * 1024 * 1024

// chatUploadDir is the in-container path where user-uploaded chat
// attachments land. Under /workspace so the file persists with the
// workspace volume and is readable by the agent without any extra
// plumbing — the agent just reads from the URI path we return.
const chatUploadDir = "/workspace/.molecule/chat-uploads"
|
||||
|
||||
// unsafeFilenameChars matches any character outside the conservative
// {alnum, dot, underscore, dash} allow-list. Sanitization rewrites a
// character class at a time, so embedded paths, control chars,
// newlines, quotes, and shell metachars never reach the filesystem.
var unsafeFilenameChars = regexp.MustCompile(`[^a-zA-Z0-9._\-]`)

// contentDispositionAttachment renders a safe `attachment; filename=...`
// header value for name. CR/LF are dropped (they would terminate the
// header early), quote and backslash are escaped per RFC 6266 §4.1
// quoted-string rules, other control characters are dropped, and an
// RFC 5987 percent-encoded filename* parameter is appended so
// non-ASCII names survive. This matters because agents can write
// arbitrary filenames into /workspace, and anything they produce
// reaches this header via filepath.Base(path).
func contentDispositionAttachment(name string) string {
	var quoted strings.Builder
	for _, r := range name {
		switch {
		case r == '\r' || r == '\n':
			// Header-splitting characters — drop outright.
		case r == '"' || r == '\\':
			// RFC 6266 quoted-string escape.
			quoted.WriteByte('\\')
			quoted.WriteRune(r)
		case r < 0x20 || r == 0x7f:
			// Remaining control characters — drop.
		default:
			quoted.WriteRune(r)
		}
	}
	// filename= serves legacy clients; filename*= (UTF-8,
	// percent-encoded) is preferred when the client understands it.
	return fmt.Sprintf(`attachment; filename="%s"; filename*=UTF-8''%s`,
		quoted.String(), urlPathEscape(name))
}

// urlPathEscape percent-encodes every byte outside the RFC 3986
// unreserved set — stricter than net/url.PathEscape (which leaves "/"
// unescaped because it's legal in URL paths). Filenames must never
// contain "/" anyway, so escaping it is defence-in-depth against an
// agent that writes a path-like name.
func urlPathEscape(s string) string {
	const unreserved = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-._~"
	var out strings.Builder
	out.Grow(len(s))
	for i := 0; i < len(s); i++ {
		if c := s[i]; strings.IndexByte(unreserved, c) >= 0 {
			out.WriteByte(c)
		} else {
			fmt.Fprintf(&out, "%%%02X", c)
		}
	}
	return out.String()
}

// sanitizeFilename reduces an arbitrary client-supplied filename to a
// flat, safe basename: path components stripped, spaces and disallowed
// characters replaced with "_", length capped at 100 bytes while
// preserving a reasonable (≤16-byte) extension. Degenerate inputs
// ("", ".", "..") collapse to "file".
func sanitizeFilename(in string) string {
	name := filepath.Base(in)
	name = unsafeFilenameChars.ReplaceAllString(strings.ReplaceAll(name, " ", "_"), "_")
	if len(name) > 100 {
		// After sanitization the name is pure ASCII, so byte slicing
		// cannot split a multi-byte rune.
		ext := filepath.Ext(name)
		if len(ext) > 16 {
			ext = ""
		}
		name = name[:100-len(ext)] + ext
	}
	switch name {
	case "", ".", "..":
		return "file"
	}
	return name
}
|
||||
|
||||
// ChatUploadedFile is the per-file response returned from POST
// /workspaces/:id/chat/uploads. Clients include this payload (or a
// trimmed subset) in their outgoing A2A `message/send` parts.
type ChatUploadedFile struct {
	// URI uses a custom "workspace:" scheme so clients can resolve it
	// against the streaming Download endpoint regardless of where the
	// canvas itself is hosted. The path component is always absolute
	// within the workspace container.
	URI string `json:"uri"`
	// Name is the sanitized basename (without the random prefix).
	Name string `json:"name"`
	// MimeType comes from the part's Content-Type header, falling back
	// to extension-based inference; may be empty when neither resolves.
	MimeType string `json:"mimeType,omitempty"`
	// Size is the stored byte count after the per-file cap check.
	Size int64 `json:"size"`
}
|
||||
|
||||
// Upload handles POST /workspaces/:id/chat/uploads.
// Accepts multipart/form-data with one or more `files` fields, stages
// each under /workspace/.molecule/chat-uploads with a random 16-byte
// hex prefix, and returns the list of URIs for the caller to attach to
// an A2A message. Responds 400 on bad input, 413 on an oversized file,
// 503 when the workspace container isn't running, 500 on staging
// failures.
func (h *ChatFilesHandler) Upload(c *gin.Context) {
	workspaceID := c.Param("id")
	if err := validateWorkspaceID(workspaceID); err != nil {
		c.JSON(http.StatusBadRequest, gin.H{"error": "invalid workspace ID"})
		return
	}

	// Hard cap the request body BEFORE ParseMultipartForm — otherwise
	// a client could chunk-upload past the cap before Go notices.
	// NOTE(review): relies on net/http's post-handler MultipartForm
	// cleanup for any temp files ParseMultipartForm spills to disk —
	// confirm; otherwise add a deferred form.RemoveAll().
	c.Request.Body = http.MaxBytesReader(c.Writer, c.Request.Body, chatUploadMaxBytes)
	if err := c.Request.ParseMultipartForm(chatUploadMaxBytes); err != nil {
		c.JSON(http.StatusBadRequest, gin.H{"error": "failed to parse multipart form"})
		return
	}

	form := c.Request.MultipartForm
	var headers []*multipart.FileHeader
	if form != nil && form.File != nil {
		headers = form.File["files"]
	}
	if len(headers) == 0 {
		c.JSON(http.StatusBadRequest, gin.H{"error": "expected at least one 'files' field"})
		return
	}

	ctx := c.Request.Context()
	containerName := h.templates.findContainer(ctx, workspaceID)
	if containerName == "" {
		c.JSON(http.StatusServiceUnavailable, gin.H{"error": "workspace container not running"})
		return
	}

	// Build the archive in memory. Files are byte-preserving through
	// Go's string<->[]byte (the tar helper takes map[string]string but
	// the conversion is a literal copy, not a UTF-8 reinterpretation).
	archive := map[string]string{}
	uploaded := make([]ChatUploadedFile, 0, len(headers))
	for _, fh := range headers {
		// Fast reject based on the declared size…
		if fh.Size > chatUploadMaxFileBytes {
			c.JSON(http.StatusRequestEntityTooLarge, gin.H{
				"error": fmt.Sprintf("%s exceeds per-file limit (%d MB)", fh.Filename, chatUploadMaxFileBytes/(1024*1024)),
			})
			return
		}
		f, err := fh.Open()
		if err != nil {
			c.JSON(http.StatusBadRequest, gin.H{"error": "failed to read upload"})
			return
		}
		// …then LimitReader guards against a lying Size header: if the
		// multipart stream carries more bytes than declared, we stop at
		// the cap instead of growing the buffer.
		data, err := io.ReadAll(io.LimitReader(f, chatUploadMaxFileBytes+1))
		f.Close()
		if err != nil {
			c.JSON(http.StatusBadRequest, gin.H{"error": "failed to read upload"})
			return
		}
		if int64(len(data)) > chatUploadMaxFileBytes {
			c.JSON(http.StatusRequestEntityTooLarge, gin.H{
				"error": fmt.Sprintf("%s exceeds per-file limit (%d MB)", fh.Filename, chatUploadMaxFileBytes/(1024*1024)),
			})
			return
		}

		name := sanitizeFilename(fh.Filename)
		// 16-byte (UUID-equivalent) random prefix. Within a single
		// batch we also check for collisions — birthday on 128 bits
		// is astronomical, but a bad PRNG or single re-used draw
		// would silently overwrite a sibling upload with its own
		// content and return two URIs pointing at one file.
		var stored string
		for attempt := 0; attempt < 4; attempt++ {
			idBytes := make([]byte, 16)
			if _, err := rand.Read(idBytes); err != nil {
				c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to allocate upload ID"})
				return
			}
			candidate := hex.EncodeToString(idBytes) + "-" + name
			if _, taken := archive[candidate]; !taken {
				stored = candidate
				break
			}
		}
		if stored == "" {
			c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to allocate unique upload ID"})
			return
		}
		archive[stored] = string(data)

		// Prefer the client-declared content type, fall back to
		// extension-based inference; may legitimately end up empty.
		mt := fh.Header.Get("Content-Type")
		if mt == "" {
			mt = mime.TypeByExtension(filepath.Ext(name))
		}
		uploaded = append(uploaded, ChatUploadedFile{
			URI:      "workspace:" + chatUploadDir + "/" + stored,
			Name:     name,
			MimeType: mt,
			Size:     int64(len(data)),
		})
	}

	// mkdir -p is idempotent; we fire it every upload instead of
	// caching state here so container restarts don't surprise us.
	_, _ = h.templates.execInContainer(ctx, containerName, []string{"mkdir", "-p", chatUploadDir})

	// Defence in depth: pre-remove each target path before extracting
	// the tar. An agent with write access to /workspace could in
	// theory race-create a symlink at <chatUploadDir>/<stored-name>
	// pointing at a sensitive in-container path (its own /etc/*,
	// mounted secrets). Docker's tar extraction on some drivers
	// follows pre-existing symlinks at the destination. `rm -f` the
	// exact stored-name closes that window — the random prefix on the
	// name makes a successful race effectively impossible, but this
	// guard costs nothing and documents the intent. Errors are
	// deliberately ignored: this is best-effort hardening.
	rmArgs := []string{"rm", "-f", "--"}
	for stored := range archive {
		rmArgs = append(rmArgs, chatUploadDir+"/"+stored)
	}
	_, _ = h.templates.execInContainer(ctx, containerName, rmArgs)

	if err := h.copyFlatToContainer(ctx, containerName, chatUploadDir, archive); err != nil {
		log.Printf("Chat upload copy failed for %s: %v", workspaceID, err)
		c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to stage files in workspace"})
		return
	}

	c.JSON(http.StatusOK, gin.H{"files": uploaded})
}
|
||||
|
||||
// copyFlatToContainer extracts one tar of flat files into destPath
// inside the container. Unlike the shared copyFilesToContainer helper
// (which prepends destPath into tar entry names — correct for its
// callers whose files live inside a nested tree), this helper writes
// tar entries with ONLY the flat filename so Docker's extraction at
// destPath lands them directly in destPath, not at
// destPath/destPath/... as the shared helper would.
// Filenames are validated to contain no path separator so nothing
// can escape destPath via an embedded "../" or a leading "/".
//
// files maps stored filename → raw content (string used purely as a
// byte container; the []byte conversion below is a literal copy).
// Returns an error on unsafe names, tar construction failure, or a
// failed CopyToContainer call.
func (h *ChatFilesHandler) copyFlatToContainer(ctx context.Context, containerName, destPath string, files map[string]string) error {
	if h.templates.docker == nil {
		return fmt.Errorf("docker not available")
	}
	var buf bytes.Buffer
	tw := tar.NewWriter(&buf)
	for name, content := range files {
		// Reject anything that could resolve outside destPath.
		if strings.ContainsAny(name, "/\\") || name == ".." || name == "." || name == "" {
			return fmt.Errorf("unsafe flat filename: %q", name)
		}
		data := []byte(content)
		if err := tw.WriteHeader(&tar.Header{
			Name:     name, // relative — Docker resolves against destPath
			Mode:     0644,
			Size:     int64(len(data)),
			Typeflag: tar.TypeReg,
		}); err != nil {
			return fmt.Errorf("tar header %q: %w", name, err)
		}
		if _, err := tw.Write(data); err != nil {
			return fmt.Errorf("tar write %q: %w", name, err)
		}
	}
	// Close flushes the tar footer; without it the archive is truncated.
	if err := tw.Close(); err != nil {
		return fmt.Errorf("tar close: %w", err)
	}
	return h.templates.docker.CopyToContainer(ctx, containerName, destPath, &buf, container.CopyToContainerOptions{})
}
|
||||
|
||||
// Download handles GET /workspaces/:id/chat/download?path=<abs path>.
// Streams the file bytes from the container with a correct
// Content-Type and attachment Content-Disposition. Binary-safe —
// unlike the existing JSON ReadFile endpoint which carries content
// as a string (lossy for non-UTF-8 bytes). Responds 400 on path
// validation failures, 503 when Docker or the container is
// unavailable, 404 when the copy-out fails.
func (h *ChatFilesHandler) Download(c *gin.Context) {
	workspaceID := c.Param("id")
	if err := validateWorkspaceID(workspaceID); err != nil {
		c.JSON(http.StatusBadRequest, gin.H{"error": "invalid workspace ID"})
		return
	}

	path := c.Query("path")
	if path == "" {
		c.JSON(http.StatusBadRequest, gin.H{"error": "path query required"})
		return
	}
	if !filepath.IsAbs(path) {
		c.JSON(http.StatusBadRequest, gin.H{"error": "path must be absolute"})
		return
	}
	// Path must land under one of the allowed roots — mirrors the
	// ReadFile security model and prevents arbitrary reads of /etc
	// or other system paths via this endpoint.
	rooted := false
	for root := range allowedRoots {
		if path == root || strings.HasPrefix(path, root+"/") {
			rooted = true
			break
		}
	}
	if !rooted {
		c.JSON(http.StatusBadRequest, gin.H{"error": "path must be under /configs, /workspace, /home, or /plugins"})
		return
	}
	// Reject anything that canonicalises differently or contains a
	// traversal segment. Defence-in-depth on top of the prefix check
	// (note this also rejects legitimate names containing "..", a
	// deliberate strictness trade-off).
	if filepath.Clean(path) != path || strings.Contains(path, "..") {
		c.JSON(http.StatusBadRequest, gin.H{"error": "invalid path"})
		return
	}

	ctx := c.Request.Context()
	if h.templates.docker == nil {
		c.JSON(http.StatusServiceUnavailable, gin.H{"error": "docker unavailable"})
		return
	}
	containerName := h.templates.findContainer(ctx, workspaceID)
	if containerName == "" {
		c.JSON(http.StatusServiceUnavailable, gin.H{"error": "workspace container not running"})
		return
	}

	// docker cp returns a tar stream containing the requested path.
	// For a regular file that's a single tar entry; we extract and
	// stream the body through.
	reader, _, err := h.templates.docker.CopyFromContainer(ctx, containerName, path)
	if err != nil {
		c.JSON(http.StatusNotFound, gin.H{"error": "file not found"})
		return
	}
	defer reader.Close()

	tr := tar.NewReader(reader)
	hdr, err := tr.Next()
	if err != nil {
		c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to read archive"})
		return
	}
	if hdr.Typeflag != tar.TypeReg {
		// Directories, symlinks etc. are not downloadable here.
		c.JSON(http.StatusBadRequest, gin.H{"error": "path is not a regular file"})
		return
	}

	name := filepath.Base(path)
	mt := mime.TypeByExtension(filepath.Ext(name))
	if mt == "" {
		mt = "application/octet-stream"
	}
	c.Header("Content-Type", mt)
	c.Header("Content-Length", fmt.Sprintf("%d", hdr.Size))
	c.Header("Content-Disposition", contentDispositionAttachment(name))
	c.Status(http.StatusOK)

	// Stream exactly hdr.Size bytes. CopyN was chosen over LimitReader
	// because it returns an error when the source is short — that
	// surfaces a bug in the tar extraction path immediately instead
	// of silently truncating. Agents can legitimately produce files
	// larger than the 50 MB upload cap (that's a per-request inbound
	// cap, not a per-artifact one), so we cannot clamp here. Headers
	// are already sent at this point, so the error can only be logged.
	if _, err := io.CopyN(c.Writer, tr, hdr.Size); err != nil {
		log.Printf("Chat download stream error for %s (%s): %v", workspaceID, path, err)
	}
}
|
||||
194
workspace-server/internal/handlers/chat_files_test.go
Normal file
194
workspace-server/internal/handlers/chat_files_test.go
Normal file
@ -0,0 +1,194 @@
|
||||
package handlers
|
||||
|
||||
// Unit tests for chat_files.go. The Docker-touching paths (Upload
|
||||
// actually copying into a container, Download actually streaming tar)
|
||||
// are exercised via integration tests — docker-in-docker is out of
|
||||
// scope for the unit suite. These tests cover the validation + error
|
||||
// surfaces that a caller can reach without a running container.
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"mime/multipart"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/gin-gonic/gin"
|
||||
)
|
||||
|
||||
func TestSanitizeFilename(t *testing.T) {
|
||||
cases := []struct {
|
||||
in, want string
|
||||
}{
|
||||
{"report.pdf", "report.pdf"},
|
||||
{"my file.pdf", "my_file.pdf"},
|
||||
{"../../etc/passwd", "passwd"},
|
||||
{"weird;$name`.txt", "weird__name_.txt"},
|
||||
{"", "file"},
|
||||
{".", "file"},
|
||||
{"..", "file"},
|
||||
}
|
||||
for _, tc := range cases {
|
||||
got := sanitizeFilename(tc.in)
|
||||
if got != tc.want {
|
||||
t.Errorf("sanitizeFilename(%q) = %q, want %q", tc.in, got, tc.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestSanitizeFilename_LongNamePreservesExtension(t *testing.T) {
|
||||
// 120-char base + .pdf — the helper should truncate the base but
|
||||
// keep the extension intact so content-type inference still works.
|
||||
longBase := strings.Repeat("a", 120)
|
||||
got := sanitizeFilename(longBase + ".pdf")
|
||||
if len(got) > 100 {
|
||||
t.Errorf("filename not truncated: len=%d", len(got))
|
||||
}
|
||||
if !strings.HasSuffix(got, ".pdf") {
|
||||
t.Errorf("extension stripped: %q", got)
|
||||
}
|
||||
}
|
||||
|
||||
// A malformed :id must be rejected with 400 before any container
// lookup or form parsing happens.
func TestChatUpload_InvalidWorkspaceID(t *testing.T) {
	setupTestDB(t)
	setupTestRedis(t)

	tmplh := NewTemplatesHandler(t.TempDir(), nil)
	h := NewChatFilesHandler(tmplh)

	w := httptest.NewRecorder()
	c, _ := gin.CreateTestContext(w)
	c.Params = gin.Params{{Key: "id", Value: "not-a-uuid"}}
	// nil body is fine — validation fails before the body is read.
	c.Request = httptest.NewRequest("POST", "/workspaces/not-a-uuid/chat/uploads", nil)

	h.Upload(c)

	if w.Code != http.StatusBadRequest {
		t.Errorf("expected 400 on invalid workspace id, got %d: %s", w.Code, w.Body.String())
	}
}

// A well-formed multipart body that lacks the 'files' field must 400
// with an error message that names the missing field.
func TestChatUpload_MissingFiles(t *testing.T) {
	setupTestDB(t)
	setupTestRedis(t)

	tmplh := NewTemplatesHandler(t.TempDir(), nil)
	h := NewChatFilesHandler(tmplh)

	// Multipart body with no `files` field — only a text field.
	var buf bytes.Buffer
	mw := multipart.NewWriter(&buf)
	_ = mw.WriteField("other", "value")
	mw.Close()

	w := httptest.NewRecorder()
	c, _ := gin.CreateTestContext(w)
	c.Params = gin.Params{{Key: "id", Value: "00000000-0000-0000-0000-000000000001"}}
	req := httptest.NewRequest("POST", "/workspaces/00000000-0000-0000-0000-000000000001/chat/uploads", &buf)
	req.Header.Set("Content-Type", mw.FormDataContentType())
	c.Request = req

	h.Upload(c)

	if w.Code != http.StatusBadRequest {
		t.Errorf("expected 400 when files field missing, got %d: %s", w.Code, w.Body.String())
	}
	if !strings.Contains(w.Body.String(), "files") {
		t.Errorf("expected error to mention files field: %s", w.Body.String())
	}
}
|
||||
|
||||
func TestChatDownload_InvalidPath(t *testing.T) {
|
||||
setupTestDB(t)
|
||||
setupTestRedis(t)
|
||||
|
||||
tmplh := NewTemplatesHandler(t.TempDir(), nil)
|
||||
h := NewChatFilesHandler(tmplh)
|
||||
|
||||
cases := []struct {
|
||||
name, path, wantSubstr string
|
||||
}{
|
||||
{"empty", "", "path query required"},
|
||||
{"relative", "workspace/foo.txt", "must be absolute"},
|
||||
{"wrong root", "/etc/passwd", "must be under"},
|
||||
{"traversal", "/workspace/../etc/passwd", "invalid path"},
|
||||
}
|
||||
for _, tc := range cases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
c.Params = gin.Params{{Key: "id", Value: "00000000-0000-0000-0000-000000000001"}}
|
||||
req := httptest.NewRequest("GET", "/workspaces/xxx/chat/download?path="+tc.path, nil)
|
||||
c.Request = req
|
||||
|
||||
h.Download(c)
|
||||
|
||||
if w.Code != http.StatusBadRequest {
|
||||
t.Errorf("expected 400 for %s, got %d: %s", tc.name, w.Code, w.Body.String())
|
||||
}
|
||||
if !strings.Contains(w.Body.String(), tc.wantSubstr) {
|
||||
t.Errorf("expected error to contain %q, got: %s", tc.wantSubstr, w.Body.String())
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestContentDispositionAttachment_Escapes(t *testing.T) {
|
||||
cases := []struct {
|
||||
name, input, wantSubstr string
|
||||
}{
|
||||
{
|
||||
name: "plain ASCII passes through",
|
||||
input: "report.pdf",
|
||||
wantSubstr: `filename="report.pdf"`,
|
||||
},
|
||||
{
|
||||
name: "double-quote is backslash-escaped",
|
||||
input: `weird".pdf`,
|
||||
wantSubstr: `filename="weird\".pdf"`,
|
||||
},
|
||||
{
|
||||
name: "CR and LF dropped to prevent header injection",
|
||||
input: "bad\r\nX-Leak: 1\r\n.txt",
|
||||
wantSubstr: `filename="badX-Leak: 1.txt"`,
|
||||
},
|
||||
{
|
||||
name: "non-ASCII emits filename* percent-encoded",
|
||||
input: "résumé.pdf",
|
||||
wantSubstr: "filename*=UTF-8''r%C3%A9sum%C3%A9.pdf",
|
||||
},
|
||||
}
|
||||
for _, tc := range cases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
got := contentDispositionAttachment(tc.input)
|
||||
if !strings.Contains(got, tc.wantSubstr) {
|
||||
t.Errorf("contentDispositionAttachment(%q) = %q, missing substring %q", tc.input, got, tc.wantSubstr)
|
||||
}
|
||||
// Must never contain a bare CR or LF — either would end the header.
|
||||
if strings.ContainsAny(got, "\r\n") {
|
||||
t.Errorf("header contains CR/LF: %q", got)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestChatDownload_DockerUnavailable(t *testing.T) {
|
||||
setupTestDB(t)
|
||||
setupTestRedis(t)
|
||||
|
||||
tmplh := NewTemplatesHandler(t.TempDir(), nil) // docker=nil
|
||||
h := NewChatFilesHandler(tmplh)
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
c.Params = gin.Params{{Key: "id", Value: "00000000-0000-0000-0000-000000000001"}}
|
||||
req := httptest.NewRequest("GET", "/workspaces/xxx/chat/download?path=/workspace/report.pdf", nil)
|
||||
c.Request = req
|
||||
|
||||
h.Download(c)
|
||||
|
||||
if w.Code != http.StatusServiceUnavailable {
|
||||
t.Errorf("expected 503 when docker is nil, got %d: %s", w.Code, w.Body.String())
|
||||
}
|
||||
}
|
||||
@ -78,13 +78,21 @@ func (h *DelegationHandler) Delegate(c *gin.Context) {
|
||||
// reason (logged); we still dispatch the A2A request and surface the
|
||||
// warning in the response.
|
||||
|
||||
// Build A2A payload
|
||||
// Build A2A payload. Embedding delegation_id in metadata gives the
|
||||
// queue drain path a way to look up the originating delegation row
|
||||
// when stitching the response back (issue: previously the drain
|
||||
// dispatched successfully but discarded the response, so
|
||||
// check_task_status returned status='queued' forever even after a
|
||||
// real reply landed). messageId mirrors delegation_id so the
|
||||
// platform's idempotency-key extraction also keys off the same id.
|
||||
a2aBody, _ := json.Marshal(map[string]interface{}{
|
||||
"method": "message/send",
|
||||
"params": map[string]interface{}{
|
||||
"message": map[string]interface{}{
|
||||
"role": "user",
|
||||
"parts": []map[string]interface{}{{"type": "text", "text": body.Task}},
|
||||
"role": "user",
|
||||
"messageId": delegationID,
|
||||
"parts": []map[string]interface{}{{"type": "text", "text": body.Task}},
|
||||
"metadata": map[string]interface{}{"delegation_id": delegationID},
|
||||
},
|
||||
},
|
||||
})
|
||||
@ -284,6 +292,40 @@ func (h *DelegationHandler) executeDelegation(sourceID, targetID, delegationID s
|
||||
return
|
||||
}
|
||||
|
||||
// 202 + {queued: true} means the target was busy and the proxy
|
||||
// enqueued the request for the next drain tick — NOT a completion.
|
||||
// Treat it as such: write a clean 'queued' activity row with no
|
||||
// JSON-as-text leakage into the summary, broadcast a status update,
|
||||
// and return. The eventual drain doesn't (yet) feed a result back
|
||||
// into this delegation, so callers polling check_task_status will
|
||||
// see status='queued' and know to retry instead of believing the
|
||||
// queued JSON is the agent's reply. Fixes the chat-leak where the
|
||||
// LLM echoed "Delegation completed (workspace agent busy ...)" to
|
||||
// the user.
|
||||
if status == http.StatusAccepted && isQueuedProxyResponse(respBody) {
|
||||
log.Printf("Delegation %s: target %s busy — queued for drain", delegationID, targetID)
|
||||
h.updateDelegationStatus(sourceID, delegationID, "queued", "")
|
||||
// Store delegation_id in response_body so DrainQueueForWorkspace's
|
||||
// stitch step can find this row by JSON-path key after the queued
|
||||
// dispatch eventually succeeds. Without the key, the drain finds
|
||||
// the row by (workspace_id, target_id, method) but can't tell
|
||||
// multiple-queued-delegations-to-same-target apart.
|
||||
queuedJSON, _ := json.Marshal(map[string]interface{}{
|
||||
"delegation_id": delegationID,
|
||||
"queued": true,
|
||||
})
|
||||
if _, err := db.DB.ExecContext(ctx, `
|
||||
INSERT INTO activity_logs (workspace_id, activity_type, method, source_id, target_id, summary, response_body, status)
|
||||
VALUES ($1, 'delegation', 'delegate_result', $2, $3, $4, $5::jsonb, 'queued')
|
||||
`, sourceID, sourceID, targetID, "Delegation queued — target at capacity", string(queuedJSON)); err != nil {
|
||||
log.Printf("Delegation %s: failed to insert queued log: %v", delegationID, err)
|
||||
}
|
||||
h.broadcaster.RecordAndBroadcast(ctx, "DELEGATION_STATUS", sourceID, map[string]interface{}{
|
||||
"delegation_id": delegationID, "target_id": targetID, "status": "queued",
|
||||
})
|
||||
return
|
||||
}
|
||||
|
||||
// A2A returned 200 — target received and processed the task
|
||||
// Status: dispatched → received → completed (we don't have a separate "received" signal from the target yet)
|
||||
responseText := extractResponseText(respBody)
|
||||
@ -517,6 +559,21 @@ func isTransientProxyError(err *proxyA2AError) bool {
|
||||
return false
|
||||
}
|
||||
|
||||
// isQueuedProxyResponse reports whether the proxy body looks like the
// busy-target enqueue payload (`{"queued": true, "queue_id": ...,
// "queue_depth": ..., "message": ...}`) emitted by a2a_proxy_helpers.go.
// Callers pair this with HTTP 202 to tell a deferred dispatch apart from a
// real agent reply; without that distinction the queued-message JSON would
// be recorded as the delegation result and the LLM would surface it as
// agent output. Bodies that are not valid JSON, lack the "queued" key, or
// carry a non-bool value all yield false.
func isQueuedProxyResponse(body []byte) bool {
	var parsed map[string]interface{}
	if err := json.Unmarshal(body, &parsed); err != nil {
		return false
	}
	flag, ok := parsed["queued"].(bool)
	return ok && flag
}
|
||||
|
||||
func extractResponseText(body []byte) string {
|
||||
var resp map[string]interface{}
|
||||
if json.Unmarshal(body, &resp) != nil {
|
||||
|
||||
@ -376,6 +376,39 @@ func TestIsTransientProxyError_RetriesOnRestartRaceStatuses(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestIsQueuedProxyResponse(t *testing.T) {
|
||||
// Regression guard for the chat-leak bug: when the proxy returns
|
||||
// 202 with a queued-shape body, executeDelegation must classify it
|
||||
// as "queued" — not "completed". Mis-classifying it causes the
|
||||
// queued JSON to land in activity_logs.summary, which the LLM then
|
||||
// echoes verbatim into the agent chat as
|
||||
// "Delegation completed: Delegation completed (workspace agent
|
||||
// busy — request queued, will dispatch...)".
|
||||
cases := []struct {
|
||||
name string
|
||||
body string
|
||||
want bool
|
||||
}{
|
||||
{
|
||||
name: "real proxy busy-enqueue body",
|
||||
body: `{"queued":true,"queue_id":"d0993390-5f5a-4f5d-90a2-66639e53e3c9","queue_depth":1,"message":"workspace agent busy — request queued, will dispatch when capacity available"}`,
|
||||
want: true,
|
||||
},
|
||||
{"queued false explicitly", `{"queued":false}`, false},
|
||||
{"queued field absent (real A2A reply)", `{"jsonrpc":"2.0","id":"1","result":{"kind":"message","parts":[{"kind":"text","text":"hi"}]}}`, false},
|
||||
{"non-bool queued value (defensive)", `{"queued":"true"}`, false},
|
||||
{"malformed JSON", `not-json`, false},
|
||||
{"empty body", ``, false},
|
||||
}
|
||||
for _, tc := range cases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
if got := isQueuedProxyResponse([]byte(tc.body)); got != tc.want {
|
||||
t.Errorf("isQueuedProxyResponse(%q) = %v, want %v", tc.body, got, tc.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestDelegationRetryDelay_IsSaneWindow(t *testing.T) {
|
||||
// Regression guard: the retry delay must be long enough for the
|
||||
// reactive URL refresh in proxyA2ARequest to kick in (which involves
|
||||
|
||||
@ -11,6 +11,7 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/DATA-DOG/go-sqlmock"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/models"
|
||||
"github.com/gin-gonic/gin"
|
||||
)
|
||||
|
||||
@ -31,7 +32,7 @@ func TestWorkspaceCreate_WithParentID(t *testing.T) {
|
||||
mock.ExpectBegin()
|
||||
// Default tier is 3 (Privileged) — see workspace.go create-handler comment.
|
||||
mock.ExpectExec("INSERT INTO workspaces").
|
||||
WithArgs(sqlmock.AnyArg(), "Child Agent", nil, 3, "langgraph", sqlmock.AnyArg(), &parentID, nil, "none", (*int64)(nil)).
|
||||
WithArgs(sqlmock.AnyArg(), "Child Agent", nil, 3, "langgraph", sqlmock.AnyArg(), &parentID, nil, "none", (*int64)(nil), models.DefaultMaxConcurrentTasks).
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
mock.ExpectCommit()
|
||||
mock.ExpectExec("INSERT INTO canvas_layouts").
|
||||
@ -66,7 +67,7 @@ func TestWorkspaceCreate_ExplicitClaudeCodeRuntime(t *testing.T) {
|
||||
|
||||
mock.ExpectBegin()
|
||||
mock.ExpectExec("INSERT INTO workspaces").
|
||||
WithArgs(sqlmock.AnyArg(), "CC Agent", nil, 2, "claude-code", sqlmock.AnyArg(), (*string)(nil), nil, "none", (*int64)(nil)).
|
||||
WithArgs(sqlmock.AnyArg(), "CC Agent", nil, 2, "claude-code", sqlmock.AnyArg(), (*string)(nil), nil, "none", (*int64)(nil), models.DefaultMaxConcurrentTasks).
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
mock.ExpectCommit()
|
||||
mock.ExpectExec("INSERT INTO canvas_layouts").
|
||||
@ -277,6 +278,40 @@ func TestWorkspaceList_WithData(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// ---------- workspace.go: Create with explicit max_concurrent_tasks ----------
|
||||
|
||||
func TestWorkspaceCreate_MaxConcurrentTasksOverride(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
setupTestRedis(t)
|
||||
broadcaster := newTestBroadcaster()
|
||||
handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir())
|
||||
|
||||
mock.ExpectBegin()
|
||||
mock.ExpectExec("INSERT INTO workspaces").
|
||||
WithArgs(sqlmock.AnyArg(), "Leader Agent", nil, 3, "claude-code", sqlmock.AnyArg(), (*string)(nil), nil, "none", (*int64)(nil), 3).
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
mock.ExpectCommit()
|
||||
mock.ExpectExec("INSERT INTO canvas_layouts").
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
mock.ExpectExec("INSERT INTO structure_events").
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
body := `{"name":"Leader Agent","runtime":"claude-code","max_concurrent_tasks":3}`
|
||||
c.Request = httptest.NewRequest("POST", "/workspaces", bytes.NewBufferString(body))
|
||||
c.Request.Header.Set("Content-Type", "application/json")
|
||||
|
||||
handler.Create(c)
|
||||
|
||||
if w.Code != http.StatusCreated {
|
||||
t.Errorf("expected 201, got %d: %s", w.Code, w.Body.String())
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Errorf("unmet expectations: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// ---------- registry.go: Register with provisioner URL preserved ----------
|
||||
|
||||
func TestRegister_ProvisionerURLPreserved(t *testing.T) {
|
||||
|
||||
@ -291,7 +291,7 @@ func TestWorkspaceCreate(t *testing.T) {
|
||||
// Expect workspace INSERT (uuid is dynamic, use AnyArg for id, runtime, awareness_namespace).
|
||||
// Default tier is 3 (Privileged) — see workspace.go create-handler comment.
|
||||
mock.ExpectExec("INSERT INTO workspaces").
|
||||
WithArgs(sqlmock.AnyArg(), "Test Agent", nil, 3, "langgraph", sqlmock.AnyArg(), (*string)(nil), nil, "none", (*int64)(nil)).
|
||||
WithArgs(sqlmock.AnyArg(), "Test Agent", nil, 3, "langgraph", sqlmock.AnyArg(), (*string)(nil), nil, "none", (*int64)(nil), models.DefaultMaxConcurrentTasks).
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
// Expect transaction commit (no secrets in this payload)
|
||||
|
||||
@ -5,6 +5,7 @@ package handlers
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"log"
|
||||
"net/http"
|
||||
@ -180,6 +181,108 @@ func NewOrgHandler(wh *WorkspaceHandler, b *events.Broadcaster, p *provisioner.P
|
||||
}
|
||||
}
|
||||
|
||||
// EnvRequirement models one entry of a template's required/recommended
// env list: either a single env var name (strict — exactly that var must
// be configured) or an any-of group (any one listed member satisfies it).
//
// Accepted YAML shapes:
//
//	required_env:
//	  - GITHUB_TOKEN                                          # single
//	  - any_of: [ANTHROPIC_API_KEY, CLAUDE_CODE_OAUTH_TOKEN]  # OR group
//
// The any-of form exists because some runtimes accept either of two
// credential shapes — Claude Code takes ANTHROPIC_API_KEY or an OAuth
// token interchangeably, and forcing an org template to pick one would
// falsely block the other. For JSON (GET /org/templates) the same shapes
// round-trip: strings stay strings, groups stay {any_of: [...]}.
type EnvRequirement struct {
	// Name holds the env var for the single-requirement form; empty
	// for groups.
	Name string
	// AnyOf holds the members of an OR group; one configured member
	// satisfies the whole group.
	AnyOf []string
}

// Members returns every env name this requirement can be satisfied by:
// a one-element slice for the single form, the full AnyOf list for
// groups. Used by preflight, collect, and the name-validation regex gate.
func (e EnvRequirement) Members() []string {
	if e.Name == "" {
		return e.AnyOf
	}
	return []string{e.Name}
}

// IsSatisfied reports whether at least one member of the requirement is
// present in `configured` — exact match for singles, any-one-hit for
// groups.
func (e EnvRequirement) IsSatisfied(configured map[string]struct{}) bool {
	for _, name := range e.Members() {
		if _, found := configured[name]; found {
			return true
		}
	}
	return false
}
|
||||
|
||||
// UnmarshalYAML accepts either a scalar (string → single) or a map
|
||||
// with an `any_of` list (→ group).
|
||||
func (e *EnvRequirement) UnmarshalYAML(value *yaml.Node) error {
|
||||
if value.Kind == yaml.ScalarNode {
|
||||
var s string
|
||||
if err := value.Decode(&s); err != nil {
|
||||
return err
|
||||
}
|
||||
e.Name = s
|
||||
return nil
|
||||
}
|
||||
var alt struct {
|
||||
AnyOf []string `yaml:"any_of"`
|
||||
}
|
||||
if err := value.Decode(&alt); err != nil {
|
||||
return fmt.Errorf("env requirement must be a string or {any_of: [...]}: %w", err)
|
||||
}
|
||||
if len(alt.AnyOf) == 0 {
|
||||
return fmt.Errorf("env requirement any_of must contain at least one env var")
|
||||
}
|
||||
e.AnyOf = alt.AnyOf
|
||||
return nil
|
||||
}
|
||||
|
||||
// MarshalJSON emits the dual shape so GET /org/templates callers get
|
||||
// {"required_env": ["GITHUB_TOKEN", {"any_of": [...]}]}, matching
|
||||
// the YAML syntax.
|
||||
func (e EnvRequirement) MarshalJSON() ([]byte, error) {
|
||||
if e.Name != "" {
|
||||
return json.Marshal(e.Name)
|
||||
}
|
||||
return json.Marshal(struct {
|
||||
AnyOf []string `json:"any_of"`
|
||||
}{AnyOf: e.AnyOf})
|
||||
}
|
||||
|
||||
// UnmarshalJSON is the inverse — accepts the same dual shape so
|
||||
// POST /org/import with an inline `template` body works too.
|
||||
func (e *EnvRequirement) UnmarshalJSON(data []byte) error {
|
||||
var s string
|
||||
if err := json.Unmarshal(data, &s); err == nil {
|
||||
e.Name = s
|
||||
return nil
|
||||
}
|
||||
var alt struct {
|
||||
AnyOf []string `json:"any_of"`
|
||||
}
|
||||
if err := json.Unmarshal(data, &alt); err != nil {
|
||||
return fmt.Errorf("env requirement must be a string or {any_of: [...]}: %w", err)
|
||||
}
|
||||
if len(alt.AnyOf) == 0 {
|
||||
return fmt.Errorf("env requirement any_of must contain at least one env var")
|
||||
}
|
||||
e.AnyOf = alt.AnyOf
|
||||
return nil
|
||||
}
|
||||
|
||||
// OrgTemplate is the YAML structure for an org hierarchy.
|
||||
type OrgTemplate struct {
|
||||
Name string `yaml:"name" json:"name"`
|
||||
@ -189,6 +292,18 @@ type OrgTemplate struct {
|
||||
// GlobalMemories is a list of org-wide memories seeded as GLOBAL scope
|
||||
// on the first root workspace (PM) during org import. Issue #1050.
|
||||
GlobalMemories []models.MemorySeed `yaml:"global_memories" json:"global_memories"`
|
||||
// RequiredEnv lists env vars that MUST be configured globally (or
|
||||
// on every workspace in the subtree that needs them) before import
|
||||
// succeeds. Each entry is either a plain string (strict) or an
|
||||
// {any_of: [...]} group (at least one member must be set). Declared
|
||||
// at the org level for shared creds; also extensible per-workspace
|
||||
// via OrgWorkspace.RequiredEnv for team-scoped credentials.
|
||||
RequiredEnv []EnvRequirement `yaml:"required_env" json:"required_env"`
|
||||
// RecommendedEnv is the "nice-to-have" tier — import still succeeds
|
||||
// without them, but features degrade. Same single|any_of shape as
|
||||
// RequiredEnv so a recommended OR group reads "set any one of these
|
||||
// to unlock the feature; all missing = warning".
|
||||
RecommendedEnv []EnvRequirement `yaml:"recommended_env" json:"recommended_env"`
|
||||
}
|
||||
|
||||
type OrgDefaults struct {
|
||||
@ -287,15 +402,27 @@ type OrgWorkspace struct {
|
||||
// InitialMemories are memories seeded into this workspace at creation
|
||||
// time. If empty, defaults.initial_memories are used. Issue #1050.
|
||||
InitialMemories []models.MemorySeed `yaml:"initial_memories" json:"initial_memories"`
|
||||
Schedules []OrgSchedule `yaml:"schedules" json:"schedules"`
|
||||
Channels []OrgChannel `yaml:"channels" json:"channels"`
|
||||
// MaxConcurrentTasks: see models.CreateWorkspacePayload.
|
||||
MaxConcurrentTasks int `yaml:"max_concurrent_tasks" json:"max_concurrent_tasks"`
|
||||
Schedules []OrgSchedule `yaml:"schedules" json:"schedules"`
|
||||
Channels []OrgChannel `yaml:"channels" json:"channels"`
|
||||
External bool `yaml:"external" json:"external"`
|
||||
URL string `yaml:"url" json:"url"`
|
||||
Canvas struct {
|
||||
X float64 `yaml:"x" json:"x"`
|
||||
Y float64 `yaml:"y" json:"y"`
|
||||
} `yaml:"canvas" json:"canvas"`
|
||||
Children []OrgWorkspace `yaml:"children" json:"children"`
|
||||
// RequiredEnv / RecommendedEnv declared at the workspace level
|
||||
// narrow down what a specific team needs beyond the org-wide union.
|
||||
// When GET /org/templates walks the tree, these flow up into
|
||||
// OrgTemplate.RequiredEnv / RecommendedEnv. A workspace's subtree
|
||||
// inherits: a parent declaring ANTHROPIC_API_KEY as required
|
||||
// means every descendant considers it required too (no override
|
||||
// needed at each leaf). Same single|any_of shape as the org-level
|
||||
// lists.
|
||||
RequiredEnv []EnvRequirement `yaml:"required_env" json:"required_env"`
|
||||
RecommendedEnv []EnvRequirement `yaml:"recommended_env" json:"recommended_env"`
|
||||
Children []OrgWorkspace `yaml:"children" json:"children"`
|
||||
}
|
||||
|
||||
// ListTemplates handles GET /org/templates — lists available org templates.
|
||||
@ -354,11 +481,18 @@ func (h *OrgHandler) ListTemplates(c *gin.Context) {
|
||||
continue
|
||||
}
|
||||
count := countWorkspaces(tmpl.Workspaces)
|
||||
// Walk the tree to collect required + recommended env union.
|
||||
// Canvas uses these to render a preflight modal BEFORE firing
|
||||
// the import — saves the user from a 15-workspace import that
|
||||
// dies one container at a time on missing creds.
|
||||
required, recommended := collectOrgEnv(&tmpl)
|
||||
templates = append(templates, map[string]interface{}{
|
||||
"dir": e.Name(),
|
||||
"name": tmpl.Name,
|
||||
"description": tmpl.Description,
|
||||
"workspaces": count,
|
||||
"dir": e.Name(),
|
||||
"name": tmpl.Name,
|
||||
"description": tmpl.Description,
|
||||
"workspaces": count,
|
||||
"required_env": required,
|
||||
"recommended_env": recommended,
|
||||
})
|
||||
}
|
||||
|
||||
@ -370,6 +504,13 @@ func (h *OrgHandler) Import(c *gin.Context) {
|
||||
var body struct {
|
||||
Dir string `json:"dir"` // org template directory name
|
||||
Template OrgTemplate `json:"template"` // or inline template
|
||||
// Force skips the required-env preflight. Used by tooling
|
||||
// that already computed the preflight client-side and wants
|
||||
// to proceed despite missing creds (usually because the
|
||||
// user explicitly acknowledged the tradeoff). Default behavior
|
||||
// refuses the import with a 412 and the missing-key list so
|
||||
// the canvas can surface them in its preflight modal.
|
||||
Force bool `json:"force"`
|
||||
}
|
||||
if err := c.ShouldBindJSON(&body); err != nil {
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": "invalid request body"})
|
||||
@ -415,6 +556,59 @@ func (h *OrgHandler) Import(c *gin.Context) {
|
||||
return
|
||||
}
|
||||
|
||||
// Required-env preflight — refuses import when any required_env is
|
||||
// missing from global_secrets (unless `force: true` overrides). The
|
||||
// canvas runs the same check client-side against GET /org/templates
|
||||
// output and shows a modal so users set keys before clicking Import;
|
||||
// this server-side check is the authoritative guard in case a caller
|
||||
// bypasses the UI (CLI, API clients, etc.). 412 Precondition Failed
|
||||
// carries the missing-key list so tooling can render the same
|
||||
// add-key flow.
|
||||
required, _ := collectOrgEnv(&tmpl)
|
||||
if body.Force {
|
||||
// Log the bypass so a post-incident search can find who
|
||||
// imported an org with missing creds. The common audit flow
|
||||
// treats log.Printf at INFO as the low-cost trail for
|
||||
// explicit-override actions — keeps force as a supported
|
||||
// knob but makes it investigable.
|
||||
log.Printf("Org import: force=true bypass — template=%q, required_env=%v", tmpl.Name, required)
|
||||
} else if len(required) > 0 {
|
||||
ctx := c.Request.Context()
|
||||
configured, err := loadConfiguredGlobalSecretKeys(ctx)
|
||||
if err != nil {
|
||||
// Fail closed. Previously this fell through and imported
|
||||
// anyway, defeating the preflight for exactly the case
|
||||
// it's meant to cover. A DB hiccup should look like a
|
||||
// retryable 500, not a silent green light for an import
|
||||
// that will fail at container-start time on every node.
|
||||
log.Printf("Org import preflight: global secrets lookup failed: %v", err)
|
||||
c.JSON(http.StatusInternalServerError, gin.H{
|
||||
"error": "could not verify required environment variables; try again or pass force=true to override",
|
||||
})
|
||||
return
|
||||
}
|
||||
var missing []EnvRequirement
|
||||
for _, req := range required {
|
||||
// For a single requirement this is exact-match; for an
|
||||
// any-of group, any one member satisfies. Groups whose
|
||||
// alternative is already configured drop out here — the
|
||||
// user doesn't need to re-configure them.
|
||||
if !req.IsSatisfied(configured) {
|
||||
missing = append(missing, req)
|
||||
}
|
||||
}
|
||||
if len(missing) > 0 {
|
||||
c.JSON(http.StatusPreconditionFailed, gin.H{
|
||||
"error": "missing required environment variables",
|
||||
"missing_env": missing,
|
||||
"required_env": required,
|
||||
"template": tmpl.Name,
|
||||
"suggestion": "set these as global secrets (POST /settings/secrets) or pass force=true to override",
|
||||
})
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
results := []map[string]interface{}{}
|
||||
var createErr error
|
||||
|
||||
@ -426,7 +620,8 @@ func (h *OrgHandler) Import(c *gin.Context) {
|
||||
// using subtree-aware grid slots (children that are themselves
|
||||
// parents get a bigger slot so they don't overflow into siblings).
|
||||
for _, ws := range tmpl.Workspaces {
|
||||
if err := h.createWorkspaceTree(ws, nil, ws.Canvas.X, ws.Canvas.Y, tmpl.Defaults, orgBaseDir, &results, provisionSem); err != nil {
|
||||
// Root: relX/relY == absX/absY (no parent to be relative to).
|
||||
if err := h.createWorkspaceTree(ws, nil, ws.Canvas.X, ws.Canvas.Y, ws.Canvas.X, ws.Canvas.Y, tmpl.Defaults, orgBaseDir, &results, provisionSem); err != nil {
|
||||
createErr = err
|
||||
break
|
||||
}
|
||||
|
||||
@ -10,6 +10,8 @@ import (
|
||||
"log"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"sort"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
@ -28,7 +30,13 @@ import (
|
||||
// parent.abs + childSlotInGrid(index, siblingSizes) computed by the
|
||||
// caller. Storing already-absolute coords means a child that is itself
|
||||
// a parent can simply compound the grid without any per-call math.
|
||||
func (h *OrgHandler) createWorkspaceTree(ws OrgWorkspace, parentID *string, absX, absY float64, defaults OrgDefaults, orgBaseDir string, results *[]map[string]interface{}, provisionSem chan struct{}) error {
|
||||
// relX / relY are THIS workspace's position RELATIVE to its parent's
|
||||
// absolute origin (i.e. childSlotInGrid output for children; 0,0 for
|
||||
// roots since a root's absolute IS its relative). The broadcast
|
||||
// payload ships relative coords so the canvas can drop the node
|
||||
// straight into the parent's child-coordinate space without doing a
|
||||
// canvas-wide absolute-position walk.
|
||||
func (h *OrgHandler) createWorkspaceTree(ws OrgWorkspace, parentID *string, absX, absY, relX, relY float64, defaults OrgDefaults, orgBaseDir string, results *[]map[string]interface{}, provisionSem chan struct{}) error {
|
||||
// Apply defaults
|
||||
runtime := ws.Runtime
|
||||
if runtime == "" {
|
||||
@ -103,10 +111,14 @@ func (h *OrgHandler) createWorkspaceTree(ws OrgWorkspace, parentID *string, absX
|
||||
// (see canvas-topology.ts), so imports don't spray the viewport.
|
||||
initialCollapsed := false
|
||||
|
||||
maxConcurrent := ws.MaxConcurrentTasks
|
||||
if maxConcurrent <= 0 {
|
||||
maxConcurrent = models.DefaultMaxConcurrentTasks
|
||||
}
|
||||
_, err := db.DB.ExecContext(ctx, `
|
||||
INSERT INTO workspaces (id, name, role, tier, runtime, awareness_namespace, status, parent_id, workspace_dir, workspace_access)
|
||||
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10)
|
||||
`, id, ws.Name, role, tier, runtime, awarenessNS, "provisioning", parentID, workspaceDir, workspaceAccess)
|
||||
INSERT INTO workspaces (id, name, role, tier, runtime, awareness_namespace, status, parent_id, workspace_dir, workspace_access, max_concurrent_tasks)
|
||||
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11)
|
||||
`, id, ws.Name, role, tier, runtime, awarenessNS, "provisioning", parentID, workspaceDir, workspaceAccess, maxConcurrent)
|
||||
if err != nil {
|
||||
log.Printf("Org import: failed to create %s: %v", ws.Name, err)
|
||||
return fmt.Errorf("failed to create %s: %w", ws.Name, err)
|
||||
@ -128,10 +140,23 @@ func (h *OrgHandler) createWorkspaceTree(ws OrgWorkspace, parentID *string, absX
|
||||
}
|
||||
|
||||
// Broadcast — include runtime so the canvas pill renders the right
|
||||
// badge immediately instead of "unknown".
|
||||
h.broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_PROVISIONING", id, map[string]interface{}{
|
||||
// badge immediately instead of "unknown". parent_id + x/y let the
|
||||
// canvas's org-deploy animation spawn the child from the parent's
|
||||
// current coords and tween into its reserved slot, instead of
|
||||
// landing in a default grid position first and snapping on the
|
||||
// next hydrate.
|
||||
payload := map[string]interface{}{
|
||||
"name": ws.Name, "tier": tier, "runtime": runtime,
|
||||
})
|
||||
// Parent-relative coords — the canvas's React Flow node uses
|
||||
// these as the node's position when parent_id is set (React
|
||||
// Flow treats node.position as parent-relative when the node
|
||||
// has a parentId). For roots, relX/relY == absX/absY.
|
||||
"x": relX, "y": relY,
|
||||
}
|
||||
if parentID != nil {
|
||||
payload["parent_id"] = *parentID
|
||||
}
|
||||
h.broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_PROVISIONING", id, payload)
|
||||
|
||||
// Seed initial memories from workspace config or defaults (issue #1050).
|
||||
// Per-workspace initial_memories override defaults; if workspace has none,
|
||||
@ -509,7 +534,9 @@ func (h *OrgHandler) createWorkspaceTree(ws OrgWorkspace, parentID *string, absX
|
||||
slotX, slotY := childSlotInGrid(i, siblingSizes)
|
||||
childAbsX := absX + slotX
|
||||
childAbsY := absY + slotY
|
||||
if err := h.createWorkspaceTree(child, &id, childAbsX, childAbsY, defaults, orgBaseDir, results, provisionSem); err != nil {
|
||||
// slotX/slotY are already parent-relative — that's
|
||||
// exactly what childSlotInGrid returns.
|
||||
if err := h.createWorkspaceTree(child, &id, childAbsX, childAbsY, slotX, slotY, defaults, orgBaseDir, results, provisionSem); err != nil {
|
||||
return err
|
||||
}
|
||||
time.Sleep(workspaceCreatePacingMs * time.Millisecond)
|
||||
@ -519,6 +546,213 @@ func (h *OrgHandler) createWorkspaceTree(ws OrgWorkspace, parentID *string, absX
|
||||
return nil
|
||||
}
|
||||
|
||||
// envVarNamePattern guards template-supplied env var names against
// pathological inputs. A malicious template could ship
// required_env: ["'; DROP …"] or whitespace-only entries that would
// flow through collectOrgEnv → into the 412 response body and,
// worse, into the modal's PUT /settings/secrets input. Schema
// already has `key TEXT NOT NULL UNIQUE` and our queries are
// parameterised so SQL injection isn't the threat — the real risks
// are UI rendering weirdness (newlines, NUL bytes, zero-width chars)
// and downstream env-var semantics (POSIX requires uppercase +
// underscore + digit). A strict regex filters both classes of
// problem at a single choke point.
//
// Shape: must start with an uppercase letter, followed by up to 127
// uppercase letters, digits, or underscores — i.e. total length is
// capped at 128 characters.
var envVarNamePattern = regexp.MustCompile(`^[A-Z][A-Z0-9_]{0,127}$`)
|
||||
|
||||
// sanitizeEnvMembers filters a requirement's member list through the
|
||||
// name-validation regex, logging rejections. Returns the filtered
|
||||
// list and a boolean indicating whether any valid members remain.
|
||||
// Used so a group containing one valid + one bogus name is kept
|
||||
// (valid member carries the group) rather than silently dropped.
|
||||
func sanitizeEnvMembers(members []string, where string) ([]string, bool) {
|
||||
out := make([]string, 0, len(members))
|
||||
for _, k := range members {
|
||||
if !envVarNamePattern.MatchString(k) {
|
||||
if k != "" {
|
||||
log.Printf("collectOrgEnv: rejecting invalid env var name %q from %s (must match %s)", k, where, envVarNamePattern)
|
||||
}
|
||||
continue
|
||||
}
|
||||
out = append(out, k)
|
||||
}
|
||||
return out, len(out) > 0
|
||||
}
|
||||
|
||||
// envRequirementKey canonicalises a requirement's member set into a
// stable dedup key: the members, sorted and NUL-joined, so that
// `any_of: [A, B]` and `any_of: [B, A]` collapse to the same key.
// Strict single requirements are just the length-1 case. The slice
// is copied before sorting so the caller's member order is untouched.
func envRequirementKey(members []string) string {
	sorted := make([]string, len(members))
	copy(sorted, members)
	sort.Strings(sorted)
	return strings.Join(sorted, "\x00")
}
|
||||
|
||||
// collectOrgEnv walks the whole template tree and returns the union of
// required_env and recommended_env declared anywhere — at the org
// level, on root workspaces, or on any nested child. Deduplicates by
// group membership (same set of members = same requirement) and
// sorts deterministically so the canvas sees a stable order.
//
// "Required wins" rules:
//
//   - A requirement that appears in BOTH required and recommended
//     (same members) surfaces only as required.
//   - A single-name requirement (e.g. "API_KEY") and a group that
//     contains that same name (e.g. {any_of: [API_KEY, OTHER]}) are
//     NOT deduplicated — they're semantically different (strict vs
//     satisfiable-by-alternative) and the stricter "single" one wins,
//     so the any-of group is dropped when its members overlap with a
//     strict requirement declared elsewhere.
//
// Invalid names fail envVarNamePattern; the filter is applied per
// group so a group with one bogus entry keeps the rest. A group
// whose ALL members are invalid is dropped entirely with a log.
func collectOrgEnv(tmpl *OrgTemplate) (required, recommended []EnvRequirement) {
	reqByKey := map[string]EnvRequirement{}
	recByKey := map[string]EnvRequirement{}
	// Names covered by strict (single) required entries. A group in
	// EITHER tier whose any-of contains ONE of these names is
	// dominated by the strict requirement and gets dropped on the
	// second pass.
	strictRequiredNames := map[string]struct{}{}

	// accept sanitizes, canonicalises, and dedups one declaration list
	// into the target tier map. markStrict records single-name entries
	// for the cross-tier prune below — only the required tier marks.
	accept := func(into map[string]EnvRequirement, src []EnvRequirement, where string, markStrict bool) {
		for _, req := range src {
			members, ok := sanitizeEnvMembers(req.Members(), where)
			if !ok {
				// Every member was invalid — whole requirement dropped
				// (sanitizeEnvMembers already logged the rejects).
				continue
			}
			key := envRequirementKey(members)
			if _, exists := into[key]; exists {
				// First declaration wins; later identical sets are noise.
				continue
			}
			// A declaration is re-emitted as strict only when it was
			// authored strict (req.Name set) AND survived with exactly
			// one member. An any-of group that shrinks to one survivor
			// stays an AnyOf (req.Name == "" takes the else branch).
			if req.Name != "" && len(members) == 1 {
				into[key] = EnvRequirement{Name: members[0]}
				if markStrict {
					strictRequiredNames[members[0]] = struct{}{}
				}
			} else {
				into[key] = EnvRequirement{AnyOf: members}
			}
		}
	}
	accept(reqByKey, tmpl.RequiredEnv, "template root", true)
	accept(recByKey, tmpl.RecommendedEnv, "template root", false)
	// Depth-first walk over every workspace node; order doesn't matter
	// because pruning only runs after the whole tree is ingested.
	var walk func([]OrgWorkspace)
	walk = func(ws []OrgWorkspace) {
		for _, w := range ws {
			accept(reqByKey, w.RequiredEnv, "workspace "+w.Name, true)
			accept(recByKey, w.RecommendedEnv, "workspace "+w.Name, false)
			walk(w.Children)
		}
	}
	walk(tmpl.Workspaces)

	// Required wins across tiers: any requirement whose members
	// overlap with a strict required name gets dropped from
	// recommended. Keeps the canvas modal from showing the same
	// key in both sections. (Deleting while ranging a map is safe
	// in Go — deleted entries are simply not revisited.)
	prune := func(from map[string]EnvRequirement) {
		for k, r := range from {
			for _, m := range r.Members() {
				if _, strict := strictRequiredNames[m]; strict {
					delete(from, k)
					break
				}
			}
		}
	}
	prune(recByKey)

	// Same-tier: a strict required X dominates any-of groups in
	// required that CONTAIN X (a group saying "any of X, Y" is
	// automatically satisfied when X is required anyway, so it's
	// redundant). Same logic applied to recommended.
	pruneSameTier := func(tier map[string]EnvRequirement) {
		strictInTier := map[string]struct{}{}
		for _, r := range tier {
			if r.Name != "" {
				strictInTier[r.Name] = struct{}{}
			}
		}
		for k, r := range tier {
			if len(r.AnyOf) == 0 {
				// Strict singles are never pruned by this pass.
				continue
			}
			for _, m := range r.AnyOf {
				if _, strict := strictInTier[m]; strict {
					delete(tier, k)
					break
				}
			}
		}
	}
	pruneSameTier(reqByKey)
	pruneSameTier(recByKey)

	// Deterministic output order: singles by name first, then groups.
	required = flattenAndSortRequirements(reqByKey)
	recommended = flattenAndSortRequirements(recByKey)
	return required, recommended
}
|
||||
|
||||
func flattenAndSortRequirements(by map[string]EnvRequirement) []EnvRequirement {
|
||||
out := make([]EnvRequirement, 0, len(by))
|
||||
for _, r := range by {
|
||||
out = append(out, r)
|
||||
}
|
||||
sort.Slice(out, func(i, j int) bool {
|
||||
// Sort singles first by name; groups after, ordered by
|
||||
// joined-member string. Gives the canvas a deterministic
|
||||
// render order so the same template always produces the
|
||||
// same modal layout.
|
||||
iSingle := out[i].Name != ""
|
||||
jSingle := out[j].Name != ""
|
||||
if iSingle != jSingle {
|
||||
return iSingle
|
||||
}
|
||||
if iSingle {
|
||||
return out[i].Name < out[j].Name
|
||||
}
|
||||
return envRequirementKey(out[i].AnyOf) < envRequirementKey(out[j].AnyOf)
|
||||
})
|
||||
return out
|
||||
}
|
||||
|
||||
// loadConfiguredGlobalSecretKeys returns the set of key names present
// in global_secrets WHERE the encrypted_value is non-empty. Filtering
// on the payload size catches the failure mode where a row was
// upserted with an empty value (historical rows predating the
// binding:"required" guard on SetGlobal, or a future direct SQL
// path that skips it) — the preflight would otherwise report the
// key as "configured" and the per-container preflight would still
// fail at start time, defeating the whole feature.
// The LIMIT is a sanity cap: at realistic tenant sizes (< 1k
// secrets) it's a no-op; at pathological sizes it stops one slow
// query from wedging org imports. A hit gets logged so operators
// can investigate.
const globalSecretsPreflightLimit = 10000

// loadConfiguredGlobalSecretKeys queries global_secrets (see the
// const comment above for the filtering rationale) and returns the
// configured key names as a set. Errors from the query itself are
// returned; per-row scan errors are skipped (see note below).
func loadConfiguredGlobalSecretKeys(ctx context.Context) (map[string]struct{}, error) {
	rows, err := db.DB.QueryContext(ctx,
		`SELECT key FROM global_secrets WHERE octet_length(encrypted_value) > 0 LIMIT $1`,
		globalSecretsPreflightLimit)
	if err != nil {
		return nil, err
	}
	defer rows.Close()
	out := map[string]struct{}{}
	for rows.Next() {
		var k string
		// NOTE(review): a failing Scan silently skips the row — the
		// final rows.Err() does not cover per-row scan failures, so a
		// corrupt row under-reports configured keys. Presumably
		// acceptable for a best-effort preflight; confirm.
		if scanErr := rows.Scan(&k); scanErr == nil && k != "" {
			out[k] = struct{}{}
		}
	}
	// Exactly hitting the cap means the result may be truncated —
	// surface it for operators rather than failing the import.
	if len(out) == globalSecretsPreflightLimit {
		log.Printf("loadConfiguredGlobalSecretKeys: hit LIMIT %d — org-import preflight may be incomplete", globalSecretsPreflightLimit)
	}
	return out, rows.Err()
}
|
||||
|
||||
func countWorkspaces(workspaces []OrgWorkspace) int {
|
||||
count := len(workspaces)
|
||||
for _, ws := range workspaces {
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
package handlers
|
||||
|
||||
import (
|
||||
"sort"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
@ -650,3 +651,428 @@ func TestOrgImport_ScheduleComputeError(t *testing.T) {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Org env-preflight aggregation (collectOrgEnv)
|
||||
// ============================================================================
|
||||
|
||||
// strictReq builds a slice of single-name EnvRequirements for test
|
||||
// fixtures. Equivalent to the old []string literal but wrapped in
|
||||
// the new union shape.
|
||||
func strictReq(names ...string) []EnvRequirement {
|
||||
out := make([]EnvRequirement, 0, len(names))
|
||||
for _, n := range names {
|
||||
out = append(out, EnvRequirement{Name: n})
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// anyOfReq builds a single any-of EnvRequirement for test fixtures.
|
||||
func anyOfReq(names ...string) EnvRequirement {
|
||||
return EnvRequirement{AnyOf: append([]string(nil), names...)}
|
||||
}
|
||||
|
||||
// reqNames flattens a slice of EnvRequirements into a single comparable
|
||||
// slice: single-name reqs contribute their Name, any-of reqs contribute
|
||||
// "anyOf(A|B|C)" with members sorted for deterministic output. Lets
|
||||
// tests assert against a string form regardless of which kind each
|
||||
// entry takes.
|
||||
func reqNames(reqs []EnvRequirement) []string {
|
||||
out := make([]string, 0, len(reqs))
|
||||
for _, r := range reqs {
|
||||
if r.Name != "" {
|
||||
out = append(out, r.Name)
|
||||
continue
|
||||
}
|
||||
members := append([]string(nil), r.AnyOf...)
|
||||
sort.Strings(members)
|
||||
out = append(out, "anyOf("+strings.Join(members, "|")+")")
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// TestCollectOrgEnv_UnionAcrossLevels — declarations at the org root,
// a root workspace, and a nested leaf all surface in the aggregated
// output, sorted by name within each tier.
func TestCollectOrgEnv_UnionAcrossLevels(t *testing.T) {
	tmpl := &OrgTemplate{
		RequiredEnv:    strictReq("ANTHROPIC_API_KEY"),
		RecommendedEnv: strictReq("SLACK_WEBHOOK_URL"),
		Workspaces: []OrgWorkspace{
			{
				Name:        "Root",
				RequiredEnv: strictReq("GITHUB_TOKEN"),
				Children: []OrgWorkspace{
					{
						Name:           "Leaf",
						RequiredEnv:    strictReq("OPENROUTER_API_KEY"),
						RecommendedEnv: strictReq("DISCORD_WEBHOOK_URL"),
					},
				},
			},
		},
	}
	req, rec := collectOrgEnv(tmpl)
	// Required is the union of top-level + root + leaf.
	wantReq := []string{"ANTHROPIC_API_KEY", "GITHUB_TOKEN", "OPENROUTER_API_KEY"}
	if !stringSlicesEqual(reqNames(req), wantReq) {
		t.Errorf("required mismatch: got %v, want %v", reqNames(req), wantReq)
	}
	wantRec := []string{"DISCORD_WEBHOOK_URL", "SLACK_WEBHOOK_URL"}
	if !stringSlicesEqual(reqNames(rec), wantRec) {
		t.Errorf("recommended mismatch: got %v, want %v", reqNames(rec), wantRec)
	}
}
|
||||
|
||||
// TestCollectOrgEnv_RequiredWinsOverRecommended — cross-tier "required
// wins" for the same single key declared at different tree levels.
func TestCollectOrgEnv_RequiredWinsOverRecommended(t *testing.T) {
	// Same key declared at one layer as recommended and another as
	// required MUST surface only on the required side — a required
	// declaration is strictly stricter than a recommended one, and
	// listing it in both tiers would confuse the preflight modal.
	tmpl := &OrgTemplate{
		RecommendedEnv: strictReq("API_KEY"),
		Workspaces: []OrgWorkspace{
			{Name: "X", RequiredEnv: strictReq("API_KEY")},
		},
	}
	req, rec := collectOrgEnv(tmpl)
	if len(req) != 1 || req[0].Name != "API_KEY" {
		t.Errorf("required should contain API_KEY, got %v", reqNames(req))
	}
	for _, r := range rec {
		if r.Name == "API_KEY" {
			t.Errorf("API_KEY must not appear in recommended once required elsewhere")
		}
	}
}
|
||||
|
||||
// TestCollectOrgEnv_Dedup — the same strict key declared at four
// different points in the tree (including twice in one list) must
// collapse to a single requirement.
func TestCollectOrgEnv_Dedup(t *testing.T) {
	// Same key declared twice at different levels should appear once.
	tmpl := &OrgTemplate{
		RequiredEnv: strictReq("K", "K"),
		Workspaces: []OrgWorkspace{
			{Name: "A", RequiredEnv: strictReq("K")},
			{Name: "B", RequiredEnv: strictReq("K"), Children: []OrgWorkspace{
				{Name: "C", RequiredEnv: strictReq("K")},
			}},
		},
	}
	req, _ := collectOrgEnv(tmpl)
	if len(req) != 1 || req[0].Name != "K" {
		t.Errorf("dedup failed: got %v, want [K]", reqNames(req))
	}
}
|
||||
|
||||
// TestCollectOrgEnv_Empty — a zero-value template yields empty
// required/recommended output (no nil-map panics, no phantom entries).
func TestCollectOrgEnv_Empty(t *testing.T) {
	tmpl := &OrgTemplate{}
	req, rec := collectOrgEnv(tmpl)
	if len(req) != 0 || len(rec) != 0 {
		t.Errorf("empty template should produce empty slices, got req=%v rec=%v", reqNames(req), reqNames(rec))
	}
}
|
||||
|
||||
// stringSlicesEqual reports ordered, element-wise equality of two
// string slices — collectOrgEnv sorts its output, so callers can
// compare deterministically. nil and empty compare equal.
func stringSlicesEqual(a, b []string) bool {
	if len(a) != len(b) {
		return false
	}
	for i, v := range a {
		if v != b[i] {
			return false
		}
	}
	return true
}
|
||||
|
||||
// TestCollectOrgEnv_RequiredWinsOnSameStruct — required/recommended
// collision on a single workspace node resolves to required-only.
func TestCollectOrgEnv_RequiredWinsOnSameStruct(t *testing.T) {
	// The same key declared required AND recommended on the SAME
	// workspace node (rare but legal to parse) must still dedup
	// correctly and end up required-only.
	tmpl := &OrgTemplate{
		Workspaces: []OrgWorkspace{
			{
				Name:           "X",
				RequiredEnv:    strictReq("API_KEY"),
				RecommendedEnv: strictReq("API_KEY"),
			},
		},
	}
	req, rec := collectOrgEnv(tmpl)
	if len(req) != 1 || req[0].Name != "API_KEY" {
		t.Errorf("required should contain API_KEY once, got %v", reqNames(req))
	}
	for _, r := range rec {
		if r.Name == "API_KEY" {
			t.Errorf("API_KEY must not appear in recommended when also required on same struct")
		}
	}
}
|
||||
|
||||
// TestCollectOrgEnv_RejectsInvalidNames — the envVarNamePattern filter
// drops malformed names while letting valid ones through; output is
// sorted, so "A" precedes "VALID_ONE".
func TestCollectOrgEnv_RejectsInvalidNames(t *testing.T) {
	// Names failing envVarNamePattern (lowercase, traversal, whitespace,
	// shell metachars) must be dropped silently — the log line is not
	// asserted here; the output slice assertion is enough to prove the
	// filter fires.
	tmpl := &OrgTemplate{
		RequiredEnv: strictReq(
			"VALID_ONE",
			"lowercase_bad",
			"../../etc/passwd",
			"name with spaces",
			"WITH-DASH",
			"'; DROP TABLE users;--",
			"",
			"A", // single char — still valid per regex
		),
	}
	req, _ := collectOrgEnv(tmpl)
	if !stringSlicesEqual(reqNames(req), []string{"A", "VALID_ONE"}) {
		t.Errorf("expected only valid names, got %v", reqNames(req))
	}
}
|
||||
|
||||
// TestOrgTemplate_ClaudeAnyOfAuthPreflight exercises the shape the
// ux-ab-lab template ships with: a single any-of group at the org
// level covering ANTHROPIC_API_KEY vs. CLAUDE_CODE_OAUTH_TOKEN, plus
// two strict recommended entries (SERPER_API_KEY, VERCEL_TOKEN).
// Proves the end-to-end YAML → OrgTemplate → collectOrgEnv → IsSatisfied
// pipeline works for the canonical "Claude sub OR API key" pattern
// without depending on the on-disk template file (org-templates/ is
// populated by the clone-manifest, not tracked in this monorepo).
func TestOrgTemplate_ClaudeAnyOfAuthPreflight(t *testing.T) {
	src := `
name: UX A/B Lab
required_env:
  - any_of:
      - ANTHROPIC_API_KEY
      - CLAUDE_CODE_OAUTH_TOKEN
recommended_env:
  - SERPER_API_KEY
  - VERCEL_TOKEN
workspaces:
  - name: Design Director
    children:
      - name: UX Researcher
      - name: Visual Designer
      - name: React Engineer
      - name: Deploy Engineer
      - name: A11y + SEO Auditor
      - name: Perf Auditor
`
	var tmpl OrgTemplate
	if err := yaml.Unmarshal([]byte(src), &tmpl); err != nil {
		t.Fatalf("unmarshal: %v", err)
	}
	// Structural sanity first — a parse regression here would make the
	// env assertions below pass vacuously.
	if len(tmpl.Workspaces) != 1 || len(tmpl.Workspaces[0].Children) != 6 {
		t.Fatalf("expected 1 root with 6 children, got shape %+v", tmpl.Workspaces)
	}

	required, recommended := collectOrgEnv(&tmpl)
	if len(required) != 1 {
		t.Fatalf("expected 1 required requirement (the any-of group), got %d: %v", len(required), reqNames(required))
	}
	if required[0].Name != "" {
		t.Errorf("expected any-of group, got strict name %q", required[0].Name)
	}
	wantMembers := []string{"ANTHROPIC_API_KEY", "CLAUDE_CODE_OAUTH_TOKEN"}
	// Copy before sorting so the assertion doesn't mutate the result.
	got := append([]string(nil), required[0].AnyOf...)
	sort.Strings(got)
	if !stringSlicesEqual(got, wantMembers) {
		t.Errorf("any-of members mismatch: got %v, want %v", got, wantMembers)
	}

	// Either member should independently satisfy the group.
	if !required[0].IsSatisfied(map[string]struct{}{"ANTHROPIC_API_KEY": {}}) {
		t.Errorf("ANTHROPIC_API_KEY alone should satisfy the group")
	}
	if !required[0].IsSatisfied(map[string]struct{}{"CLAUDE_CODE_OAUTH_TOKEN": {}}) {
		t.Errorf("CLAUDE_CODE_OAUTH_TOKEN alone should satisfy the group")
	}
	if required[0].IsSatisfied(map[string]struct{}{"OPENAI_API_KEY": {}}) {
		t.Errorf("unrelated key should NOT satisfy the group")
	}

	wantRec := []string{"SERPER_API_KEY", "VERCEL_TOKEN"}
	if !stringSlicesEqual(reqNames(recommended), wantRec) {
		t.Errorf("recommended mismatch: got %v, want %v", reqNames(recommended), wantRec)
	}
}
|
||||
|
||||
// TestEnvRequirement_UnmarshalYAML proves the on-disk YAML shape
// (scalar OR `{any_of: [...]}` block) round-trips into EnvRequirement
// correctly. The preflight pipeline reads user-authored org.yaml
// files; a regression here would silently drop requirements.
func TestEnvRequirement_UnmarshalYAML(t *testing.T) {
	src := `
required_env:
  - GITHUB_TOKEN
  - any_of:
      - ANTHROPIC_API_KEY
      - CLAUDE_CODE_OAUTH_TOKEN
`
	var parsed struct {
		RequiredEnv []EnvRequirement `yaml:"required_env"`
	}
	if err := yaml.Unmarshal([]byte(src), &parsed); err != nil {
		t.Fatalf("unmarshal failed: %v", err)
	}
	if len(parsed.RequiredEnv) != 2 {
		t.Fatalf("want 2 requirements, got %d", len(parsed.RequiredEnv))
	}
	// Scalar entry → strict single-name requirement.
	if parsed.RequiredEnv[0].Name != "GITHUB_TOKEN" {
		t.Errorf("first should be strict GITHUB_TOKEN, got %+v", parsed.RequiredEnv[0])
	}
	// Mapping entry → any-of group with Name left empty.
	if parsed.RequiredEnv[1].Name != "" || len(parsed.RequiredEnv[1].AnyOf) != 2 {
		t.Errorf("second should be any-of group, got %+v", parsed.RequiredEnv[1])
	}
}
|
||||
|
||||
// TestEnvRequirement_UnmarshalYAML_RejectsEmptyAnyOf guards against a
// template that ships `any_of: []` — ambiguous semantics (impossible
// to satisfy), so the parser must fail loudly rather than silently
// pass a never-satisfiable requirement through the preflight.
func TestEnvRequirement_UnmarshalYAML_RejectsEmptyAnyOf(t *testing.T) {
	src := `
required_env:
  - any_of: []
`
	var parsed struct {
		RequiredEnv []EnvRequirement `yaml:"required_env"`
	}
	err := yaml.Unmarshal([]byte(src), &parsed)
	if err == nil {
		t.Errorf("expected error for empty any_of, got nil: %+v", parsed)
	}
}
|
||||
|
||||
// ---------------------------------------------------------------------
|
||||
// any_of group tests — the new EnvRequirement union shape allows a
|
||||
// single requirement to be satisfied by any of a list of members (e.g.
|
||||
// ANTHROPIC_API_KEY OR CLAUDE_CODE_OAUTH_TOKEN). collectOrgEnv +
|
||||
// IsSatisfied together must handle this correctly.
|
||||
// ---------------------------------------------------------------------
|
||||
|
||||
// TestEnvRequirement_IsSatisfied — table test over both union
// variants: strict singles match exactly; any-of groups are satisfied
// by ANY configured member, regardless of member order.
func TestEnvRequirement_IsSatisfied(t *testing.T) {
	configured := map[string]struct{}{
		"ANTHROPIC_API_KEY": {},
		"GITHUB_TOKEN":      {},
	}
	tests := []struct {
		name string
		req  EnvRequirement
		want bool
	}{
		{"strict present", EnvRequirement{Name: "ANTHROPIC_API_KEY"}, true},
		{"strict absent", EnvRequirement{Name: "MISSING_KEY"}, false},
		{"any-of first member present", anyOfReq("ANTHROPIC_API_KEY", "CLAUDE_CODE_OAUTH_TOKEN"), true},
		{"any-of second member present", anyOfReq("CLAUDE_CODE_OAUTH_TOKEN", "ANTHROPIC_API_KEY"), true},
		{"any-of none present", anyOfReq("OPENAI_API_KEY", "CLAUDE_CODE_OAUTH_TOKEN"), false},
		{"any-of single member present", anyOfReq("GITHUB_TOKEN"), true},
	}
	for _, tt := range tests {
		if got := tt.req.IsSatisfied(configured); got != tt.want {
			t.Errorf("%s: got %v, want %v", tt.name, got, tt.want)
		}
	}
}
|
||||
|
||||
// TestCollectOrgEnv_AnyOfGroupPreserved — an any-of group survives
// aggregation as one requirement carrying all of its members.
func TestCollectOrgEnv_AnyOfGroupPreserved(t *testing.T) {
	// A group with two alternatives should come through as a single
	// EnvRequirement carrying both members.
	tmpl := &OrgTemplate{
		RequiredEnv: []EnvRequirement{
			anyOfReq("ANTHROPIC_API_KEY", "CLAUDE_CODE_OAUTH_TOKEN"),
		},
	}
	req, _ := collectOrgEnv(tmpl)
	if len(req) != 1 {
		t.Fatalf("expected 1 requirement, got %d: %v", len(req), reqNames(req))
	}
	if req[0].Name != "" {
		t.Errorf("expected any-of group, got strict name %q", req[0].Name)
	}
	wantMembers := []string{"ANTHROPIC_API_KEY", "CLAUDE_CODE_OAUTH_TOKEN"}
	// Copy before sorting so the assertion doesn't mutate the result.
	got := append([]string(nil), req[0].AnyOf...)
	sort.Strings(got)
	if !stringSlicesEqual(got, wantMembers) {
		t.Errorf("any-of members mismatch: got %v, want %v", got, wantMembers)
	}
}
|
||||
|
||||
// TestCollectOrgEnv_AnyOfGroupDedup — identical groups (in any member
// order) declared at different tree levels collapse to one requirement
// via the canonical sorted-member key.
func TestCollectOrgEnv_AnyOfGroupDedup(t *testing.T) {
	// Two identical groups (members in different order) declared at
	// different levels must collapse to one.
	tmpl := &OrgTemplate{
		RequiredEnv: []EnvRequirement{
			anyOfReq("ANTHROPIC_API_KEY", "CLAUDE_CODE_OAUTH_TOKEN"),
		},
		Workspaces: []OrgWorkspace{
			{
				Name: "Root",
				RequiredEnv: []EnvRequirement{
					anyOfReq("CLAUDE_CODE_OAUTH_TOKEN", "ANTHROPIC_API_KEY"),
				},
			},
		},
	}
	req, _ := collectOrgEnv(tmpl)
	if len(req) != 1 {
		t.Errorf("expected 1 requirement after dedup, got %d: %v", len(req), reqNames(req))
	}
}
|
||||
|
||||
// TestCollectOrgEnv_StrictDominatesGroup — same-tier pruning: a strict
// required X drops any-of groups in the required tier that contain X.
func TestCollectOrgEnv_StrictDominatesGroup(t *testing.T) {
	// If a strict requirement X is declared anywhere, any-of groups
	// that CONTAIN X are redundant — the strict requirement will force
	// X to be configured, which satisfies any group mentioning it too.
	// Same-tier pruning drops the group.
	tmpl := &OrgTemplate{
		RequiredEnv: []EnvRequirement{
			{Name: "ANTHROPIC_API_KEY"},
			anyOfReq("ANTHROPIC_API_KEY", "CLAUDE_CODE_OAUTH_TOKEN"),
		},
	}
	req, _ := collectOrgEnv(tmpl)
	if len(req) != 1 || req[0].Name != "ANTHROPIC_API_KEY" {
		t.Errorf("strict should dominate group, got %v", reqNames(req))
	}
}
|
||||
|
||||
// TestCollectOrgEnv_StrictRequiredDominatesRecommendedGroup —
// cross-tier pruning: a strict required name removes recommended
// any-of groups that mention it, while unrelated recommended entries
// survive.
func TestCollectOrgEnv_StrictRequiredDominatesRecommendedGroup(t *testing.T) {
	// Cross-tier: a strict required X drops any-of groups in the
	// recommended tier that mention X.
	tmpl := &OrgTemplate{
		RequiredEnv: strictReq("ANTHROPIC_API_KEY"),
		RecommendedEnv: []EnvRequirement{
			anyOfReq("ANTHROPIC_API_KEY", "CLAUDE_CODE_OAUTH_TOKEN"),
			{Name: "SLACK_WEBHOOK_URL"},
		},
	}
	req, rec := collectOrgEnv(tmpl)
	if len(req) != 1 || req[0].Name != "ANTHROPIC_API_KEY" {
		t.Errorf("required mismatch: got %v", reqNames(req))
	}
	// The any-of group should have been pruned; only SLACK remains.
	if len(rec) != 1 || rec[0].Name != "SLACK_WEBHOOK_URL" {
		t.Errorf("recommended mismatch: got %v, want [SLACK_WEBHOOK_URL]", reqNames(rec))
	}
}
|
||||
|
||||
// TestCollectOrgEnv_AnyOfWithInvalidMemberKeepsValidOnes — per-member
// sanitization inside groups: a partially-invalid group survives via
// its valid members; a fully-invalid group is dropped outright.
func TestCollectOrgEnv_AnyOfWithInvalidMemberKeepsValidOnes(t *testing.T) {
	// A group with one valid + one invalid member should keep the
	// valid one (group carried by any remaining legitimate name). A
	// group where ALL members are invalid is dropped entirely.
	tmpl := &OrgTemplate{
		RequiredEnv: []EnvRequirement{
			anyOfReq("VALID_ONE", "lowercase_bad"),
			anyOfReq("'; DROP TABLE;--", ""),
		},
	}
	req, _ := collectOrgEnv(tmpl)
	if len(req) != 1 {
		t.Fatalf("expected 1 requirement, got %d: %v", len(req), reqNames(req))
	}
	// The surviving group has only one valid member. NOTE(review): the
	// assertion deliberately accepts either representation — but
	// collectOrgEnv's accept() only promotes to a single-name
	// requirement when req.Name was non-empty, so a one-survivor
	// any-of group stays AnyOf; the "promoted" wording below describes
	// intent, not the current code path. Confirm which is desired.
	if req[0].Name != "VALID_ONE" && !stringSlicesEqual(req[0].AnyOf, []string{"VALID_ONE"}) {
		t.Errorf("expected VALID_ONE to survive, got %v", reqNames(req))
	}
}
|
||||
|
||||
@ -454,6 +454,29 @@ func (h *RegistryHandler) evaluateStatus(c *gin.Context, payload models.Heartbea
|
||||
return
|
||||
}
|
||||
|
||||
// Self-reported runtime wedge: takes precedence over the error_rate
|
||||
// path. The heartbeat task lives in its own asyncio task and keeps
|
||||
// firing 200s even after claude_agent_sdk locks up on
|
||||
// `Control request timeout: initialize` — so error_rate stays at 0
|
||||
// (no calls have been recorded as errors yet) while every actual
|
||||
// /a2a POST hangs. The workspace tells us about that case via
|
||||
// runtime_state="wedged"; we honor it directly. Sample_error from
|
||||
// the heartbeat carries the human-readable reason ("SDK init
|
||||
// timeout — restart workspace"), which the canvas surfaces in the
|
||||
// degraded card without the operator scraping container logs.
|
||||
if payload.RuntimeState == "wedged" && currentStatus == "online" {
|
||||
_, err := db.DB.ExecContext(ctx,
|
||||
`UPDATE workspaces SET status = 'degraded', updated_at = now() WHERE id = $1 AND status = 'online'`,
|
||||
payload.WorkspaceID)
|
||||
if err != nil {
|
||||
log.Printf("Heartbeat: failed to mark %s degraded (wedged): %v", payload.WorkspaceID, err)
|
||||
}
|
||||
h.broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_DEGRADED", payload.WorkspaceID, map[string]interface{}{
|
||||
"runtime_state": "wedged",
|
||||
"sample_error": payload.SampleError,
|
||||
})
|
||||
}
|
||||
|
||||
if currentStatus == "online" && payload.ErrorRate >= 0.5 {
|
||||
if _, err := db.DB.ExecContext(ctx, `UPDATE workspaces SET status = 'degraded', updated_at = now() WHERE id = $1`, payload.WorkspaceID); err != nil {
|
||||
log.Printf("Heartbeat: failed to mark %s degraded: %v", payload.WorkspaceID, err)
|
||||
@ -464,7 +487,13 @@ func (h *RegistryHandler) evaluateStatus(c *gin.Context, payload models.Heartbea
|
||||
})
|
||||
}
|
||||
|
||||
if currentStatus == "degraded" && payload.ErrorRate < 0.1 {
|
||||
// Recovery from degraded → online when BOTH the error rate has
|
||||
// fallen back AND the workspace is no longer reporting a wedge.
|
||||
// The wedge condition is sticky for the process lifetime
|
||||
// (claude_sdk_executor only clears it on restart), so when the
|
||||
// container restarts and starts heartbeating fresh — RuntimeState
|
||||
// is empty, error_rate is 0 — this branch flips us back to online.
|
||||
if currentStatus == "degraded" && payload.ErrorRate < 0.1 && payload.RuntimeState == "" {
|
||||
if _, err := db.DB.ExecContext(ctx, `UPDATE workspaces SET status = 'online', updated_at = now() WHERE id = $1`, payload.WorkspaceID); err != nil {
|
||||
log.Printf("Heartbeat: failed to recover %s to online: %v", payload.WorkspaceID, err)
|
||||
}
|
||||
|
||||
@ -298,6 +298,163 @@ func TestHeartbeatHandler_OnlineStaysOnline(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// ==================== Heartbeat — runtime wedge (claude_agent_sdk init timeout) ====================
|
||||
|
||||
// TestHeartbeatHandler_RuntimeWedged_FlipsOnlineToDegraded verifies the
// runtime_state="wedged" path. Heartbeat task in the workspace lives in
// its own asyncio task and keeps reporting online while the Claude SDK
// is wedged on Control request timeout; the workspace tells us about
// the wedge via this field, and we honor it by flipping status →
// degraded with the wedge reason in last_sample_error.
func TestHeartbeatHandler_RuntimeWedged_FlipsOnlineToDegraded(t *testing.T) {
	mock := setupTestDB(t)
	setupTestRedis(t)
	broadcaster := newTestBroadcaster()
	handler := NewRegistryHandler(broadcaster)

	wedgeMsg := "claude_agent_sdk wedge: Control request timeout: initialize — restart workspace to recover"

	// Current-task lookup the handler performs before the heartbeat
	// UPDATE; an empty current_task keeps the rest of the flow simple.
	mock.ExpectQuery("SELECT COALESCE\\(current_task").
		WithArgs("ws-wedged").
		WillReturnRows(sqlmock.NewRows([]string{"current_task"}).AddRow(""))

	// Heartbeat UPDATE — sample_error carries the wedge reason from the
	// workspace's _runtime_state_payload() helper.
	mock.ExpectExec("UPDATE workspaces SET").
		WithArgs("ws-wedged", 0.0, wedgeMsg, 0, 600, "").
		WillReturnResult(sqlmock.NewResult(0, 1))

	// evaluateStatus: currentStatus = online
	mock.ExpectQuery("SELECT status FROM workspaces WHERE id =").
		WithArgs("ws-wedged").
		WillReturnRows(sqlmock.NewRows([]string{"status"}).AddRow("online"))

	// The wedge-handling branch fires the degraded UPDATE with the
	// `AND status = 'online'` guard (race-safe against concurrent
	// removal). Match the SQL with the guard included.
	mock.ExpectExec("UPDATE workspaces SET status = 'degraded'.*status = 'online'").
		WithArgs("ws-wedged").
		WillReturnResult(sqlmock.NewResult(0, 1))

	// RecordAndBroadcast for WORKSPACE_DEGRADED
	mock.ExpectExec("INSERT INTO structure_events").
		WillReturnResult(sqlmock.NewResult(0, 1))

	w := httptest.NewRecorder()
	c, _ := gin.CreateTestContext(w)

	// error_rate is deliberately 0.0: the wedge path must fire on
	// runtime_state alone, before the error-rate path would.
	body := `{"workspace_id":"ws-wedged","error_rate":0.0,"sample_error":"` + wedgeMsg + `","active_tasks":0,"uptime_seconds":600,"runtime_state":"wedged"}`
	c.Request = httptest.NewRequest("POST", "/registry/heartbeat", bytes.NewBufferString(body))
	c.Request.Header.Set("Content-Type", "application/json")

	handler.Heartbeat(c)

	if w.Code != http.StatusOK {
		t.Errorf("expected status 200, got %d: %s", w.Code, w.Body.String())
	}
	if err := mock.ExpectationsWereMet(); err != nil {
		t.Errorf("unmet sqlmock expectations: %v", err)
	}
}
|
||||
|
||||
// TestHeartbeatHandler_DegradedRecoversOnlyAfterWedgeClears verifies that
|
||||
// the degraded → online recovery path requires BOTH error_rate < 0.1
|
||||
// AND runtime_state cleared. A workspace still reporting wedged stays
|
||||
// degraded even when error_rate happens to be 0 (no calls have been
|
||||
// recorded as errors yet — the wedge is captured as a runtime state,
|
||||
// not an error count).
|
||||
func TestHeartbeatHandler_DegradedRecoversOnlyAfterWedgeClears(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
setupTestRedis(t)
|
||||
broadcaster := newTestBroadcaster()
|
||||
handler := NewRegistryHandler(broadcaster)
|
||||
|
||||
mock.ExpectQuery("SELECT COALESCE\\(current_task").
|
||||
WithArgs("ws-still-wedged").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"current_task"}).AddRow(""))
|
||||
|
||||
mock.ExpectExec("UPDATE workspaces SET").
|
||||
WithArgs("ws-still-wedged", 0.0, "still broken", 0, 800, "").
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
// currentStatus = degraded
|
||||
mock.ExpectQuery("SELECT status FROM workspaces WHERE id =").
|
||||
WithArgs("ws-still-wedged").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status"}).AddRow("degraded"))
|
||||
|
||||
// No additional UPDATE expected — the recovery branch's
|
||||
// `runtime_state == ""` guard blocks the flip back to online.
|
||||
// (sqlmock fails the test if any unmocked Exec runs.)
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
|
||||
body := `{"workspace_id":"ws-still-wedged","error_rate":0.0,"sample_error":"still broken","active_tasks":0,"uptime_seconds":800,"runtime_state":"wedged"}`
|
||||
c.Request = httptest.NewRequest("POST", "/registry/heartbeat", bytes.NewBufferString(body))
|
||||
c.Request.Header.Set("Content-Type", "application/json")
|
||||
|
||||
handler.Heartbeat(c)
|
||||
|
||||
if w.Code != http.StatusOK {
|
||||
t.Errorf("expected status 200, got %d: %s", w.Code, w.Body.String())
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Errorf("unmet sqlmock expectations: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestHeartbeatHandler_DegradedToOnline_AfterWedgeClears verifies the
|
||||
// happy-path recovery: a workspace previously marked degraded is
|
||||
// post-restart, error_rate is back to 0, and runtime_state is empty
|
||||
// (the new process re-imported claude_sdk_executor with the flag
|
||||
// fresh). Status flips back to online and a WORKSPACE_ONLINE event
|
||||
// fires.
|
||||
func TestHeartbeatHandler_DegradedToOnline_AfterWedgeClears(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
setupTestRedis(t)
|
||||
broadcaster := newTestBroadcaster()
|
||||
handler := NewRegistryHandler(broadcaster)
|
||||
|
||||
mock.ExpectQuery("SELECT COALESCE\\(current_task").
|
||||
WithArgs("ws-recovered").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"current_task"}).AddRow(""))
|
||||
|
||||
mock.ExpectExec("UPDATE workspaces SET").
|
||||
WithArgs("ws-recovered", 0.0, "", 0, 30, "").
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
mock.ExpectQuery("SELECT status FROM workspaces WHERE id =").
|
||||
WithArgs("ws-recovered").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status"}).AddRow("degraded"))
|
||||
|
||||
// Recovery UPDATE fires (degraded → online).
|
||||
mock.ExpectExec("UPDATE workspaces SET status = 'online'").
|
||||
WithArgs("ws-recovered").
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
mock.ExpectExec("INSERT INTO structure_events").
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
|
||||
// runtime_state intentionally absent (== ""); error_rate = 0; this
|
||||
// is exactly what a freshly-restarted workspace's first heartbeat
|
||||
// looks like.
|
||||
body := `{"workspace_id":"ws-recovered","error_rate":0.0,"sample_error":"","active_tasks":0,"uptime_seconds":30}`
|
||||
c.Request = httptest.NewRequest("POST", "/registry/heartbeat", bytes.NewBufferString(body))
|
||||
c.Request.Header.Set("Content-Type", "application/json")
|
||||
|
||||
handler.Heartbeat(c)
|
||||
|
||||
if w.Code != http.StatusOK {
|
||||
t.Errorf("expected status 200, got %d: %s", w.Code, w.Body.String())
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Errorf("unmet sqlmock expectations: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// ==================== UpdateCard ====================
|
||||
|
||||
func TestUpdateCard_Success(t *testing.T) {
|
||||
|
||||
@ -466,3 +466,70 @@ func (h *SecretsHandler) GetModel(c *gin.Context) {
|
||||
|
||||
c.JSON(http.StatusOK, gin.H{"model": string(decrypted), "source": "workspace_secrets"})
|
||||
}
|
||||
|
||||
// SetModel handles PUT /workspaces/:id/model — writes the model slug
|
||||
// into workspace_secrets as MODEL_PROVIDER (the key GetModel reads).
|
||||
// For hermes, the value is a hermes-native slug like "minimax/MiniMax-M2.7";
|
||||
// for langgraph it's the legacy "provider:model" form. Either way it's just
|
||||
// an opaque string the runtime interprets on its next start.
|
||||
//
|
||||
// Empty string clears the override. Triggers auto-restart so the new
|
||||
// env (HERMES_DEFAULT_MODEL etc.) takes effect immediately — without
|
||||
// this the user clicks Save+Restart, the canvas PUT lands, but the
|
||||
// already-restarting container misses the window and boots with the
|
||||
// old value.
|
||||
func (h *SecretsHandler) SetModel(c *gin.Context) {
|
||||
workspaceID := c.Param("id")
|
||||
if !uuidRegex.MatchString(workspaceID) {
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": "invalid workspace ID"})
|
||||
return
|
||||
}
|
||||
ctx := c.Request.Context()
|
||||
|
||||
var body struct {
|
||||
Model string `json:"model"`
|
||||
}
|
||||
if err := c.ShouldBindJSON(&body); err != nil {
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": "invalid request body"})
|
||||
return
|
||||
}
|
||||
|
||||
if body.Model == "" {
|
||||
if _, err := db.DB.ExecContext(ctx,
|
||||
`DELETE FROM workspace_secrets WHERE workspace_id = $1 AND key = 'MODEL_PROVIDER'`,
|
||||
workspaceID); err != nil {
|
||||
log.Printf("SetModel delete error: %v", err)
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to clear model"})
|
||||
return
|
||||
}
|
||||
if h.restartFunc != nil {
|
||||
go h.restartFunc(workspaceID)
|
||||
}
|
||||
c.JSON(http.StatusOK, gin.H{"status": "cleared"})
|
||||
return
|
||||
}
|
||||
|
||||
encrypted, err := crypto.Encrypt([]byte(body.Model))
|
||||
if err != nil {
|
||||
log.Printf("SetModel encrypt error: %v", err)
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to encrypt model"})
|
||||
return
|
||||
}
|
||||
version := crypto.CurrentEncryptionVersion()
|
||||
_, err = db.DB.ExecContext(ctx, `
|
||||
INSERT INTO workspace_secrets (workspace_id, key, encrypted_value, encryption_version)
|
||||
VALUES ($1, 'MODEL_PROVIDER', $2, $3)
|
||||
ON CONFLICT (workspace_id, key) DO UPDATE
|
||||
SET encrypted_value = $2, encryption_version = $3, updated_at = now()
|
||||
`, workspaceID, encrypted, version)
|
||||
if err != nil {
|
||||
log.Printf("SetModel upsert error: %v", err)
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to save model"})
|
||||
return
|
||||
}
|
||||
|
||||
if h.restartFunc != nil {
|
||||
go h.restartFunc(workspaceID)
|
||||
}
|
||||
c.JSON(http.StatusOK, gin.H{"status": "saved", "model": body.Model})
|
||||
}
|
||||
|
||||
@ -6,6 +6,7 @@ import (
|
||||
"encoding/json"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
@ -535,6 +536,88 @@ func TestSecretsGetModel_DBError(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// ==================== SetModel ====================
|
||||
|
||||
func TestSecretsSetModel_Upsert(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
setupTestRedis(t)
|
||||
restartCalled := make(chan string, 1)
|
||||
handler := NewSecretsHandler(func(id string) { restartCalled <- id })
|
||||
|
||||
mock.ExpectExec(`INSERT INTO workspace_secrets`).
|
||||
WithArgs("00000000-0000-0000-0000-000000000001", sqlmock.AnyArg(), sqlmock.AnyArg()).
|
||||
WillReturnResult(sqlmock.NewResult(1, 1))
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
c.Params = gin.Params{{Key: "id", Value: "00000000-0000-0000-0000-000000000001"}}
|
||||
c.Request = httptest.NewRequest("PUT", "/workspaces/00000000-0000-0000-0000-000000000001/model",
|
||||
strings.NewReader(`{"model":"minimax/MiniMax-M2.7"}`))
|
||||
c.Request.Header.Set("Content-Type", "application/json")
|
||||
|
||||
handler.SetModel(c)
|
||||
|
||||
if w.Code != http.StatusOK {
|
||||
t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String())
|
||||
}
|
||||
select {
|
||||
case id := <-restartCalled:
|
||||
if id != "00000000-0000-0000-0000-000000000001" {
|
||||
t.Errorf("restart called with wrong id: %s", id)
|
||||
}
|
||||
case <-time.After(500 * time.Millisecond):
|
||||
t.Error("restart was not triggered")
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Errorf("unmet sqlmock expectations: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSecretsSetModel_EmptyClears(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
setupTestRedis(t)
|
||||
handler := NewSecretsHandler(func(string) {})
|
||||
|
||||
mock.ExpectExec(`DELETE FROM workspace_secrets`).
|
||||
WithArgs("00000000-0000-0000-0000-000000000002").
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
c.Params = gin.Params{{Key: "id", Value: "00000000-0000-0000-0000-000000000002"}}
|
||||
c.Request = httptest.NewRequest("PUT", "/workspaces/00000000-0000-0000-0000-000000000002/model",
|
||||
strings.NewReader(`{"model":""}`))
|
||||
c.Request.Header.Set("Content-Type", "application/json")
|
||||
|
||||
handler.SetModel(c)
|
||||
|
||||
if w.Code != http.StatusOK {
|
||||
t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String())
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Errorf("unmet sqlmock expectations: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSecretsSetModel_InvalidID(t *testing.T) {
|
||||
setupTestDB(t)
|
||||
setupTestRedis(t)
|
||||
handler := NewSecretsHandler(nil)
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
c.Params = gin.Params{{Key: "id", Value: "not-a-uuid"}}
|
||||
c.Request = httptest.NewRequest("PUT", "/workspaces/not-a-uuid/model",
|
||||
strings.NewReader(`{"model":"x"}`))
|
||||
c.Request.Header.Set("Content-Type", "application/json")
|
||||
|
||||
handler.SetModel(c)
|
||||
|
||||
if w.Code != http.StatusBadRequest {
|
||||
t.Errorf("expected 400 for bad UUID, got %d", w.Code)
|
||||
}
|
||||
}
|
||||
|
||||
// ==================== Values — Phase 30.2 decrypted pull ====================
|
||||
|
||||
// These tests target the secrets.Values handler (GET /workspaces/:id/secrets/values)
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user