Merge pull request #2061 from Molecule-AI/fix/canvas-multilevel-layout-ux
Canvas + platform UX hardening: env preflight, optimistic plugins, dotenv autoload, WS resilience
This commit is contained in:
commit
a8a7aa54b6
@ -1,7 +1,100 @@
|
||||
import type { NextConfig } from "next";
|
||||
import { existsSync, readFileSync } from "node:fs";
|
||||
import { dirname, join } from "node:path";
|
||||
|
||||
// Load NEXT_PUBLIC_* vars from the monorepo root .env so a fresh
|
||||
// `pnpm dev` works without a per-developer canvas/.env.local. Next.js
|
||||
// only auto-loads .env from the project root by default — but our
|
||||
// canonical config (NEXT_PUBLIC_PLATFORM_URL, NEXT_PUBLIC_WS_URL,
|
||||
// MOLECULE_ENV, etc.) lives at the monorepo root, gitignored, shared
|
||||
// by the Go platform binary. Without this, the canvas falls back to
|
||||
// `window.location` (`ws://localhost:3000/ws`) and the WS pill stays
|
||||
// "Reconnecting" forever because Next.js dev doesn't serve /ws.
|
||||
//
|
||||
// Mirrors workspace-server/cmd/server/dotenv.go's monorepo-rooted .env
|
||||
// loader. Both processes look for the SAME marker (`workspace-server/
|
||||
// go.mod`) so a developer renaming or relocating the repo only has to
|
||||
// update one heuristic. Production is unaffected: `output: "standalone"`
|
||||
// bakes resolved env into the build, and the marker file isn't shipped.
|
||||
loadMonorepoEnv();
|
||||
|
||||
const nextConfig: NextConfig = {
|
||||
output: "standalone",
|
||||
};
|
||||
|
||||
export default nextConfig;
|
||||
|
||||
function loadMonorepoEnv() {
|
||||
const root = findMonorepoRoot(__dirname);
|
||||
if (!root) return;
|
||||
const envPath = join(root, ".env");
|
||||
if (!existsSync(envPath)) return;
|
||||
const body = readFileSync(envPath, "utf8");
|
||||
let loaded = 0;
|
||||
let skipped = 0;
|
||||
for (const line of body.split(/\r?\n/)) {
|
||||
const kv = parseLine(line);
|
||||
if (!kv) continue;
|
||||
const [k, v] = kv;
|
||||
// Existing env wins. NOTE: an explicitly-set empty string
|
||||
// (`KEY=` exported from a parent shell, where Node represents it
|
||||
// as `""` not `undefined`) counts as "set" — we keep the empty
|
||||
// value rather than backfilling from the file. Matches Go's
|
||||
// os.LookupEnv check in workspace-server/cmd/server/dotenv.go so
|
||||
// both processes treat the same input identically. Operators who
|
||||
// want the file value to win must `unset KEY` in the launching
|
||||
// shell.
|
||||
if (process.env[k] !== undefined) {
|
||||
skipped++;
|
||||
continue;
|
||||
}
|
||||
process.env[k] = v;
|
||||
loaded++;
|
||||
}
|
||||
// eslint-disable-next-line no-console
|
||||
console.log(
|
||||
`[next.config] loaded ${loaded} vars from ${envPath} (${skipped} already set in env)`,
|
||||
);
|
||||
}
|
||||
|
||||
function findMonorepoRoot(start: string): string | null {
|
||||
let dir = start;
|
||||
for (let i = 0; i < 6; i++) {
|
||||
if (existsSync(join(dir, "workspace-server", "go.mod"))) return dir;
|
||||
const parent = dirname(dir);
|
||||
if (parent === dir) break;
|
||||
dir = parent;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
// Mirror of workspace-server/cmd/server/dotenv.go's parseDotEnvLine
|
||||
// — same rules so the two loaders agree on every line in the shared
|
||||
// .env. If you change one parser, change the other.
|
||||
function parseLine(raw: string): [string, string] | null {
|
||||
let line = raw.replace(/^/, "").trim();
|
||||
if (line === "" || line.startsWith("#")) return null;
|
||||
// `export ` prefix uses a literal space — `export\tFOO=bar` with a
|
||||
// tab is intentionally rejected, matching the Go mirror in
|
||||
// workspace-server/cmd/server/dotenv.go. Shells emit the prefix
|
||||
// with a space; tabs would only appear in hand-mangled files.
|
||||
if (line.startsWith("export ")) line = line.slice("export ".length).trimStart();
|
||||
const eq = line.indexOf("=");
|
||||
if (eq <= 0) return null;
|
||||
const k = line.slice(0, eq).trim();
|
||||
let v = line.slice(eq + 1).replace(/^[ \t]+/, "");
|
||||
if (v.length >= 2 && (v[0] === '"' || v[0] === "'")) {
|
||||
const quote = v[0];
|
||||
const end = v.indexOf(quote, 1);
|
||||
if (end >= 0) return [k, v.slice(1, end)];
|
||||
// unterminated — fall through to bare-value handling
|
||||
}
|
||||
for (let i = 0; i < v.length; i++) {
|
||||
if (v[i] !== "#") continue;
|
||||
if (i === 0 || v[i - 1] === " " || v[i - 1] === "\t") {
|
||||
v = v.slice(0, i);
|
||||
break;
|
||||
}
|
||||
}
|
||||
return [k, v.trim()];
|
||||
}
|
||||
|
||||
@ -1,5 +1,9 @@
|
||||
@import "xterm/css/xterm.css";
|
||||
/* Theme tokens MUST load before any feature stylesheet that
|
||||
references them so custom properties are in scope. */
|
||||
@import "../styles/theme-tokens.css";
|
||||
@import "../styles/settings-panel.css";
|
||||
@import "../styles/org-deploy.css";
|
||||
|
||||
@tailwind base;
|
||||
@tailwind components;
|
||||
@ -38,7 +42,20 @@ body {
|
||||
}
|
||||
|
||||
.react-flow__node {
|
||||
transition: box-shadow 0.2s ease;
|
||||
/* Transform transition drives the "spawn from parent" motion —
|
||||
org-deploy sets the node's initial position to the parent's
|
||||
absolute coords, then repositions to the real slot, and this
|
||||
transition interpolates the translate() in between.
|
||||
Non-deploy workspace moves (drag, nest) get the same smoothing
|
||||
for free. */
|
||||
transition:
|
||||
box-shadow var(--mol-duration-fast) ease,
|
||||
transform var(--mol-duration-spawn) var(--mol-easing-bounce-out);
|
||||
}
|
||||
/* Drag events must feel instant — React Flow adds this class
|
||||
for the lifetime of the gesture. */
|
||||
.react-flow__node.dragging {
|
||||
transition: box-shadow var(--mol-duration-fast) ease;
|
||||
}
|
||||
|
||||
/* Scrollbar styling */
|
||||
|
||||
@ -7,13 +7,19 @@ import { CommunicationOverlay } from "@/components/CommunicationOverlay";
|
||||
import { Spinner } from "@/components/Spinner";
|
||||
import { connectSocket, disconnectSocket } from "@/store/socket";
|
||||
import { useCanvasStore } from "@/store/canvas";
|
||||
import { api } from "@/lib/api";
|
||||
import { api, PlatformUnavailableError } from "@/lib/api";
|
||||
import type { WorkspaceData } from "@/store/socket";
|
||||
|
||||
export default function Home() {
|
||||
const hydrationError = useCanvasStore((s) => s.hydrationError);
|
||||
const setHydrationError = useCanvasStore((s) => s.setHydrationError);
|
||||
const [hydrating, setHydrating] = useState(true);
|
||||
// Distinct from hydrationError: platform-down is its own UX path
|
||||
// (different copy, different action — the user's next step is to
|
||||
// check local services, not to retry the API call). Tracked
|
||||
// separately rather than encoded into hydrationError so the
|
||||
// generic-error branch can stay simple.
|
||||
const [platformDown, setPlatformDown] = useState(false);
|
||||
|
||||
useEffect(() => {
|
||||
connectSocket();
|
||||
@ -28,8 +34,11 @@ export default function Home() {
|
||||
useCanvasStore.getState().setViewport(viewport);
|
||||
}
|
||||
}).catch((err) => {
|
||||
// Initial hydration failed — show error banner to user
|
||||
console.error("Canvas: initial hydration failed", err);
|
||||
if (err instanceof PlatformUnavailableError) {
|
||||
setPlatformDown(true);
|
||||
return;
|
||||
}
|
||||
useCanvasStore.getState().setHydrationError(
|
||||
err instanceof Error && err.message ? err.message : "Failed to load canvas"
|
||||
);
|
||||
@ -53,6 +62,10 @@ export default function Home() {
|
||||
);
|
||||
}
|
||||
|
||||
if (platformDown) {
|
||||
return <PlatformDownDiagnostic />;
|
||||
}
|
||||
|
||||
return (
|
||||
<>
|
||||
<Canvas />
|
||||
@ -83,3 +96,43 @@ export default function Home() {
|
||||
</>
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Dedicated diagnostic for the case where the platform reported its
|
||||
* datastore (Postgres / Redis) is unreachable. Distinct from the
|
||||
* generic API-error overlay: the user's next action is to check
|
||||
* local services, not to retry the API call. Includes the exact
|
||||
* commands for the common dev-host setup.
|
||||
*/
|
||||
function PlatformDownDiagnostic() {
|
||||
return (
|
||||
<div
|
||||
role="alert"
|
||||
className="fixed inset-0 flex flex-col items-center justify-center bg-zinc-950 text-zinc-300 gap-5 z-[9999] px-6"
|
||||
>
|
||||
<div className="text-amber-400 text-sm font-semibold uppercase tracking-wider">
|
||||
Platform infrastructure unreachable
|
||||
</div>
|
||||
<p className="text-zinc-400 text-sm max-w-lg text-center leading-relaxed">
|
||||
The platform server returned <code className="font-mono text-amber-300">503 platform_unavailable</code>.
|
||||
That means it can't reach Postgres or Redis to validate your session.
|
||||
Most common cause on a dev host: one of those services stopped.
|
||||
</p>
|
||||
<div className="bg-zinc-900/80 border border-zinc-700/50 rounded-lg px-4 py-3 max-w-lg w-full">
|
||||
<div className="text-[10px] uppercase tracking-wider text-zinc-500 mb-2">Try first</div>
|
||||
<pre className="text-[12px] text-zinc-300 font-mono whitespace-pre-wrap leading-relaxed">{`brew services start postgresql@14
|
||||
brew services start redis`}</pre>
|
||||
</div>
|
||||
<p className="text-[11px] text-zinc-500 max-w-lg text-center">
|
||||
If both are running, check <code className="font-mono">/tmp/molecule-server.log</code> for
|
||||
the underlying error. If you're on hosted SaaS, this is a platform incident — try again in a moment.
|
||||
</p>
|
||||
<button
|
||||
onClick={() => window.location.reload()}
|
||||
className="px-4 py-2 bg-blue-600 hover:bg-blue-500 text-white rounded-md text-sm mt-2"
|
||||
>
|
||||
Reload
|
||||
</button>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
@ -74,7 +74,11 @@ export function buildA2AEdges(
|
||||
});
|
||||
}
|
||||
|
||||
// 3. Build React Flow Edge objects
|
||||
// 3. Build React Flow Edge objects. We tag every overlay edge with
|
||||
// type: "a2a" so React Flow renders it via our custom A2AEdge
|
||||
// component (canvas/A2AEdge.tsx). The custom component portals
|
||||
// its label out of the SVG layer so it (a) doesn't get hidden
|
||||
// behind workspace cards and (b) is clickable.
|
||||
return Array.from(map.values()).map(({ source, target, count, lastAt }) => {
|
||||
const isHot = now - lastAt < A2A_HOT_MS;
|
||||
const stroke = isHot ? "#8b5cf6" : "#3b82f6"; // violet-500 : blue-500
|
||||
@ -84,6 +88,7 @@ export function buildA2AEdges(
|
||||
|
||||
return {
|
||||
id: `a2a-${source}-${target}`,
|
||||
type: "a2a",
|
||||
source,
|
||||
target,
|
||||
animated: isHot,
|
||||
@ -96,22 +101,22 @@ export function buildA2AEdges(
|
||||
style: {
|
||||
stroke,
|
||||
strokeWidth: 2,
|
||||
// Non-blocking: label overlay never intercepts pointer events
|
||||
// Path itself stays non-interactive so node drags through
|
||||
// the line still work. The clickable target is the label
|
||||
// pill, which sets pointerEvents: all on its own div.
|
||||
pointerEvents: "none" as React.CSSProperties["pointerEvents"],
|
||||
},
|
||||
// `label` keeps the same string for back-compat with any test
|
||||
// that asserts on it (e.g. buildA2AEdges output shape). Custom
|
||||
// edge reads the rich data from `data` so the label visual is
|
||||
// not constrained to a string anymore.
|
||||
label,
|
||||
labelStyle: {
|
||||
fill: "#a1a1aa", // zinc-400
|
||||
fontSize: 10,
|
||||
pointerEvents: "none" as React.CSSProperties["pointerEvents"],
|
||||
data: {
|
||||
count,
|
||||
lastAt,
|
||||
isHot,
|
||||
label,
|
||||
},
|
||||
labelBgStyle: {
|
||||
fill: "#18181b", // zinc-900
|
||||
fillOpacity: 0.9,
|
||||
pointerEvents: "none" as React.CSSProperties["pointerEvents"],
|
||||
},
|
||||
labelBgPadding: [4, 6] as [number, number],
|
||||
labelBgBorderRadius: 4,
|
||||
};
|
||||
});
|
||||
}
|
||||
|
||||
@ -36,11 +36,22 @@ import { DropTargetBadge } from "./canvas/DropTargetBadge";
|
||||
import { useDragHandlers } from "./canvas/useDragHandlers";
|
||||
import { useKeyboardShortcuts } from "./canvas/useKeyboardShortcuts";
|
||||
import { useCanvasViewport } from "./canvas/useCanvasViewport";
|
||||
import { A2AEdge } from "./canvas/A2AEdge";
|
||||
|
||||
const nodeTypes = {
|
||||
workspaceNode: WorkspaceNode,
|
||||
};
|
||||
|
||||
// Custom edge types. The default React Flow edge renders its label
|
||||
// inside the SVG group (always under nodes) with pointerEvents: none
|
||||
// inherited from the path. A2AEdge portals the label to a sibling
|
||||
// DOM layer so it renders above nodes and accepts clicks. Keep the
|
||||
// reference stable (module-scope const) so React Flow doesn't see a
|
||||
// new edgeTypes object on every render and warn about prop churn.
|
||||
const edgeTypes = {
|
||||
a2a: A2AEdge,
|
||||
};
|
||||
|
||||
const defaultEdgeOptions: Partial<Edge> = {
|
||||
animated: true,
|
||||
style: {
|
||||
@ -58,14 +69,95 @@ export function Canvas() {
|
||||
}
|
||||
|
||||
function CanvasInner() {
|
||||
const nodes = useCanvasStore((s) => s.nodes);
|
||||
const rawNodes = useCanvasStore((s) => s.nodes);
|
||||
const edges = useCanvasStore((s) => s.edges);
|
||||
const a2aEdges = useCanvasStore((s) => s.a2aEdges);
|
||||
const showA2AEdges = useCanvasStore((s) => s.showA2AEdges);
|
||||
const deletingIds = useCanvasStore((s) => s.deletingIds);
|
||||
const allEdges = useMemo(
|
||||
() => (showA2AEdges ? [...edges, ...a2aEdges] : edges),
|
||||
[edges, a2aEdges, showA2AEdges],
|
||||
);
|
||||
// Drag-lock during a system-owned operation (deploy OR delete).
|
||||
// React Flow respects Node.draggable, which stops the gesture
|
||||
// before it starts — preventDefault() on the drag-start callback
|
||||
// isn't authoritative in v12. We project `draggable: false` onto
|
||||
// each locked node before handing the array to ReactFlow; the
|
||||
// drag-start handler in useDragHandlers remains as a belt-and-
|
||||
// braces check.
|
||||
//
|
||||
// Perf: short-circuit when nothing is provisioning so the memo
|
||||
// passes rawNodes through unchanged (identity-stable → RF
|
||||
// reconciles nothing). When a deploy IS active, build an O(n)
|
||||
// root index once and re-use it. Critically, do NOT spread every
|
||||
// node — only mutate the locked ones — so unmodified nodes keep
|
||||
// their object identity and RF's per-node memo short-circuits.
|
||||
const nodes = useMemo(() => {
|
||||
const anyProvisioning = rawNodes.some((n) => n.data.status === "provisioning");
|
||||
const anyDeleting = deletingIds.size > 0;
|
||||
if (!anyProvisioning && !anyDeleting) return rawNodes;
|
||||
|
||||
const byId = new Map<string, typeof rawNodes[number]>();
|
||||
for (const n of rawNodes) byId.set(n.id, n);
|
||||
const rootOf = new Map<string, string>();
|
||||
const resolveRoot = (id: string): string => {
|
||||
// Iterative walk guards against a pathological cycle (hostile
|
||||
// data) — recursion would hit the stack limit on a deep tree.
|
||||
const visited = new Set<string>();
|
||||
let cursor: string | null = id;
|
||||
while (cursor) {
|
||||
if (visited.has(cursor)) break;
|
||||
visited.add(cursor);
|
||||
const cached = rootOf.get(cursor);
|
||||
if (cached) {
|
||||
for (const seenId of visited) rootOf.set(seenId, cached);
|
||||
return cached;
|
||||
}
|
||||
const n = byId.get(cursor);
|
||||
if (!n) break;
|
||||
if (!n.data.parentId) {
|
||||
for (const seenId of visited) rootOf.set(seenId, cursor);
|
||||
return cursor;
|
||||
}
|
||||
cursor = n.data.parentId;
|
||||
}
|
||||
return id;
|
||||
};
|
||||
|
||||
const provisioningByRoot = new Map<string, number>();
|
||||
for (const n of rawNodes) {
|
||||
if (n.data.status !== "provisioning") continue;
|
||||
const rootId = resolveRoot(n.id);
|
||||
provisioningByRoot.set(rootId, (provisioningByRoot.get(rootId) ?? 0) + 1);
|
||||
}
|
||||
|
||||
let touched = false;
|
||||
const next = rawNodes.map((n) => {
|
||||
const rootId = resolveRoot(n.id);
|
||||
const deployLocked = n.id !== rootId && (provisioningByRoot.get(rootId) ?? 0) > 0;
|
||||
// Delete-locked: nothing in a subtree whose DELETE is in
|
||||
// flight should be draggable, INCLUDING the root of that
|
||||
// subtree (unlike deploy, there's no cancel — the delete
|
||||
// is irrevocable at this point).
|
||||
const deleteLocked = deletingIds.has(n.id);
|
||||
const shouldLock = deployLocked || deleteLocked;
|
||||
if (shouldLock && n.draggable !== false) {
|
||||
touched = true;
|
||||
return { ...n, draggable: false };
|
||||
}
|
||||
if (!shouldLock && n.draggable === false) {
|
||||
// Node was locked in a prior render; deploy cancelled /
|
||||
// completed, or delete failed and was reverted. Restore
|
||||
// default dragability.
|
||||
touched = true;
|
||||
const { draggable: _d, ...rest } = n;
|
||||
void _d;
|
||||
return rest as typeof n;
|
||||
}
|
||||
return n; // identity-preserved
|
||||
});
|
||||
return touched ? next : rawNodes;
|
||||
}, [rawNodes, deletingIds]);
|
||||
const onNodesChange = useCanvasStore((s) => s.onNodesChange);
|
||||
const selectNode = useCanvasStore((s) => s.selectNode);
|
||||
const selectedNodeId = useCanvasStore((s) => s.selectedNodeId);
|
||||
@ -91,18 +183,45 @@ function CanvasInner() {
|
||||
// outside-click handler.
|
||||
const pendingDelete = useCanvasStore((s) => s.pendingDelete);
|
||||
const setPendingDelete = useCanvasStore((s) => s.setPendingDelete);
|
||||
const removeNode = useCanvasStore((s) => s.removeNode);
|
||||
const removeSubtree = useCanvasStore((s) => s.removeSubtree);
|
||||
const confirmDelete = useCallback(async () => {
|
||||
if (!pendingDelete) return;
|
||||
const { id } = pendingDelete;
|
||||
setPendingDelete(null);
|
||||
// Compute the full subtree and mark it as "deleting" so every
|
||||
// node in the chain renders dim + non-draggable during the
|
||||
// network round-trip + the server-side cascade. Matches the
|
||||
// deploy-lock UX: once a system-initiated operation owns this
|
||||
// subtree, the user shouldn't be able to move its pieces
|
||||
// around until it resolves.
|
||||
const state = useCanvasStore.getState();
|
||||
const subtree = new Set<string>();
|
||||
const stack = [id];
|
||||
while (stack.length) {
|
||||
const nid = stack.pop()!;
|
||||
subtree.add(nid);
|
||||
for (const n of state.nodes) {
|
||||
if (n.data.parentId === nid) stack.push(n.id);
|
||||
}
|
||||
}
|
||||
state.beginDelete(subtree);
|
||||
try {
|
||||
await api.del(`/workspaces/${id}?confirm=true`);
|
||||
removeNode(id);
|
||||
// Mirror the server-side cascade locally — drop the parent AND
|
||||
// every descendant in one atomic update. The per-descendant
|
||||
// WORKSPACE_REMOVED WS events still arrive (and are no-ops
|
||||
// because the nodes are already gone), but we no longer depend
|
||||
// on them: a wedged WS used to leave orphan child cards on the
|
||||
// canvas until the user refreshed the page.
|
||||
removeSubtree(id);
|
||||
state.endDelete(subtree);
|
||||
} catch (e) {
|
||||
// Network or server error — restore the subtree to normal
|
||||
// interaction and surface the error.
|
||||
state.endDelete(subtree);
|
||||
showToast(e instanceof Error ? e.message : "Delete failed", "error");
|
||||
}
|
||||
}, [pendingDelete, setPendingDelete, removeNode]);
|
||||
}, [pendingDelete, setPendingDelete, removeSubtree]);
|
||||
|
||||
const onPaneClick = useCallback(() => {
|
||||
selectNode(null);
|
||||
@ -141,6 +260,7 @@ function CanvasInner() {
|
||||
onPaneClick={onPaneClick}
|
||||
onMoveEnd={onMoveEnd}
|
||||
nodeTypes={nodeTypes}
|
||||
edgeTypes={edgeTypes}
|
||||
defaultEdgeOptions={defaultEdgeOptions}
|
||||
defaultViewport={defaultViewport}
|
||||
fitView={viewport.x === 0 && viewport.y === 0 && viewport.zoom === 1}
|
||||
|
||||
@ -1,27 +1,19 @@
|
||||
"use client";
|
||||
|
||||
import { useState, useEffect } from "react";
|
||||
import { useState, useEffect, useCallback } from "react";
|
||||
import { api } from "@/lib/api";
|
||||
import { useCanvasStore } from "@/store/canvas";
|
||||
import { OrgTemplatesSection } from "./TemplatePalette";
|
||||
import { type Template } from "@/lib/deploy-preflight";
|
||||
import { useTemplateDeploy } from "@/hooks/useTemplateDeploy";
|
||||
import { Spinner } from "./Spinner";
|
||||
import { TIER_CONFIG } from "@/lib/design-tokens";
|
||||
|
||||
interface Template {
|
||||
id: string;
|
||||
name: string;
|
||||
description: string;
|
||||
tier: number;
|
||||
model: string;
|
||||
skills: string[];
|
||||
skill_count: number;
|
||||
}
|
||||
|
||||
export function EmptyState() {
|
||||
const [templates, setTemplates] = useState<Template[]>([]);
|
||||
const [loading, setLoading] = useState(true);
|
||||
const [deploying, setDeploying] = useState<string | null>(null);
|
||||
const [error, setError] = useState<string | null>(null);
|
||||
const [blankCreating, setBlankCreating] = useState(false);
|
||||
const [blankError, setBlankError] = useState<string | null>(null);
|
||||
|
||||
useEffect(() => {
|
||||
api
|
||||
@ -31,48 +23,56 @@ export function EmptyState() {
|
||||
.finally(() => setLoading(false));
|
||||
}, []);
|
||||
|
||||
const deploy = async (template: Template) => {
|
||||
setDeploying(template.id);
|
||||
setError(null);
|
||||
try {
|
||||
const ws = await api.post<{ id: string }>("/workspaces", {
|
||||
name: template.name,
|
||||
template: template.id,
|
||||
tier: template.tier,
|
||||
canvas: { x: 200, y: 150 },
|
||||
});
|
||||
// Auto-select the new workspace and open chat
|
||||
setTimeout(() => {
|
||||
useCanvasStore.getState().selectNode(ws.id);
|
||||
useCanvasStore.getState().setPanelTab("chat");
|
||||
}, 500);
|
||||
} catch (e) {
|
||||
setError(e instanceof Error ? e.message : "Deploy failed");
|
||||
} finally {
|
||||
setDeploying(null);
|
||||
}
|
||||
};
|
||||
// Canvas fills in a visible "center-ish" spot on a fresh tenant so
|
||||
// the user doesn't have to pan to find their new workspace. Fixed
|
||||
// (200, 150) instead of the sidebar's random placement because the
|
||||
// canvas is guaranteed empty when this component mounts.
|
||||
const firstDeployCoords = useCallback(() => ({ x: 200, y: 150 }), []);
|
||||
|
||||
// After the POST succeeds, auto-select the new workspace and flip
|
||||
// the panel to Chat. This is a UX flourish that only makes sense
|
||||
// on first deploy (the canvas is empty so the selection can't
|
||||
// surprise anyone); the sidebar intentionally skips this step.
|
||||
// 500 ms delay so React Flow has a frame to render the new node
|
||||
// before it receives focus.
|
||||
const handleDeployed = useCallback((workspaceId: string) => {
|
||||
setTimeout(() => {
|
||||
useCanvasStore.getState().selectNode(workspaceId);
|
||||
useCanvasStore.getState().setPanelTab("chat");
|
||||
}, 500);
|
||||
}, []);
|
||||
|
||||
const { deploy, deploying, error, modal } = useTemplateDeploy({
|
||||
canvasCoords: firstDeployCoords,
|
||||
onDeployed: handleDeployed,
|
||||
});
|
||||
|
||||
// "Create blank" bypasses templates entirely — no preflight, no
|
||||
// modal, just POST /workspaces with a default name and tier.
|
||||
// Deliberately NOT routed through useTemplateDeploy because it
|
||||
// has no `template.id` to deploy against.
|
||||
const createBlank = async () => {
|
||||
setDeploying("blank");
|
||||
setError(null);
|
||||
setBlankCreating(true);
|
||||
setBlankError(null);
|
||||
try {
|
||||
const ws = await api.post<{ id: string }>("/workspaces", {
|
||||
name: "My First Agent",
|
||||
tier: 2,
|
||||
canvas: { x: 200, y: 150 },
|
||||
canvas: firstDeployCoords(),
|
||||
});
|
||||
setTimeout(() => {
|
||||
useCanvasStore.getState().selectNode(ws.id);
|
||||
useCanvasStore.getState().setPanelTab("chat");
|
||||
}, 500);
|
||||
handleDeployed(ws.id);
|
||||
} catch (e) {
|
||||
setError(e instanceof Error ? e.message : "Create failed");
|
||||
setBlankError(e instanceof Error ? e.message : "Create failed");
|
||||
} finally {
|
||||
setDeploying(null);
|
||||
setBlankCreating(false);
|
||||
}
|
||||
};
|
||||
|
||||
// Any active gesture locks every button so the user can't fire a
|
||||
// second POST while the first is still in flight.
|
||||
const anyDeploying = !!deploying || blankCreating;
|
||||
const displayError = error ?? blankError;
|
||||
|
||||
return (
|
||||
<div className="absolute inset-0 flex items-start justify-center pointer-events-none z-[1] overflow-y-auto py-8">
|
||||
<div className="relative max-w-2xl w-full rounded-3xl border border-zinc-800/70 bg-zinc-950/80 backdrop-blur-xl px-8 py-8 text-center shadow-2xl shadow-black/40 pointer-events-auto mx-4">
|
||||
@ -112,8 +112,8 @@ export function EmptyState() {
|
||||
<button
|
||||
type="button"
|
||||
key={t.id}
|
||||
onClick={() => deploy(t)}
|
||||
disabled={!!deploying}
|
||||
onClick={() => void deploy(t)}
|
||||
disabled={anyDeploying}
|
||||
className="group rounded-xl border border-zinc-800/60 bg-zinc-900/50 px-3.5 py-3 hover:border-blue-500/40 hover:bg-zinc-900/80 transition-all disabled:opacity-50 disabled:cursor-not-allowed disabled:hover:border-zinc-800/60 disabled:hover:bg-zinc-900/50 text-left focus:outline-none focus-visible:ring-2 focus-visible:ring-blue-500/70"
|
||||
>
|
||||
<div className="flex items-center gap-2 mb-1">
|
||||
@ -143,10 +143,10 @@ export function EmptyState() {
|
||||
<button
|
||||
type="button"
|
||||
onClick={createBlank}
|
||||
disabled={!!deploying}
|
||||
disabled={anyDeploying}
|
||||
className="w-full rounded-xl border border-dashed border-zinc-700/60 bg-zinc-900/30 px-4 py-3 text-sm text-zinc-400 hover:text-zinc-200 hover:border-zinc-600 hover:bg-zinc-900/50 transition-all disabled:opacity-50 disabled:cursor-not-allowed disabled:hover:text-zinc-400 disabled:hover:border-zinc-700/60 focus:outline-none focus-visible:ring-2 focus-visible:ring-blue-500/70"
|
||||
>
|
||||
{deploying === "blank" ? "Creating..." : "+ Create blank workspace"}
|
||||
{blankCreating ? "Creating..." : "+ Create blank workspace"}
|
||||
</button>
|
||||
|
||||
{/* Org templates — instantiate a whole team in one click */}
|
||||
@ -154,12 +154,17 @@ export function EmptyState() {
|
||||
<OrgTemplatesSection />
|
||||
</div>
|
||||
|
||||
{error && (
|
||||
{displayError && (
|
||||
<div role="alert" className="mt-3 px-3 py-2 bg-red-950/40 border border-red-800/50 rounded-lg text-xs text-red-400">
|
||||
{error}
|
||||
{displayError}
|
||||
</div>
|
||||
)}
|
||||
|
||||
{/* Missing-keys preflight modal — owned by useTemplateDeploy,
|
||||
shared with TemplatePalette. Rendered inline here so it
|
||||
overlays this card naturally. */}
|
||||
{modal}
|
||||
|
||||
{/* Tips */}
|
||||
<div className="mt-5 pt-4 border-t border-zinc-800/50">
|
||||
<div className="flex items-center justify-center gap-6 text-[10px] text-zinc-400">
|
||||
|
||||
@ -1,19 +1,92 @@
|
||||
"use client";
|
||||
|
||||
import { useEffect, useState } from "react";
|
||||
import { STATUS_CONFIG } from "@/lib/design-tokens";
|
||||
import { useCanvasStore } from "@/store/canvas";
|
||||
|
||||
const LEGEND_STATUSES = ["online", "provisioning", "degraded", "failed", "paused", "offline"] as const;
|
||||
|
||||
// Persist the user's choice across sessions. Default is "open" so
|
||||
// first-time users still see the symbol key; once dismissed we
|
||||
// respect that until they explicitly reopen via the floating pill.
|
||||
const STORAGE_KEY = "molecule.legend.open";
|
||||
|
||||
function readStoredOpen(): boolean {
|
||||
if (typeof window === "undefined") return true;
|
||||
try {
|
||||
const v = window.localStorage.getItem(STORAGE_KEY);
|
||||
if (v === null) return true;
|
||||
return v === "1";
|
||||
} catch {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
function writeStoredOpen(open: boolean) {
|
||||
if (typeof window === "undefined") return;
|
||||
try {
|
||||
window.localStorage.setItem(STORAGE_KEY, open ? "1" : "0");
|
||||
} catch {
|
||||
// localStorage can throw in private mode / quota / disabled
|
||||
// contexts. Silent fallback — the in-memory state still works
|
||||
// for the current session.
|
||||
}
|
||||
}
|
||||
|
||||
export function Legend() {
|
||||
// TemplatePalette (when open) is fixed top-0 left-0 w-[280px] — the
|
||||
// default bottom-6 left-4 position of this legend would sit under it.
|
||||
// Shift past the 280 px palette + a 16 px gap when the palette is open.
|
||||
const paletteOpen = useCanvasStore((s) => s.templatePaletteOpen);
|
||||
const leftClass = paletteOpen ? "left-[296px]" : "left-4";
|
||||
|
||||
// SSR-safe pattern: mount with the default (true) so first paint
|
||||
// matches the server output, then hydrate the persisted value
|
||||
// after mount. Avoids a hydration mismatch warning when the user
|
||||
// had previously closed the legend.
|
||||
const [open, setOpen] = useState(true);
|
||||
useEffect(() => {
|
||||
setOpen(readStoredOpen());
|
||||
}, []);
|
||||
|
||||
const closeLegend = () => {
|
||||
setOpen(false);
|
||||
writeStoredOpen(false);
|
||||
};
|
||||
const openLegend = () => {
|
||||
setOpen(true);
|
||||
writeStoredOpen(true);
|
||||
};
|
||||
|
||||
if (!open) {
|
||||
return (
|
||||
<button
|
||||
type="button"
|
||||
onClick={openLegend}
|
||||
aria-label="Show legend"
|
||||
title="Show legend"
|
||||
className={`fixed bottom-6 ${leftClass} z-30 flex items-center gap-1.5 rounded-full bg-zinc-900/95 border border-zinc-700/50 px-3 py-1.5 text-[11px] font-semibold text-zinc-400 uppercase tracking-wider shadow-xl shadow-black/30 backdrop-blur-sm hover:text-zinc-200 hover:border-zinc-600 transition-[left,colors] duration-200`}
|
||||
>
|
||||
<span aria-hidden="true" className="text-[10px]">ⓘ</span>
|
||||
Legend
|
||||
</button>
|
||||
);
|
||||
}
|
||||
|
||||
return (
|
||||
<div className={`fixed bottom-6 ${leftClass} z-30 bg-zinc-900/95 border border-zinc-700/50 rounded-xl px-4 py-3 shadow-xl shadow-black/30 backdrop-blur-sm max-w-[280px] transition-[left] duration-200`}>
|
||||
<div className="text-[11px] font-semibold text-zinc-400 uppercase tracking-wider mb-2">Legend</div>
|
||||
<div className="flex items-start justify-between mb-2">
|
||||
<div className="text-[11px] font-semibold text-zinc-400 uppercase tracking-wider">Legend</div>
|
||||
<button
|
||||
type="button"
|
||||
onClick={closeLegend}
|
||||
aria-label="Hide legend"
|
||||
title="Hide legend"
|
||||
className="-mt-0.5 -mr-1 px-1.5 text-[14px] leading-none text-zinc-500 hover:text-zinc-200 transition-colors"
|
||||
>
|
||||
×
|
||||
</button>
|
||||
</div>
|
||||
|
||||
{/* Status */}
|
||||
<div className="mb-2">
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
"use client";
|
||||
|
||||
import { useState, useEffect, useCallback, useRef, useMemo } from "react";
|
||||
import { createPortal } from "react-dom";
|
||||
import { api } from "@/lib/api";
|
||||
import { getKeyLabel, type ProviderChoice } from "@/lib/deploy-preflight";
|
||||
|
||||
@ -196,6 +197,12 @@ function ProviderPickerModal({
|
||||
);
|
||||
|
||||
if (!open) return null;
|
||||
// Portal to document.body for the same reason as
|
||||
// OrgImportPreflightModal — several callers (TemplatePalette,
|
||||
// EmptyState) render the modal inside their own fixed+filtered
|
||||
// containers, which re-anchor the "fixed" positioning to the
|
||||
// wrapper's bounds instead of the viewport.
|
||||
if (typeof document === "undefined") return null;
|
||||
|
||||
const allSaved = entries.length > 0 && entries.every((e) => e.saved);
|
||||
const anySaving = entries.some((e) => e.saving);
|
||||
@ -203,8 +210,14 @@ function ProviderPickerModal({
|
||||
.replace(/[-_]/g, " ")
|
||||
.replace(/\b\w/g, (c) => c.toUpperCase());
|
||||
|
||||
return (
|
||||
<div className="fixed inset-0 z-50 flex items-center justify-center">
|
||||
return createPortal(
|
||||
// z-[60] so this stacks ABOVE OrgImportPreflightModal (z-50).
|
||||
// Both can be on screen at once during an org import: the org-
|
||||
// preflight is open while the user clicks a per-workspace deploy
|
||||
// that triggers MissingKeys. Without the explicit z-order the
|
||||
// backdrop click might dismiss the wrong modal depending on
|
||||
// React's commit ordering.
|
||||
<div className="fixed inset-0 z-[60] flex items-center justify-center">
|
||||
<div
|
||||
aria-hidden="true"
|
||||
className="absolute inset-0 bg-black/70 backdrop-blur-sm"
|
||||
@ -215,7 +228,7 @@ function ProviderPickerModal({
|
||||
role="dialog"
|
||||
aria-modal="true"
|
||||
aria-labelledby="missing-keys-title"
|
||||
className="relative bg-zinc-900 border border-zinc-700 rounded-xl shadow-2xl shadow-black/50 max-w-[480px] w-full mx-4 overflow-hidden"
|
||||
className="relative bg-zinc-900 border border-zinc-700 rounded-xl shadow-2xl shadow-black/50 max-w-[480px] w-full mx-4 max-h-[80vh] overflow-auto"
|
||||
>
|
||||
<div className="px-5 py-4 border-b border-zinc-800">
|
||||
<div className="flex items-center gap-2 mb-1">
|
||||
@ -360,7 +373,8 @@ function ProviderPickerModal({
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>,
|
||||
document.body,
|
||||
);
|
||||
}
|
||||
|
||||
@ -474,6 +488,7 @@ function AllKeysModal({
|
||||
}, [open]);
|
||||
|
||||
if (!open) return null;
|
||||
if (typeof document === "undefined") return null;
|
||||
|
||||
const allSaved = entries.length > 0 && entries.every((e) => e.saved);
|
||||
const anySaving = entries.some((e) => e.saving);
|
||||
@ -481,8 +496,14 @@ function AllKeysModal({
|
||||
.replace(/[-_]/g, " ")
|
||||
.replace(/\b\w/g, (c) => c.toUpperCase());
|
||||
|
||||
return (
|
||||
<div className="fixed inset-0 z-50 flex items-center justify-center">
|
||||
return createPortal(
|
||||
// z-[60] so this stacks ABOVE OrgImportPreflightModal (z-50).
|
||||
// Both can be on screen at once during an org import: the org-
|
||||
// preflight is open while the user clicks a per-workspace deploy
|
||||
// that triggers MissingKeys. Without the explicit z-order the
|
||||
// backdrop click might dismiss the wrong modal depending on
|
||||
// React's commit ordering.
|
||||
<div className="fixed inset-0 z-[60] flex items-center justify-center">
|
||||
<div
|
||||
className="absolute inset-0 bg-black/70 backdrop-blur-sm"
|
||||
aria-hidden="true"
|
||||
@ -493,7 +514,7 @@ function AllKeysModal({
|
||||
role="dialog"
|
||||
aria-modal="true"
|
||||
aria-labelledby="missing-keys-title"
|
||||
className="relative bg-zinc-900 border border-zinc-700 rounded-xl shadow-2xl shadow-black/50 max-w-[440px] w-full mx-4 overflow-hidden"
|
||||
className="relative bg-zinc-900 border border-zinc-700 rounded-xl shadow-2xl shadow-black/50 max-w-[440px] w-full mx-4 max-h-[80vh] overflow-auto"
|
||||
>
|
||||
<div className="px-5 py-4 border-b border-zinc-800">
|
||||
<div className="flex items-center gap-2 mb-1">
|
||||
@ -608,6 +629,7 @@ function AllKeysModal({
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>,
|
||||
document.body,
|
||||
);
|
||||
}
|
||||
|
||||
540
canvas/src/components/OrgImportPreflightModal.tsx
Normal file
540
canvas/src/components/OrgImportPreflightModal.tsx
Normal file
@ -0,0 +1,540 @@
|
||||
"use client";
|
||||
|
||||
import { useCallback, useEffect, useMemo, useRef, useState } from "react";
|
||||
import { createPortal } from "react-dom";
|
||||
import { createSecret } from "@/lib/api/secrets";
|
||||
|
||||
/**
 * One entry from the server's preflight `required_env` / `recommended_env`.
 *
 * - A plain string is a STRICT requirement: that exact env var must be
 *   configured.
 * - An `{any_of: [...]}` object is an OR group: at least one member
 *   must be configured to satisfy it. Lets a template say "either
 *   ANTHROPIC_API_KEY or CLAUDE_CODE_OAUTH_TOKEN" without forcing
 *   both.
 *
 * Matches the Go `EnvRequirement` type's JSON shape (MarshalJSON in
 * workspace-server/internal/handlers/org.go). The union is written so
 * that a single narrow check — `typeof e === "string"` — distinguishes
 * the two variants cleanly.
 */
export type EnvRequirement = string | { any_of: string[] };
|
||||
|
||||
/** Flat member list for a requirement. */
|
||||
export function envReqMembers(r: EnvRequirement): string[] {
|
||||
return typeof r === "string" ? [r] : r.any_of;
|
||||
}
|
||||
|
||||
/** True if any member is present in `configured`. */
|
||||
export function envReqSatisfied(r: EnvRequirement, configured: Set<string>): boolean {
|
||||
if (typeof r === "string") return configured.has(r);
|
||||
return r.any_of.some((m) => configured.has(m));
|
||||
}
|
||||
|
||||
/** Stable react-key / dedup key for a requirement. Sorted for groups so
|
||||
* reordered-member variants still collapse to one entry. */
|
||||
export function envReqKey(r: EnvRequirement): string {
|
||||
if (typeof r === "string") return r;
|
||||
return [...r.any_of].sort().join("|");
|
||||
}
|
||||
|
||||
/** Props for {@link OrgImportPreflightModal}. */
interface Props {
  /** Whether the modal is visible. When false the component renders null. */
  open: boolean;
  /** Display name of the org template — headline only. */
  orgName: string;
  /** Total workspace count so the header can read "12 workspaces". */
  workspaceCount: number;
  /** Env vars the server has declared MUST be set as global secrets.
   * Import is disabled until every entry here is configured. Entries
   * are either a single key name or an any-of group. */
  requiredEnv: EnvRequirement[];
  /** Env vars the server suggests — import can proceed without them,
   * but the user sees them listed so they can decide. Same union
   * shape as `requiredEnv`. */
  recommendedEnv: EnvRequirement[];
  /** Names of env vars already configured globally. Used to strike
   * through entries the user has already set up in another
   * session. Passed in rather than queried inside the modal so the
   * parent can refresh after each save without prop-driven effects. */
  configuredKeys: Set<string>;
  /** Called after a successful secret save so the parent can refresh
   * `configuredKeys`. */
  onSecretSaved: () => void;
  /** User clicked Import with all required envs satisfied. */
  onProceed: () => void;
  /** User dismissed the modal. Import is NOT fired. */
  onCancel: () => void;
}
|
||||
|
||||
/** Per-env-var input state, keyed by env var name in the drafts map. */
interface DraftEntry {
  /** Env var name this draft belongs to (mirrors its map key). */
  key: string;
  /** Current input value; cleared after a successful save. */
  value: string;
  /** True while the save request for this row is in flight. */
  saving: boolean;
  /** Last save failure message, or null when the row has no error. */
  error: string | null;
}
|
||||
|
||||
/**
 * OrgImportPreflightModal
 * -----------------------
 * Two-tier env preflight before POST /org/import:
 *
 * - REQUIRED section (red, blocking) — every entry MUST be configured
 *   globally before the Import button enables. Matches the server-
 *   side preflight that would 412 the import anyway.
 *
 * - RECOMMENDED section (yellow, non-blocking) — listed so the user
 *   can add them if they want the full experience, but the Import
 *   button stays enabled regardless.
 *
 * Saving goes to the GLOBAL secrets endpoint (PUT /settings/secrets)
 * because org-level templates deploy shared resources. Per-workspace
 * overrides still work via the Config tab on an individual node
 * after import. The modal does NOT enable Import the moment a key is
 * typed — only after it saves successfully (so a half-entered token
 * can't proceed and then fail at container-start time instead).
 */
export function OrgImportPreflightModal({
  open,
  orgName,
  workspaceCount,
  requiredEnv,
  recommendedEnv,
  configuredKeys,
  onSecretSaved,
  onProceed,
  onCancel,
}: Props) {
  // One DraftEntry per env var name that can appear as an input row.
  const [drafts, setDrafts] = useState<Record<string, DraftEntry>>({});

  // Flatten the union-shaped requirement lists to the set of every key
  // that could ever appear as an input row. Used purely to seed the
  // drafts map — satisfaction semantics still read from the grouped
  // EnvRequirement entries (a group can be satisfied by any one
  // member).
  const allMemberKeys = useMemo(() => {
    const keys: string[] = [];
    for (const r of requiredEnv) keys.push(...envReqMembers(r));
    for (const r of recommendedEnv) keys.push(...envReqMembers(r));
    return keys;
  }, [requiredEnv, recommendedEnv]);

  // Seed a draft entry per declared key the first time the modal
  // opens. Entries persist across `configuredKeys` changes so a mid-
  // save recheck doesn't wipe what the user typed.
  //
  // Dep: derive a STABLE string from the env-name lists rather than
  // the array refs themselves. The parent computes
  // `preflight.org.required_env ?? []`, which produces a fresh []
  // identity on every re-render (e.g. when refreshConfiguredKeys
  // bumps state); depending on the array refs would re-fire the
  // effect on every parent render and mask any future edit that
  // drops the `if (!next[k])` guard as a silent input-reset bug.
  const envKeysSignature = useMemo(
    () => [...allMemberKeys].sort().join("|"),
    [allMemberKeys],
  );
  useEffect(() => {
    if (!open) return;
    setDrafts((prev) => {
      const next = { ...prev };
      for (const k of allMemberKeys) {
        if (!next[k]) {
          next[k] = { key: k, value: "", saving: false, error: null };
        }
      }
      return next;
    });
    // Intentionally keyed on the signature, not allMemberKeys itself —
    // see the comment above the signature memo.
    // eslint-disable-next-line react-hooks/exhaustive-deps
  }, [open, envKeysSignature]);

  // Requirements not yet satisfied by the globally configured keys.
  const missingRequired = useMemo(
    () => requiredEnv.filter((r) => !envReqSatisfied(r, configuredKeys)),
    [requiredEnv, configuredKeys],
  );
  const missingRecommended = useMemo(
    () => recommendedEnv.filter((r) => !envReqSatisfied(r, configuredKeys)),
    [recommendedEnv, configuredKeys],
  );
  // Import is gated only on the REQUIRED tier.
  const canProceed = missingRequired.length === 0;

  // Synchronous in-flight gate. A ref (not state) so two clicks
  // dispatched in the SAME microtask both see the gate flip — state
  // commits don't help here because setState is async. The previous
  // closure-based `current.saving` gate worked under React Testing
  // Library's act() flushing but failed for true microtask-level
  // double-fires (programmatic clicks, dblclick events, Enter-spam
  // before React commits). Set is keyed by env var name so different
  // rows can save concurrently.
  const inFlightRef = useRef<Set<string>>(new Set());

  // Latest-drafts ref so saveOne can read the current input value
  // without taking `drafts` as a useCallback dep — that dep would
  // re-create saveOne on every keystroke and re-bind every Save
  // button's onClick handler, churn that scales with row count.
  const draftsRef = useRef(drafts);
  useEffect(() => {
    draftsRef.current = drafts;
  }, [drafts]);

  /** Save one env var's drafted value as a global secret. No-ops if a
   * save for the same key is already in flight or the input is blank. */
  const saveOne = useCallback(
    async (key: string) => {
      // Microtask-safe gate: claim the slot synchronously BEFORE any
      // await so a second click in the same tick bounces immediately.
      if (inFlightRef.current.has(key)) return;
      const current = draftsRef.current[key];
      if (!current || !current.value.trim()) return;
      inFlightRef.current.add(key);

      const startValue = current.value;
      setDrafts((d) => ({
        ...d,
        [key]: { ...d[key], saving: true, error: null },
      }));
      try {
        await createSecret("global", key, startValue);
        // Clear the input on success — the row flips to "✓ set" once
        // the parent refreshes configuredKeys.
        setDrafts((d) => ({
          ...d,
          [key]: { ...d[key], value: "", saving: false, error: null },
        }));
        // Let the parent refresh configuredKeys so the strike-through
        // updates and canProceed recomputes.
        onSecretSaved();
      } catch (e) {
        setDrafts((d) => ({
          ...d,
          [key]: {
            ...d[key],
            saving: false,
            error: e instanceof Error ? e.message : "Save failed",
          },
        }));
      } finally {
        // Release the gate whether the save succeeded or failed.
        inFlightRef.current.delete(key);
      }
    },
    [onSecretSaved],
  );

  if (!open) return null;

  // Portal the dialog to document.body so it escapes any ancestor
  // containing block. TemplatePalette renders this modal inside a
  // sidebar whose `fixed` container plus backdrop-filter together
  // re-anchor descendants' `position: fixed` to the sidebar's own
  // bounds instead of the viewport — the modal ends up glued to the
  // sidebar's scrollable region and only becomes visible after the
  // user scrolls the sidebar. Portal dodges that class of issue
  // once and for all, regardless of what future wrappers do.
  //
  // SSR-safe guard: `document` is undefined on the server. Since
  // the modal is gated by `if (!open) return null` above, this
  // effectively only runs after open flips true on the client.
  if (typeof document === "undefined") return null;

  return createPortal(
    <div
      role="dialog"
      aria-modal="true"
      aria-labelledby="org-preflight-title"
      className="fixed inset-0 z-50 flex items-center justify-center bg-black/70"
      onClick={onCancel}
    >
      {/* stopPropagation so clicks inside the panel don't hit the
          backdrop's onCancel handler. */}
      <div
        className="w-[560px] max-h-[80vh] overflow-auto rounded-xl bg-zinc-900 border border-zinc-700 shadow-2xl"
        onClick={(e) => e.stopPropagation()}
      >
        <header className="px-5 py-4 border-b border-zinc-800">
          <h2 id="org-preflight-title" className="text-sm font-semibold text-zinc-100">
            Deploy {orgName}
          </h2>
          <p className="mt-0.5 text-[11px] text-zinc-500">
            {workspaceCount} workspace{workspaceCount === 1 ? "" : "s"}.
            Review the credentials needed before import.
          </p>
        </header>

        <section className="p-5 space-y-5">
          {requiredEnv.length > 0 && (
            <EnvList
              tone="required"
              title="Required"
              subtitle="Import is blocked until every key below is saved globally."
              entries={requiredEnv}
              configuredKeys={configuredKeys}
              drafts={drafts}
              onChange={(key, value) =>
                setDrafts((d) => ({ ...d, [key]: { ...d[key], value } }))
              }
              onSave={saveOne}
            />
          )}
          {recommendedEnv.length > 0 && (
            <EnvList
              tone="recommended"
              title="Recommended"
              subtitle="Not required, but some features degrade without them. Add them now for the best experience."
              entries={recommendedEnv}
              configuredKeys={configuredKeys}
              drafts={drafts}
              onChange={(key, value) =>
                setDrafts((d) => ({ ...d, [key]: { ...d[key], value } }))
              }
              onSave={saveOne}
            />
          )}
          {requiredEnv.length === 0 && recommendedEnv.length === 0 && (
            <p className="text-[12px] text-zinc-400">
              No additional credentials required for this template.
            </p>
          )}
        </section>

        <footer className="px-5 py-3 border-t border-zinc-800 flex items-center justify-between">
          <button
            type="button"
            onClick={onCancel}
            className="px-3 py-1.5 text-[11px] rounded bg-zinc-800 hover:bg-zinc-700 text-zinc-300"
          >
            Cancel
          </button>
          <div className="flex items-center gap-2">
            {missingRecommended.length > 0 && canProceed && (
              <span className="text-[10px] text-amber-400/90">
                {missingRecommended.length} recommended key
                {missingRecommended.length === 1 ? "" : "s"} still unset
              </span>
            )}
            <button
              type="button"
              onClick={onProceed}
              disabled={!canProceed}
              className="px-4 py-1.5 text-[11px] font-semibold rounded bg-blue-600 hover:bg-blue-500 text-white disabled:bg-zinc-700 disabled:text-zinc-500 disabled:cursor-not-allowed"
            >
              Import
            </button>
          </div>
        </footer>
      </div>
    </div>,
    document.body,
  );
}
|
||||
|
||||
/** Props for {@link EnvList}. */
interface EnvListProps {
  /** Visual severity: "required" renders red/blocking styling,
   * "recommended" renders amber/advisory styling. */
  tone: "required" | "recommended";
  /** Section heading text. */
  title: string;
  /** One-line explanation rendered under the heading. */
  subtitle: string;
  /** Requirements to render — strict keys and/or any-of groups. */
  entries: EnvRequirement[];
  /** Globally configured secret names, for strike-through state. */
  configuredKeys: Set<string>;
  /** Draft input state keyed by env var name. */
  drafts: Record<string, DraftEntry>;
  /** Input edited: (env var name, new value). */
  onChange: (key: string, value: string) => void;
  /** Save requested for one env var name. */
  onSave: (key: string) => void;
}
|
||||
|
||||
/**
 * One tier of the preflight (Required or Recommended): a tinted card
 * with a heading, a subtitle, and one row per requirement. Strict
 * (string) requirements render as a single StrictEnvRow; any-of
 * groups render as an AnyOfEnvGroup of member rows.
 */
function EnvList({
  tone,
  title,
  subtitle,
  entries,
  configuredKeys,
  drafts,
  onChange,
  onSave,
}: EnvListProps) {
  // Red styling for the blocking tier, amber for the advisory tier.
  const accent =
    tone === "required"
      ? "border-red-800/60 bg-red-950/20"
      : "border-amber-800/50 bg-amber-950/15";
  const headerColor =
    tone === "required" ? "text-red-300" : "text-amber-300";

  return (
    <div className={`rounded-lg border ${accent} p-3`}>
      <h3 className={`text-[11px] font-semibold uppercase tracking-wide ${headerColor}`}>
        {title}
      </h3>
      <p className="mt-0.5 mb-2 text-[10px] text-zinc-400">{subtitle}</p>
      <ul className="space-y-2">
        {entries.map((entry) =>
          typeof entry === "string" ? (
            <StrictEnvRow
              key={envReqKey(entry)}
              envKey={entry}
              configured={configuredKeys.has(entry)}
              draft={drafts[entry]}
              onChange={onChange}
              onSave={onSave}
            />
          ) : (
            <AnyOfEnvGroup
              key={envReqKey(entry)}
              members={entry.any_of}
              configuredKeys={configuredKeys}
              drafts={drafts}
              onChange={onChange}
              onSave={onSave}
            />
          ),
        )}
      </ul>
    </div>
  );
}
|
||||
|
||||
/** Props for {@link StrictEnvRow}. */
interface StrictEnvRowProps {
  /** The env var name this row configures. */
  envKey: string;
  /** True when the key is already set globally — row renders "✓ set". */
  configured: boolean;
  /** Draft input state for this key; may be undefined before seeding. */
  draft: DraftEntry | undefined;
  /** Input edited: (env var name, new value). */
  onChange: (key: string, value: string) => void;
  /** Save requested for this env var name. */
  onSave: (key: string) => void;
}
|
||||
|
||||
function StrictEnvRow({
|
||||
envKey,
|
||||
configured,
|
||||
draft: d,
|
||||
onChange,
|
||||
onSave,
|
||||
}: StrictEnvRowProps) {
|
||||
return (
|
||||
<li className="flex items-center gap-2 rounded bg-zinc-900/70 border border-zinc-800 px-2 py-1.5">
|
||||
<code
|
||||
className={`text-[11px] font-mono flex-1 ${
|
||||
configured ? "text-zinc-500 line-through" : "text-zinc-200"
|
||||
}`}
|
||||
>
|
||||
{envKey}
|
||||
</code>
|
||||
{configured ? (
|
||||
<span className="text-[10px] text-emerald-400">✓ set</span>
|
||||
) : (
|
||||
<>
|
||||
<input
|
||||
type="password"
|
||||
aria-label={`Value for ${envKey}`}
|
||||
placeholder="paste value"
|
||||
value={d?.value ?? ""}
|
||||
onChange={(e) => onChange(envKey, e.target.value)}
|
||||
onKeyDown={(e) => {
|
||||
if (e.key === "Enter") {
|
||||
e.preventDefault();
|
||||
onSave(envKey);
|
||||
}
|
||||
}}
|
||||
disabled={d?.saving}
|
||||
className="flex-1 px-2 py-1 rounded bg-zinc-800 border border-zinc-700 text-[11px] text-zinc-200 focus:outline-none focus:border-blue-500 disabled:opacity-50"
|
||||
/>
|
||||
<button
|
||||
type="button"
|
||||
onClick={() => onSave(envKey)}
|
||||
disabled={d?.saving || !d?.value.trim()}
|
||||
className="px-2 py-1 text-[10px] rounded bg-blue-600 hover:bg-blue-500 text-white disabled:opacity-40 disabled:cursor-not-allowed"
|
||||
>
|
||||
{d?.saving ? "…" : "Save"}
|
||||
</button>
|
||||
</>
|
||||
)}
|
||||
{d?.error && (
|
||||
<span className="text-[9px] text-red-400 basis-full pl-1">
|
||||
{d.error}
|
||||
</span>
|
||||
)}
|
||||
</li>
|
||||
);
|
||||
}
|
||||
|
||||
/** Props for {@link AnyOfEnvGroup}. */
interface AnyOfEnvGroupProps {
  /** Member env var names of the OR group — any one satisfies it. */
  members: string[];
  /** Globally configured secret names, for per-member status. */
  configuredKeys: Set<string>;
  /** Draft input state keyed by env var name. */
  drafts: Record<string, DraftEntry>;
  /** Input edited: (env var name, new value). */
  onChange: (key: string, value: string) => void;
  /** Save requested for one env var name. */
  onSave: (key: string) => void;
}
|
||||
|
||||
/**
|
||||
* Renders an OR group: the user only needs to configure ONE of the
|
||||
* members to satisfy the requirement. Once any member is configured
|
||||
* the group shows a green banner identifying the satisfying key; the
|
||||
* other inputs remain visible but muted so the user can still switch
|
||||
* providers if they want (uncommon but cheap to support).
|
||||
*/
|
||||
function AnyOfEnvGroup({
|
||||
members,
|
||||
configuredKeys,
|
||||
drafts,
|
||||
onChange,
|
||||
onSave,
|
||||
}: AnyOfEnvGroupProps) {
|
||||
const satisfiedBy = members.find((m) => configuredKeys.has(m));
|
||||
return (
|
||||
<li className="rounded border border-zinc-800 bg-zinc-900/50 px-2.5 py-2">
|
||||
<div className="flex items-center justify-between mb-1.5">
|
||||
<span className="text-[10px] uppercase tracking-wide text-zinc-400">
|
||||
Configure any one
|
||||
</span>
|
||||
{satisfiedBy && (
|
||||
<span className="text-[10px] text-emerald-400">
|
||||
✓ using <code className="font-mono">{satisfiedBy}</code>
|
||||
</span>
|
||||
)}
|
||||
</div>
|
||||
<ul className="space-y-1.5">
|
||||
{members.map((m) => {
|
||||
const isConfigured = configuredKeys.has(m);
|
||||
const d = drafts[m];
|
||||
const dimmed = !!satisfiedBy && !isConfigured;
|
||||
return (
|
||||
<li
|
||||
key={m}
|
||||
className={`flex items-center gap-2 rounded bg-zinc-900/70 border border-zinc-800 px-2 py-1 ${
|
||||
dimmed ? "opacity-50" : ""
|
||||
}`}
|
||||
>
|
||||
<code
|
||||
className={`text-[11px] font-mono flex-1 ${
|
||||
isConfigured ? "text-zinc-500 line-through" : "text-zinc-200"
|
||||
}`}
|
||||
>
|
||||
{m}
|
||||
</code>
|
||||
{isConfigured ? (
|
||||
<span className="text-[10px] text-emerald-400">✓ set</span>
|
||||
) : (
|
||||
<>
|
||||
<input
|
||||
type="password"
|
||||
aria-label={`Value for ${m}`}
|
||||
placeholder="paste value"
|
||||
value={d?.value ?? ""}
|
||||
onChange={(e) => onChange(m, e.target.value)}
|
||||
onKeyDown={(e) => {
|
||||
if (e.key === "Enter") {
|
||||
e.preventDefault();
|
||||
onSave(m);
|
||||
}
|
||||
}}
|
||||
disabled={d?.saving}
|
||||
className="flex-1 px-2 py-1 rounded bg-zinc-800 border border-zinc-700 text-[11px] text-zinc-200 focus:outline-none focus:border-blue-500 disabled:opacity-50"
|
||||
/>
|
||||
<button
|
||||
type="button"
|
||||
onClick={() => onSave(m)}
|
||||
disabled={d?.saving || !d?.value.trim()}
|
||||
className="px-2 py-1 text-[10px] rounded bg-blue-600 hover:bg-blue-500 text-white disabled:opacity-40 disabled:cursor-not-allowed"
|
||||
>
|
||||
{d?.saving ? "…" : "Save"}
|
||||
</button>
|
||||
</>
|
||||
)}
|
||||
{d?.error && (
|
||||
<span className="text-[9px] text-red-400 basis-full pl-1">
|
||||
{d.error}
|
||||
</span>
|
||||
)}
|
||||
</li>
|
||||
);
|
||||
})}
|
||||
</ul>
|
||||
</li>
|
||||
);
|
||||
}
|
||||
@ -65,6 +65,12 @@ export function ProvisioningTimeout({
|
||||
// banner even if they stay in provisioning. Cleared when the
|
||||
// workspace leaves provisioning (status changes).
|
||||
const [dismissed, setDismissed] = useState<Set<string>>(new Set());
|
||||
// Watch the live WS health. While it's not "connected", local node
|
||||
// status reflects the last event we received before the drop —
|
||||
// workspaces may have actually transitioned to online minutes ago.
|
||||
// Suppress the banner until WS recovers + rehydrate confirms each
|
||||
// workspace is genuinely still provisioning.
|
||||
const wsStatus = useCanvasStore((s) => s.wsStatus);
|
||||
|
||||
// Subscribe to provisioning nodes — use shallow compare to avoid infinite re-render
|
||||
// (filter+map creates new array reference on every store update).
|
||||
@ -273,8 +279,11 @@ export function ProvisioningTimeout({
|
||||
}, []);
|
||||
|
||||
const visibleTimedOut = useMemo(
|
||||
() => timedOut.filter((e) => !dismissed.has(e.workspaceId)),
|
||||
[timedOut, dismissed],
|
||||
() =>
|
||||
wsStatus === "connected"
|
||||
? timedOut.filter((e) => !dismissed.has(e.workspaceId))
|
||||
: [],
|
||||
[timedOut, dismissed, wsStatus],
|
||||
);
|
||||
|
||||
if (visibleTimedOut.length === 0) return null;
|
||||
|
||||
@ -29,7 +29,7 @@ const TABS: { id: PanelTab; label: string; icon: string }[] = [
|
||||
{ id: "chat", label: "Chat", icon: "◈" },
|
||||
{ id: "activity", label: "Activity", icon: "⊙" },
|
||||
{ id: "details", label: "Details", icon: "◉" },
|
||||
{ id: "skills", label: "Skills", icon: "✦" },
|
||||
{ id: "skills", label: "Plugins", icon: "✦" },
|
||||
{ id: "terminal", label: "Terminal", icon: "▸" },
|
||||
{ id: "config", label: "Config", icon: "⚙" },
|
||||
{ id: "schedule", label: "Schedule", icon: "⏲" },
|
||||
@ -280,7 +280,7 @@ export function SidePanel() {
|
||||
className="flex-1 overflow-y-auto focus:outline-none"
|
||||
>
|
||||
{panelTab === "details" && <DetailsTab key={selectedNodeId} workspaceId={selectedNodeId} data={node.data} />}
|
||||
{panelTab === "skills" && <SkillsTab key={selectedNodeId} data={node.data} />}
|
||||
{panelTab === "skills" && <SkillsTab key={selectedNodeId} workspaceId={selectedNodeId} data={node.data} />}
|
||||
{panelTab === "activity" && <ActivityTab key={selectedNodeId} workspaceId={selectedNodeId} />}
|
||||
{panelTab === "chat" && <ChatTab key={selectedNodeId} workspaceId={selectedNodeId} data={node.data} />}
|
||||
{panelTab === "terminal" && <TerminalTab key={selectedNodeId} workspaceId={selectedNodeId} />}
|
||||
|
||||
@ -1,35 +1,48 @@
|
||||
"use client";
|
||||
|
||||
import { useState, useEffect, useCallback, useRef } from "react";
|
||||
import { flushSync } from "react-dom";
|
||||
import { api } from "@/lib/api";
|
||||
import { useCanvasStore } from "@/store/canvas";
|
||||
import type { WorkspaceData } from "@/store/socket";
|
||||
import { checkDeploySecrets, type PreflightResult, type ModelSpec } from "@/lib/deploy-preflight";
|
||||
import { MissingKeysModal } from "./MissingKeysModal";
|
||||
import { type Template } from "@/lib/deploy-preflight";
|
||||
import { useTemplateDeploy } from "@/hooks/useTemplateDeploy";
|
||||
import {
|
||||
OrgImportPreflightModal,
|
||||
type EnvRequirement,
|
||||
} from "./OrgImportPreflightModal";
|
||||
import { ConfirmDialog } from "./ConfirmDialog";
|
||||
import { Spinner } from "./Spinner";
|
||||
import { showToast } from "./Toaster";
|
||||
import { TIER_CONFIG } from "@/lib/design-tokens";
|
||||
import { listSecrets } from "@/lib/api/secrets";
|
||||
|
||||
interface Template {
|
||||
id: string;
|
||||
name: string;
|
||||
description: string;
|
||||
tier: number;
|
||||
runtime?: string;
|
||||
model: string;
|
||||
models?: ModelSpec[];
|
||||
/** AND-required env vars declared at runtime_config.required_env. */
|
||||
required_env?: string[];
|
||||
skills: string[];
|
||||
skill_count: number;
|
||||
}
|
||||
|
||||
// `Template` type and `resolveRuntime` helper now live in
|
||||
// `@/lib/deploy-preflight` so EmptyState can import the same ones. Was
|
||||
// redeclared here + a narrower redeclaration in EmptyState; the
|
||||
// narrower one dropped `runtime`, `models`, `required_env`, which is
|
||||
// exactly the data the preflight needs. See reviewer's "runtime
|
||||
// fallback drift" note — single source of truth closes the drift.
|
||||
export interface OrgTemplate {
|
||||
dir: string;
|
||||
name: string;
|
||||
description: string;
|
||||
workspaces: number;
|
||||
/** Env vars that MUST be set as global secrets before the org can
|
||||
* import. Server refuses the import with 412 if any are missing;
|
||||
* the canvas preflights against /secrets/list to avoid the round
|
||||
* trip. Aggregated from org-level + every workspace in the tree.
|
||||
*
|
||||
* Each entry is either a key name (strict) or an `{any_of: [...]}`
|
||||
* group (any one of the listed members satisfies the requirement —
|
||||
* e.g. `ANTHROPIC_API_KEY` OR `CLAUDE_CODE_OAUTH_TOKEN`). */
|
||||
required_env?: EnvRequirement[];
|
||||
/** "Nice-to-have" tier. Import proceeds without them but features
|
||||
* may degrade — a channel's webhook posts get dropped, a fallback
|
||||
* LLM isn't available, etc. Surfaced to the user as a non-blocking
|
||||
* warning with an "add now" affordance. Same union shape as
|
||||
* `required_env`. */
|
||||
recommended_env?: EnvRequirement[];
|
||||
}
|
||||
|
||||
/** Fetch the list of org templates from the platform. Returns [] on error
|
||||
@ -91,6 +104,14 @@ export function OrgTemplatesSection() {
|
||||
const [loading, setLoading] = useState(false);
|
||||
const [importing, setImporting] = useState<string | null>(null);
|
||||
const [error, setError] = useState<string | null>(null);
|
||||
// Preflight modal state. `preflight` is non-null when the user
|
||||
// clicked Import on an org with declared required/recommended envs
|
||||
// and we're waiting for them to confirm; null otherwise (direct
|
||||
// import path for orgs with zero env requirements).
|
||||
const [preflight, setPreflight] = useState<{
|
||||
org: OrgTemplate;
|
||||
configuredKeys: Set<string>;
|
||||
} | null>(null);
|
||||
// Collapsed by default — org templates are multi-workspace imports
|
||||
// that most new users don't reach for first. Keeping them
|
||||
// expand-on-demand frees ~400 px of vertical space for the
|
||||
@ -109,21 +130,55 @@ export function OrgTemplatesSection() {
|
||||
loadOrgs();
|
||||
}, [loadOrgs]);
|
||||
|
||||
const handleImport = async (org: OrgTemplate) => {
|
||||
/** Fetch the set of global secret KEYS that are already configured.
|
||||
* Used to strike through already-set entries in the preflight modal
|
||||
* and to decide whether the import needs the modal at all. */
|
||||
const loadConfiguredKeys = useCallback(async (): Promise<Set<string>> => {
|
||||
try {
|
||||
const secrets = await listSecrets("global");
|
||||
return new Set(secrets.map((s) => s.name));
|
||||
} catch {
|
||||
// Secrets endpoint unreachable → assume nothing configured.
|
||||
// The server will refuse the import with 412 and the user
|
||||
// retries; safer than letting the import fly blind.
|
||||
return new Set();
|
||||
}
|
||||
}, []);
|
||||
|
||||
/** Actually run the import. Split out so both the "no preflight
|
||||
* needed" fast path and the "preflight modal approved" path can
|
||||
* share the fetch + hydrate + toast sequence. */
|
||||
const doImport = useCallback(async (org: OrgTemplate) => {
|
||||
setImporting(org.dir);
|
||||
setError(null);
|
||||
try {
|
||||
await importOrgTemplate(org.dir);
|
||||
// Refresh canvas inline — the WebSocket may be offline, in which case
|
||||
// WORKSPACE_PROVISIONING broadcasts never arrive and the user sees
|
||||
// no change from clicking "Import org". A direct fetch guarantees
|
||||
// the new workspaces land on canvas regardless of WS state.
|
||||
try {
|
||||
const workspaces = await api.get<WorkspaceData[]>("/workspaces");
|
||||
useCanvasStore.getState().hydrate(workspaces);
|
||||
} catch {
|
||||
// Rehydrate failure is non-fatal; WS (if alive) or the next
|
||||
// health-check cycle will eventually pick the new workspaces up.
|
||||
// Hydrate is the safety net for the "WS is offline" case —
|
||||
// without live events the canvas stays empty. But calling it
|
||||
// immediately wipes the org-deploy animation (hydrate rebuilds
|
||||
// the node array from scratch, dropping the spawn / shimmer
|
||||
// classes and position tweens). So:
|
||||
// 1. If the number of nodes on the canvas already matches
|
||||
// (or exceeds) the template's workspace count, WS
|
||||
// delivered everything — skip hydrate.
|
||||
// 2. Otherwise, wait a short window to let any in-flight WS
|
||||
// events land, then hydrate only if still behind.
|
||||
const expectedCount = org.workspaces;
|
||||
// Nodes transition through WORKSPACE_REMOVED which physically
|
||||
// drops them from the store — there is no "removed" status in
|
||||
// WorkspaceNodeData — so a simple length check is enough here.
|
||||
const hasAll = () => useCanvasStore.getState().nodes.length >= expectedCount;
|
||||
if (!hasAll()) {
|
||||
await new Promise((r) => setTimeout(r, 1500));
|
||||
}
|
||||
if (!hasAll()) {
|
||||
try {
|
||||
const workspaces = await api.get<WorkspaceData[]>("/workspaces");
|
||||
useCanvasStore.getState().hydrate(workspaces);
|
||||
} catch {
|
||||
// WS (if alive) or the next health-check cycle will
|
||||
// eventually pick the new workspaces up.
|
||||
}
|
||||
}
|
||||
showToast(`Imported "${org.name || org.dir}" (${org.workspaces} workspaces)`, "success");
|
||||
} catch (e) {
|
||||
@ -133,7 +188,45 @@ export function OrgTemplatesSection() {
|
||||
} finally {
|
||||
setImporting(null);
|
||||
}
|
||||
};
|
||||
}, []);
|
||||
|
||||
/** Entry point for the Import button. Two paths:
|
||||
*
|
||||
* 1. No env declared by the template (required_env + recommended_env
|
||||
* both empty) → fire doImport directly. Matches the pre-preflight
|
||||
* behaviour for existing templates.
|
||||
*
|
||||
* 2. Any env declared → load the configured-keys set and open the
|
||||
* preflight modal. doImport runs only when the user clicks
|
||||
* Import inside the modal, which is gated to "required envs all
|
||||
* configured" by the modal itself. */
|
||||
const handleImport = useCallback(async (org: OrgTemplate) => {
|
||||
const hasEnvDeclarations =
|
||||
(org.required_env && org.required_env.length > 0) ||
|
||||
(org.recommended_env && org.recommended_env.length > 0);
|
||||
if (!hasEnvDeclarations) {
|
||||
void doImport(org);
|
||||
return;
|
||||
}
|
||||
// Flip the button to its "Importing…" state while the secrets
|
||||
// lookup runs — on a tenant with 500+ global secrets the round
|
||||
// trip can be > 200 ms and the user otherwise gets zero visual
|
||||
// feedback after clicking. Cleared on modal close / error.
|
||||
setImporting(org.dir);
|
||||
try {
|
||||
const configuredKeys = await loadConfiguredKeys();
|
||||
setPreflight({ org, configuredKeys });
|
||||
} finally {
|
||||
setImporting(null);
|
||||
}
|
||||
}, [doImport, loadConfiguredKeys]);
|
||||
|
||||
/** Called by the preflight modal after a successful key save so the
|
||||
* strike-through re-renders and canProceed recomputes. */
|
||||
const refreshConfiguredKeys = useCallback(async () => {
|
||||
const keys = await loadConfiguredKeys();
|
||||
setPreflight((prev) => (prev ? { ...prev, configuredKeys: keys } : prev));
|
||||
}, [loadConfiguredKeys]);
|
||||
|
||||
return (
|
||||
<div className="space-y-2" data-testid="org-templates-section">
|
||||
@ -222,6 +315,35 @@ export function OrgTemplatesSection() {
|
||||
})}
|
||||
</div>
|
||||
)}
|
||||
|
||||
{preflight && (
|
||||
<OrgImportPreflightModal
|
||||
open
|
||||
orgName={preflight.org.name || preflight.org.dir}
|
||||
workspaceCount={preflight.org.workspaces}
|
||||
requiredEnv={preflight.org.required_env ?? []}
|
||||
recommendedEnv={preflight.org.recommended_env ?? []}
|
||||
configuredKeys={preflight.configuredKeys}
|
||||
onSecretSaved={refreshConfiguredKeys}
|
||||
onProceed={() => {
|
||||
const org = preflight.org;
|
||||
// flushSync guarantees the modal unmounts BEFORE we kick
|
||||
// off the import network call. Without it, React batches
|
||||
// setPreflight(null) with the setImporting(...) from
|
||||
// doImport's synchronous prefix, both commit at the end
|
||||
// of this handler, AND the await import() POST may yield
|
||||
// a microtask before React schedules the paint. Net
|
||||
// effect: the modal backdrop sat over the canvas during
|
||||
// the first wave of WORKSPACE_PROVISIONING WS events,
|
||||
// hiding the spawn animation. Force the close to land
|
||||
// first so the user sees the canvas reveal + agents
|
||||
// popping into place.
|
||||
flushSync(() => setPreflight(null));
|
||||
void doImport(org);
|
||||
}}
|
||||
onCancel={() => setPreflight(null)}
|
||||
/>
|
||||
)}
|
||||
</div>
|
||||
);
|
||||
}
|
||||
@ -319,14 +441,6 @@ export function TemplatePalette() {
|
||||
|
||||
const [templates, setTemplates] = useState<Template[]>([]);
|
||||
const [loading, setLoading] = useState(false);
|
||||
const [creating, setCreating] = useState<string | null>(null);
|
||||
const [error, setError] = useState<string | null>(null);
|
||||
|
||||
// Missing keys modal state
|
||||
const [missingKeysInfo, setMissingKeysInfo] = useState<{
|
||||
template: Template;
|
||||
preflight: PreflightResult;
|
||||
} | null>(null);
|
||||
|
||||
const loadTemplates = useCallback(async () => {
|
||||
setLoading(true);
|
||||
@ -344,65 +458,15 @@ export function TemplatePalette() {
|
||||
if (open) loadTemplates();
|
||||
}, [open, loadTemplates]);
|
||||
|
||||
/** Resolve runtime from template ID (e.g., "langgraph", "claude-code-default" → "claude-code") */
|
||||
const resolveRuntime = (templateId: string): string => {
|
||||
const runtimeMap: Record<string, string> = {
|
||||
langgraph: "langgraph",
|
||||
"claude-code-default": "claude-code",
|
||||
openclaw: "openclaw",
|
||||
deepagents: "deepagents",
|
||||
crewai: "crewai",
|
||||
autogen: "autogen",
|
||||
};
|
||||
return runtimeMap[templateId] ?? templateId.replace(/-default$/, "");
|
||||
};
|
||||
|
||||
/** Actually execute the deploy API call */
|
||||
const executeDeploy = useCallback(async (template: Template) => {
|
||||
setCreating(template.id);
|
||||
setError(null);
|
||||
try {
|
||||
await api.post("/workspaces", {
|
||||
name: template.name,
|
||||
template: template.id,
|
||||
tier: template.tier,
|
||||
canvas: {
|
||||
x: Math.random() * 400 + 100,
|
||||
y: Math.random() * 300 + 100,
|
||||
},
|
||||
});
|
||||
setCreating(null);
|
||||
} catch (e) {
|
||||
setError(e instanceof Error ? e.message : "Failed to deploy");
|
||||
setCreating(null);
|
||||
}
|
||||
}, []);
|
||||
|
||||
/** Pre-deploy check: validate secrets before deploying */
|
||||
const handleDeploy = async (template: Template) => {
|
||||
setCreating(template.id);
|
||||
setError(null);
|
||||
|
||||
// Prefer the runtime the Go /templates endpoint returned verbatim —
|
||||
// resolveRuntime() is a legacy id→runtime fallback for installs whose
|
||||
// template summary predates the `runtime` field.
|
||||
const runtime = template.runtime ?? resolveRuntime(template.id);
|
||||
const preflight = await checkDeploySecrets({
|
||||
runtime,
|
||||
models: template.models,
|
||||
required_env: template.required_env,
|
||||
});
|
||||
|
||||
if (!preflight.ok) {
|
||||
// Missing keys — show the modal instead of deploying
|
||||
setMissingKeysInfo({ template, preflight });
|
||||
setCreating(null);
|
||||
return;
|
||||
}
|
||||
|
||||
// All keys present — deploy directly
|
||||
await executeDeploy(template);
|
||||
};
|
||||
// Preflight + POST + modal wiring moved into useTemplateDeploy so
|
||||
// this component and EmptyState use one implementation. The sidebar
|
||||
// uses the hook's default random canvas placement (no override) —
|
||||
// an already-populated canvas shouldn't have new deploys stacking on
|
||||
// a single fixed point. No post-deploy side effect either: the
|
||||
// palette is operator-triggered, so auto-selecting would yank
|
||||
// focus off whatever the user was already looking at.
|
||||
const { deploy: handleDeploy, deploying: creating, error, modal } =
|
||||
useTemplateDeploy();
|
||||
|
||||
return (
|
||||
<>
|
||||
@ -426,21 +490,9 @@ export function TemplatePalette() {
|
||||
</svg>
|
||||
</button>
|
||||
|
||||
{/* Missing Keys Modal */}
|
||||
<MissingKeysModal
|
||||
open={!!missingKeysInfo}
|
||||
missingKeys={missingKeysInfo?.preflight.missingKeys ?? []}
|
||||
providers={missingKeysInfo?.preflight.providers ?? []}
|
||||
runtime={missingKeysInfo?.preflight.runtime ?? ""}
|
||||
onKeysAdded={() => {
|
||||
if (missingKeysInfo) {
|
||||
const template = missingKeysInfo.template;
|
||||
setMissingKeysInfo(null);
|
||||
executeDeploy(template);
|
||||
}
|
||||
}}
|
||||
onCancel={() => setMissingKeysInfo(null)}
|
||||
/>
|
||||
{/* Missing-keys modal — rendered by the shared hook. Same
|
||||
instance shape used by EmptyState. */}
|
||||
{modal}
|
||||
|
||||
{/* Sidebar */}
|
||||
{open && (
|
||||
@ -483,7 +535,7 @@ export function TemplatePalette() {
|
||||
<button
|
||||
type="button"
|
||||
key={t.id}
|
||||
onClick={() => handleDeploy(t)}
|
||||
onClick={() => void handleDeploy(t)}
|
||||
disabled={isDeploying}
|
||||
className="w-full text-left bg-zinc-800/40 hover:bg-zinc-800/70 border border-zinc-700/40 hover:border-zinc-600/50 rounded-xl p-3 transition-all disabled:opacity-50 disabled:cursor-not-allowed disabled:hover:bg-zinc-800/40 disabled:hover:border-zinc-700/40 group focus:outline-none focus-visible:ring-2 focus-visible:ring-blue-500/70"
|
||||
>
|
||||
|
||||
@ -6,6 +6,8 @@ import { useCanvasStore, type WorkspaceNodeData } from "@/store/canvas";
|
||||
import { showToast } from "@/components/Toaster";
|
||||
import { Tooltip } from "@/components/Tooltip";
|
||||
import { STATUS_CONFIG, TIER_CONFIG } from "@/lib/design-tokens";
|
||||
import { useOrgDeployState } from "@/components/canvas/useOrgDeployState";
|
||||
import { OrgCancelButton } from "@/components/canvas/OrgCancelButton";
|
||||
|
||||
/** Descendant count for the "N sub" badge — children are first-class nodes
|
||||
* rendered as full cards inside this one via React Flow's native parentId,
|
||||
@ -35,6 +37,10 @@ function EjectIcon(props: React.SVGProps<SVGSVGElement>) {
|
||||
export function WorkspaceNode({ id, data }: NodeProps<Node<WorkspaceNodeData>>) {
|
||||
const statusCfg = STATUS_CONFIG[data.status] || STATUS_CONFIG.offline;
|
||||
const tierCfg = TIER_CONFIG[data.tier] || { label: `T${data.tier}`, color: "text-zinc-500 bg-zinc-800" };
|
||||
// Org-deploy context — four derived flags off one store subscription.
|
||||
// Drives the shimmer while provisioning, the dimmed/non-draggable
|
||||
// treatment on locked descendants, and the Cancel pill on the root.
|
||||
const deploy = useOrgDeployState(id);
|
||||
const selectedNodeId = useCanvasStore((s) => s.selectedNodeId);
|
||||
const selectNode = useCanvasStore((s) => s.selectNode);
|
||||
const openContextMenu = useCanvasStore((s) => s.openContextMenu);
|
||||
@ -138,8 +144,21 @@ export function WorkspaceNode({ id, data }: NodeProps<Node<WorkspaceNodeData>>)
|
||||
}
|
||||
backdrop-blur-sm
|
||||
focus:outline-none focus-visible:ring-2 focus-visible:ring-blue-500/70 focus-visible:ring-offset-1 focus-visible:ring-offset-zinc-950
|
||||
${deploy.isActivelyProvisioning ? "mol-deploy-shimmer" : ""}
|
||||
${deploy.isLockedChild ? "mol-deploy-locked" : ""}
|
||||
`}
|
||||
>
|
||||
{/* Cancel-deployment pill — rendered on the root of a deploying
|
||||
org only. Positioned absolute inside the card so it moves
|
||||
with drag; class="nodrag" on the button stops React Flow
|
||||
from treating clicks as a drag start. */}
|
||||
{deploy.isDeployingRoot && (
|
||||
<OrgCancelButton
|
||||
rootId={id}
|
||||
rootName={data.name}
|
||||
workspaceCount={deploy.descendantProvisioningCount}
|
||||
/>
|
||||
)}
|
||||
{/* Status gradient bar at top */}
|
||||
<div className={`absolute inset-x-0 top-0 h-8 bg-gradient-to-b ${statusCfg.bar} pointer-events-none`} />
|
||||
|
||||
|
||||
@ -175,9 +175,28 @@ describe("buildA2AEdges — edge properties", () => {
|
||||
expect((edge.style as React.CSSProperties).pointerEvents).toBe("none");
|
||||
});
|
||||
|
||||
it("sets pointerEvents: 'none' on labelStyle", () => {
|
||||
it("tags the edge as type=a2a so React Flow renders the custom A2AEdge component", () => {
|
||||
// The custom edge portals labels above the node layer and makes
|
||||
// them clickable. Without type=a2a, RF falls back to the default
|
||||
// edge whose label sits in the SVG group (hidden under nodes,
|
||||
// pointerEvents:none). Regression guard for the hidden-label /
|
||||
// unclickable-label bug observed 2026-04-25.
|
||||
const [edge] = buildA2AEdges([makeRow()], NOW);
|
||||
expect((edge.labelStyle as React.CSSProperties).pointerEvents).toBe("none");
|
||||
expect(edge.type).toBe("a2a");
|
||||
});
|
||||
|
||||
it("populates edge.data with the fields the custom edge component reads", () => {
|
||||
// A2AEdge reads count, lastAt, isHot, label from edge.data so the
|
||||
// shape upstream must keep emitting them. A future buildA2AEdges
|
||||
// refactor that drops any of these silently breaks the rendered
|
||||
// pill (label disappears, hot/warm color swap fails, click handler
|
||||
// can still fire but the label text vanishes).
|
||||
const [edge] = buildA2AEdges([makeRow()], NOW);
|
||||
const data = edge.data as Record<string, unknown>;
|
||||
expect(data.count).toBe(1);
|
||||
expect(typeof data.lastAt).toBe("number");
|
||||
expect(typeof data.isHot).toBe("boolean");
|
||||
expect(data.label).toMatch(/^1 call ·/);
|
||||
});
|
||||
|
||||
it("label uses singular 'call' for count === 1", () => {
|
||||
|
||||
@ -72,6 +72,7 @@ const mockStoreState = {
|
||||
selectedNodeIds: new Set<string>(),
|
||||
clearSelection: vi.fn(),
|
||||
toggleNodeSelection: vi.fn(),
|
||||
deletingIds: new Set<string>(),
|
||||
};
|
||||
|
||||
vi.mock("@/store/canvas", () => ({
|
||||
|
||||
@ -16,7 +16,9 @@ afterEach(() => {
|
||||
// ── Shared fitView spy — must be set up before vi.mock hoisting ──────────────
|
||||
const mockFitView = vi.fn();
|
||||
const mockFitBounds = vi.fn();
|
||||
const mockGetIntersectingNodes = vi.fn(() => []);
|
||||
const mockGetIntersectingNodes = vi.fn(
|
||||
(): Array<{ id: string; position: { x: number; y: number } }> => [],
|
||||
);
|
||||
|
||||
vi.mock("@xyflow/react", () => {
|
||||
const ReactFlow = ({
|
||||
@ -83,6 +85,12 @@ const mockStoreState = {
|
||||
selectedNodeIds: new Set<string>(),
|
||||
clearSelection: vi.fn(),
|
||||
toggleNodeSelection: vi.fn(),
|
||||
// Cascade-delete / deploy animation state (added in the multilevel-
|
||||
// layout-UX bundle). Canvas.tsx reads deletingIds.size to decide
|
||||
// whether to apply the "locked during delete" class on each node;
|
||||
// an empty Set mirrors the idle canvas and doesn't interact with
|
||||
// any pan/fit behaviour under test here.
|
||||
deletingIds: new Set<string>(),
|
||||
};
|
||||
|
||||
vi.mock("@/store/canvas", () => ({
|
||||
|
||||
225
canvas/src/components/__tests__/OrgImportPreflightModal.test.tsx
Normal file
225
canvas/src/components/__tests__/OrgImportPreflightModal.test.tsx
Normal file
@ -0,0 +1,225 @@
|
||||
// @vitest-environment jsdom
|
||||
import { describe, it, expect, vi, beforeEach, afterEach } from "vitest";
|
||||
import { render, screen, fireEvent, cleanup, waitFor } from "@testing-library/react";
|
||||
|
||||
// Regression tests for the OrgImportPreflightModal's save path and
|
||||
// any-of group rendering. Guards two specific bugs caught in the
|
||||
// UX A/B Lab rollout (2026-04-24):
|
||||
//
|
||||
// 1. saveOne early-returned because it tried to read a local
|
||||
// `startValue` reassigned inside a functional setDrafts
|
||||
// updater. React did not always evaluate the updater
|
||||
// synchronously, so the gate read "" and bailed while
|
||||
// `saving:true` committed at next render, wedging the
|
||||
// button on "…" without ever calling createSecret.
|
||||
//
|
||||
// 2. Double-click / Enter-spam could race past the disabled-
|
||||
// button UI gate, firing createSecret twice. The production
|
||||
// endpoint is idempotent so no data hazard, but the extra
|
||||
// PUT is wasteful and harder to reason about.
|
||||
|
||||
const createSecretMock = vi.fn().mockResolvedValue(undefined);
|
||||
|
||||
vi.mock("@/lib/api/secrets", () => ({
|
||||
createSecret: (...args: unknown[]) => createSecretMock(...args),
|
||||
}));
|
||||
|
||||
import { OrgImportPreflightModal } from "../OrgImportPreflightModal";
|
||||
|
||||
beforeEach(() => {
|
||||
createSecretMock.mockClear();
|
||||
createSecretMock.mockResolvedValue(undefined);
|
||||
});
|
||||
|
||||
afterEach(() => {
|
||||
cleanup();
|
||||
});
|
||||
|
||||
describe("OrgImportPreflightModal — saveOne", () => {
|
||||
it("calls createSecret exactly once when Save is clicked on an any-of member", async () => {
|
||||
render(
|
||||
<OrgImportPreflightModal
|
||||
open
|
||||
orgName="UX A/B Lab"
|
||||
workspaceCount={7}
|
||||
requiredEnv={[{ any_of: ["ANTHROPIC_API_KEY", "CLAUDE_CODE_OAUTH_TOKEN"] }]}
|
||||
recommendedEnv={[]}
|
||||
configuredKeys={new Set()}
|
||||
onSecretSaved={() => {}}
|
||||
onProceed={() => {}}
|
||||
onCancel={() => {}}
|
||||
/>,
|
||||
);
|
||||
|
||||
// Both any-of members render their own input + Save.
|
||||
const input = screen.getByLabelText(/Value for ANTHROPIC_API_KEY/i);
|
||||
fireEvent.change(input, { target: { value: "test-secret-value" } });
|
||||
|
||||
// The Save button adjacent to the changed input.
|
||||
const saveButtons = screen
|
||||
.getAllByRole("button")
|
||||
.filter((b) => b.textContent === "Save");
|
||||
// Two saves on screen (one per any-of member). First is ANTHROPIC.
|
||||
fireEvent.click(saveButtons[0]);
|
||||
|
||||
await waitFor(() => {
|
||||
expect(createSecretMock).toHaveBeenCalledTimes(1);
|
||||
});
|
||||
expect(createSecretMock).toHaveBeenCalledWith(
|
||||
"global",
|
||||
"ANTHROPIC_API_KEY",
|
||||
"test-secret-value",
|
||||
);
|
||||
});
|
||||
|
||||
it("synchronous double-click on Save fires createSecret exactly once", async () => {
|
||||
// Pause the first save so we can fire a second click while the
|
||||
// first is still mid-await. The two clicks happen in the SAME
|
||||
// tick — fireEvent runs synchronously through React's event
|
||||
// system — so any guard that depends on a committed setState
|
||||
// (e.g. `disabled={drafts[key].saving}` or a closure read of
|
||||
// `drafts[key].saving`) loses the race: the second click sees
|
||||
// saving=false because React hasn't committed yet. The fix is
|
||||
// a useRef-based gate that flips synchronously before any await.
|
||||
let resolveCreate!: () => void;
|
||||
createSecretMock.mockImplementationOnce(
|
||||
() => new Promise<void>((resolve) => {
|
||||
resolveCreate = resolve;
|
||||
}),
|
||||
);
|
||||
|
||||
render(
|
||||
<OrgImportPreflightModal
|
||||
open
|
||||
orgName="UX A/B Lab"
|
||||
workspaceCount={7}
|
||||
requiredEnv={[{ any_of: ["ANTHROPIC_API_KEY", "CLAUDE_CODE_OAUTH_TOKEN"] }]}
|
||||
recommendedEnv={[]}
|
||||
configuredKeys={new Set()}
|
||||
onSecretSaved={() => {}}
|
||||
onProceed={() => {}}
|
||||
onCancel={() => {}}
|
||||
/>,
|
||||
);
|
||||
|
||||
const input = screen.getByLabelText(/Value for ANTHROPIC_API_KEY/i);
|
||||
fireEvent.change(input, { target: { value: "test-secret-value" } });
|
||||
|
||||
const saveButtons = screen
|
||||
.getAllByRole("button")
|
||||
.filter((b) => b.textContent === "Save");
|
||||
// Pull the React-bound onClick once so both invocations close
|
||||
// over the SAME callback — simulates a double-fire that happens
|
||||
// before React reconciles between events. Without this, RTL
|
||||
// flushes act() between fireEvent calls and the second click
|
||||
// sees the post-commit state.
|
||||
const saveBtn = saveButtons[0] as HTMLButtonElement;
|
||||
saveBtn.click();
|
||||
saveBtn.click();
|
||||
|
||||
// Give React a tick to process any queued state updates.
|
||||
await waitFor(() => {
|
||||
expect(createSecretMock).toHaveBeenCalledTimes(1);
|
||||
});
|
||||
|
||||
resolveCreate();
|
||||
await waitFor(() => {
|
||||
// Post-save count must remain at exactly one.
|
||||
expect(createSecretMock).toHaveBeenCalledTimes(1);
|
||||
});
|
||||
});
|
||||
|
||||
it("does not call createSecret when value is empty", async () => {
|
||||
render(
|
||||
<OrgImportPreflightModal
|
||||
open
|
||||
orgName="UX A/B Lab"
|
||||
workspaceCount={7}
|
||||
requiredEnv={[{ any_of: ["ANTHROPIC_API_KEY", "CLAUDE_CODE_OAUTH_TOKEN"] }]}
|
||||
recommendedEnv={[]}
|
||||
configuredKeys={new Set()}
|
||||
onSecretSaved={() => {}}
|
||||
onProceed={() => {}}
|
||||
onCancel={() => {}}
|
||||
/>,
|
||||
);
|
||||
|
||||
// Button is disabled when value is empty — clicking a disabled
|
||||
// button still dispatches onClick in RTL (since fireEvent
|
||||
// bypasses the disabled attribute), so this asserts the code-
|
||||
// level gate catches it, not just the UI.
|
||||
const saveButtons = screen
|
||||
.getAllByRole("button")
|
||||
.filter((b) => b.textContent === "Save");
|
||||
fireEvent.click(saveButtons[0]);
|
||||
|
||||
// Small async wait to let any state updates settle.
|
||||
await new Promise((r) => setTimeout(r, 50));
|
||||
expect(createSecretMock).not.toHaveBeenCalled();
|
||||
});
|
||||
});
|
||||
|
||||
describe("OrgImportPreflightModal — any-of rendering", () => {
|
||||
it("renders each any-of member as a separate input row", () => {
|
||||
render(
|
||||
<OrgImportPreflightModal
|
||||
open
|
||||
orgName="UX A/B Lab"
|
||||
workspaceCount={7}
|
||||
requiredEnv={[{ any_of: ["ANTHROPIC_API_KEY", "CLAUDE_CODE_OAUTH_TOKEN"] }]}
|
||||
recommendedEnv={[]}
|
||||
configuredKeys={new Set()}
|
||||
onSecretSaved={() => {}}
|
||||
onProceed={() => {}}
|
||||
onCancel={() => {}}
|
||||
/>,
|
||||
);
|
||||
|
||||
expect(screen.getByText("Configure any one")).toBeTruthy();
|
||||
expect(screen.getByLabelText(/Value for ANTHROPIC_API_KEY/i)).toBeTruthy();
|
||||
expect(screen.getByLabelText(/Value for CLAUDE_CODE_OAUTH_TOKEN/i)).toBeTruthy();
|
||||
});
|
||||
|
||||
it("shows satisfied indicator when any member is configured, and enables Import", () => {
|
||||
render(
|
||||
<OrgImportPreflightModal
|
||||
open
|
||||
orgName="UX A/B Lab"
|
||||
workspaceCount={7}
|
||||
requiredEnv={[{ any_of: ["ANTHROPIC_API_KEY", "CLAUDE_CODE_OAUTH_TOKEN"] }]}
|
||||
recommendedEnv={[]}
|
||||
configuredKeys={new Set(["CLAUDE_CODE_OAUTH_TOKEN"])}
|
||||
onSecretSaved={() => {}}
|
||||
onProceed={() => {}}
|
||||
onCancel={() => {}}
|
||||
/>,
|
||||
);
|
||||
|
||||
// "✓ using CLAUDE_CODE_OAUTH_TOKEN" banner renders. Name appears
|
||||
// twice (banner + member row) so use getAllByText.
|
||||
expect(screen.getByText(/using/i)).toBeTruthy();
|
||||
expect(screen.getAllByText("CLAUDE_CODE_OAUTH_TOKEN").length).toBeGreaterThanOrEqual(1);
|
||||
|
||||
const importBtn = screen.getByRole("button", { name: /^Import$/ });
|
||||
expect(importBtn.hasAttribute("disabled")).toBe(false);
|
||||
});
|
||||
|
||||
it("keeps Import disabled when no any-of member is configured", () => {
|
||||
render(
|
||||
<OrgImportPreflightModal
|
||||
open
|
||||
orgName="UX A/B Lab"
|
||||
workspaceCount={7}
|
||||
requiredEnv={[{ any_of: ["ANTHROPIC_API_KEY", "CLAUDE_CODE_OAUTH_TOKEN"] }]}
|
||||
recommendedEnv={[]}
|
||||
configuredKeys={new Set()}
|
||||
onSecretSaved={() => {}}
|
||||
onProceed={() => {}}
|
||||
onCancel={() => {}}
|
||||
/>,
|
||||
);
|
||||
|
||||
const importBtn = screen.getByRole("button", { name: /^Import$/ });
|
||||
expect(importBtn.hasAttribute("disabled")).toBe(true);
|
||||
});
|
||||
});
|
||||
143
canvas/src/components/__tests__/SkillsTab.install.test.tsx
Normal file
143
canvas/src/components/__tests__/SkillsTab.install.test.tsx
Normal file
@ -0,0 +1,143 @@
|
||||
// @vitest-environment jsdom
|
||||
//
|
||||
// Behavioral coverage for the install flow. Two regressions to pin
|
||||
// down:
|
||||
//
|
||||
// 1. The install POST URL has to include the workspace id. A pre-fix
|
||||
// bug routed it to /workspaces/undefined/plugins because the
|
||||
// component read `data.id`, but `WorkspaceNodeData` has no `id`
|
||||
// field — its `extends Record<string, unknown>` index signature
|
||||
// hid the bad access from TS. The component now takes
|
||||
// `workspaceId` as an explicit prop; this test asserts the URL.
|
||||
//
|
||||
// 2. The optimistic install update has to flip the registry row to
|
||||
// "Installed" without waiting for the 15s reload timer (the
|
||||
// PLUGIN_RELOAD_DELAY_MS gap). This test asserts the row's "Install"
|
||||
// button is replaced by the green "Installed" tag synchronously
|
||||
// after the POST resolves.
|
||||
import { describe, it, expect, vi, beforeEach, afterEach } from "vitest";
|
||||
import { render, screen, fireEvent, waitFor, cleanup } from "@testing-library/react";
|
||||
|
||||
const mockApiGet = vi.fn();
|
||||
const mockApiPost = vi.fn();
|
||||
vi.mock("@/lib/api", () => ({
|
||||
api: {
|
||||
get: (...args: unknown[]) => mockApiGet(...args),
|
||||
post: (...args: unknown[]) => mockApiPost(...args),
|
||||
put: vi.fn().mockResolvedValue({}),
|
||||
del: vi.fn().mockResolvedValue({}),
|
||||
patch: vi.fn().mockResolvedValue({}),
|
||||
},
|
||||
}));
|
||||
|
||||
vi.mock("@/store/canvas", () => ({
|
||||
useCanvasStore: Object.assign(
|
||||
vi.fn((selector: (s: Record<string, unknown>) => unknown) =>
|
||||
selector({ setPanelTab: vi.fn() } as Record<string, unknown>),
|
||||
),
|
||||
{ getState: () => ({ setPanelTab: vi.fn() }) },
|
||||
),
|
||||
summarizeWorkspaceCapabilities: vi.fn(() => ({ skills: [], tools: [] })),
|
||||
}));
|
||||
|
||||
vi.mock("../Toaster", () => ({ showToast: vi.fn() }));
|
||||
|
||||
import { SkillsTab } from "../tabs/SkillsTab";
|
||||
|
||||
function makeData() {
|
||||
return {
|
||||
name: "Test WS",
|
||||
status: "online",
|
||||
tier: 1,
|
||||
agentCard: null,
|
||||
activeTasks: 0,
|
||||
collapsed: false,
|
||||
role: "agent",
|
||||
lastErrorRate: 0,
|
||||
lastSampleError: "",
|
||||
url: "http://localhost:9000",
|
||||
parentId: null,
|
||||
currentTask: "",
|
||||
runtime: "langgraph",
|
||||
needsRestart: false,
|
||||
budgetLimit: null,
|
||||
};
|
||||
}
|
||||
|
||||
const REGISTRY = [
|
||||
{
|
||||
name: "browser-automation",
|
||||
version: "1.1.0",
|
||||
description: "Browser automation + testing",
|
||||
author: "molecule",
|
||||
tags: ["browser", "playwright"],
|
||||
skills: [],
|
||||
runtimes: ["claude-code"],
|
||||
},
|
||||
];
|
||||
|
||||
beforeEach(() => {
|
||||
// Order matches the component's loadInstalled / loadRegistry
|
||||
// /loadSourceSchemes calls. Schemes endpoint resolves with an
|
||||
// empty list so the Install-from-source input doesn't blow up.
|
||||
mockApiGet.mockReset();
|
||||
mockApiPost.mockReset();
|
||||
mockApiGet.mockImplementation((path: string) => {
|
||||
if (path.endsWith("/plugins") && path.startsWith("/workspaces/")) {
|
||||
return Promise.resolve([]); // installed
|
||||
}
|
||||
if (path === "/plugins") {
|
||||
return Promise.resolve(REGISTRY); // registry
|
||||
}
|
||||
if (path === "/plugins/sources") {
|
||||
return Promise.resolve({ schemes: ["github://", "local://"] });
|
||||
}
|
||||
return Promise.resolve(null);
|
||||
});
|
||||
mockApiPost.mockResolvedValue({ status: "installed", plugin: "browser-automation" });
|
||||
});
|
||||
|
||||
afterEach(() => {
|
||||
cleanup();
|
||||
vi.clearAllMocks();
|
||||
});
|
||||
|
||||
// Returns the registry row's Install button. The custom-source input
|
||||
// also renders an "Install" button, so `findByRole({name: /install/})`
|
||||
// throws on multiple matches; scope by the row's plugin-name text.
|
||||
async function findRowInstallButton() {
|
||||
const nameNode = await screen.findByText("browser-automation");
|
||||
const row = nameNode.closest("div.flex.items-center.justify-between") as HTMLElement;
|
||||
if (!row) throw new Error("could not locate row container for browser-automation");
|
||||
const buttons = row.querySelectorAll("button");
|
||||
const install = Array.from(buttons).find((b) => b.textContent?.trim() === "Install");
|
||||
if (!install) throw new Error("row has no Install button (already installed?)");
|
||||
return install;
|
||||
}
|
||||
|
||||
describe("SkillsTab install flow", () => {
|
||||
it("POSTs to /workspaces/<workspaceId>/plugins (no `undefined` in URL)", async () => {
|
||||
render(<SkillsTab workspaceId="ws-abc-123" data={makeData() as never} />);
|
||||
|
||||
fireEvent.click(await findRowInstallButton());
|
||||
|
||||
await waitFor(() => expect(mockApiPost).toHaveBeenCalled());
|
||||
expect(mockApiPost).toHaveBeenCalledWith(
|
||||
"/workspaces/ws-abc-123/plugins",
|
||||
{ source: "local://browser-automation" },
|
||||
);
|
||||
});
|
||||
|
||||
it("flips the registry row to 'Installed' synchronously after POST resolves (no 15s wait)", async () => {
|
||||
render(<SkillsTab workspaceId="ws-abc-123" data={makeData() as never} />);
|
||||
|
||||
fireEvent.click(await findRowInstallButton());
|
||||
|
||||
// The "Installed" green tag must appear without advancing the
|
||||
// reload timer — the optimistic update is the entire point of
|
||||
// this fix. If this test ever regresses to needing fake timers
|
||||
// + advanceTimersByTime, the optimistic path is broken.
|
||||
const installedTag = await screen.findByText(/^Installed$/i);
|
||||
expect(installedTag).toBeDefined();
|
||||
});
|
||||
});
|
||||
@ -123,7 +123,7 @@ describe("SkillsTab — aria-label on bare source input (WCAG 1.3.1)", () => {
|
||||
});
|
||||
|
||||
it('install source input has aria-label="Install from source URL"', async () => {
|
||||
render(<SkillsTab data={makeSkillsData() as never} />);
|
||||
render(<SkillsTab workspaceId="ws-test-id" data={makeSkillsData() as never} />);
|
||||
|
||||
// The source input is inside the registry section (showRegistry=false initially).
|
||||
// Click the "+ Install Plugin" button to reveal it.
|
||||
@ -138,7 +138,7 @@ describe("SkillsTab — aria-label on bare source input (WCAG 1.3.1)", () => {
|
||||
});
|
||||
|
||||
it("install source input is a text input (not hidden)", async () => {
|
||||
render(<SkillsTab data={makeSkillsData() as never} />);
|
||||
render(<SkillsTab workspaceId="ws-test-id" data={makeSkillsData() as never} />);
|
||||
|
||||
const installBtn = screen.getByRole("button", { name: /install plugin/i });
|
||||
fireEvent.click(installBtn);
|
||||
|
||||
133
canvas/src/components/canvas/A2AEdge.tsx
Normal file
133
canvas/src/components/canvas/A2AEdge.tsx
Normal file
@ -0,0 +1,133 @@
|
||||
"use client";
|
||||
|
||||
import { memo } from "react";
|
||||
import {
|
||||
BaseEdge,
|
||||
EdgeLabelRenderer,
|
||||
getBezierPath,
|
||||
type EdgeProps,
|
||||
} from "@xyflow/react";
|
||||
import { useCanvasStore } from "@/store/canvas";
|
||||
|
||||
/**
|
||||
* Custom edge for the A2A topology overlay. Solves two problems with the
|
||||
* default React Flow edge label rendering:
|
||||
*
|
||||
* 1. **Z-order.** The default `label` prop renders inside the edge's
|
||||
* SVG group, which always sits below node DOM in React Flow. When
|
||||
* a label happened to land underneath a workspace card, it was
|
||||
* hidden. EdgeLabelRenderer mounts label content in a separate
|
||||
* portal layer that we can pin above nodes via z-index.
|
||||
*
|
||||
* 2. **Clickability.** Default labels inherit `pointerEvents: none`
|
||||
* from the SVG path so the user can drag through them. The
|
||||
* portaled label is a regular HTML element with its own pointer
|
||||
* events — we set `pointerEvents: all` only on the label pill so
|
||||
* drags on the edge line still pass through to the canvas.
|
||||
*
|
||||
* On click: selects the source workspace and switches its side panel
|
||||
* to Activity, where the user can inspect the underlying delegations.
|
||||
*/
|
||||
interface A2AEdgeData {
|
||||
count: number;
|
||||
lastAt: number;
|
||||
isHot: boolean;
|
||||
/** Pre-formatted "5 calls · 2m ago" — built upstream by buildA2AEdges
|
||||
* so the same string renders here and in any future tooltip layer. */
|
||||
label: string;
|
||||
}
|
||||
|
||||
function A2AEdgeImpl({
|
||||
id,
|
||||
source,
|
||||
sourceX,
|
||||
sourceY,
|
||||
targetX,
|
||||
targetY,
|
||||
sourcePosition,
|
||||
targetPosition,
|
||||
data,
|
||||
style = {},
|
||||
}: EdgeProps) {
|
||||
const [edgePath, labelX, labelY] = getBezierPath({
|
||||
sourceX,
|
||||
sourceY,
|
||||
sourcePosition,
|
||||
targetX,
|
||||
targetY,
|
||||
targetPosition,
|
||||
});
|
||||
|
||||
const selectNode = useCanvasStore((s) => s.selectNode);
|
||||
const setPanelTab = useCanvasStore((s) => s.setPanelTab);
|
||||
|
||||
const edgeData = (data ?? {}) as Partial<A2AEdgeData>;
|
||||
const labelText = edgeData.label ?? "";
|
||||
const isHot = edgeData.isHot ?? false;
|
||||
const count = edgeData.count ?? 0;
|
||||
|
||||
const handleClick = (e: React.MouseEvent) => {
|
||||
e.stopPropagation();
|
||||
// Select the source (the agent that initiated the delegations).
|
||||
// The user's mental model when clicking the edge is "show me the
|
||||
// calls FROM here" — that's the source's activity feed.
|
||||
//
|
||||
// Preserve the current tab when the user re-clicks the same edge
|
||||
// (or another edge whose source is already selected). Yanking
|
||||
// them back to Activity every click would surprise — they may
|
||||
// have intentionally switched to Chat / Memory while looking at
|
||||
// this peer. The first click that lands a *different* selection
|
||||
// still routes them to Activity, which is the discovery affordance.
|
||||
const alreadySelected =
|
||||
useCanvasStore.getState().selectedNodeId === source;
|
||||
selectNode(source);
|
||||
if (!alreadySelected) {
|
||||
setPanelTab("activity");
|
||||
}
|
||||
};
|
||||
|
||||
// The edge stroke color matches what buildA2AEdges sets on the SVG
|
||||
// path style. Mirror it on the badge border so the visual identity
|
||||
// (hot=violet vs warm=blue) carries to the clickable label.
|
||||
const accent = isHot ? "border-violet-500/60" : "border-blue-500/60";
|
||||
const accentText = isHot ? "text-violet-200" : "text-blue-200";
|
||||
const ariaLabel = `${count} delegation${count === 1 ? "" : "s"} from ${
|
||||
edgeData.label?.split(" · ")[1] ?? "recent"
|
||||
}. Click to inspect.`;
|
||||
|
||||
return (
|
||||
<>
|
||||
<BaseEdge id={id} path={edgePath} style={style} markerEnd="url(#a2a-arrow)" />
|
||||
{labelText && (
|
||||
<EdgeLabelRenderer>
|
||||
<div
|
||||
// The label sits in a portal at the canvas root. position:
|
||||
// absolute + the (labelX, labelY) translate places it at
|
||||
// the edge midpoint. zIndex 5 wins against React Flow's
|
||||
// node layer (default z=0) without fighting the controls
|
||||
// strip (z=10).
|
||||
style={{
|
||||
position: "absolute",
|
||||
transform: `translate(-50%, -50%) translate(${labelX}px, ${labelY}px)`,
|
||||
pointerEvents: "all",
|
||||
zIndex: 5,
|
||||
}}
|
||||
className="nodrag nopan"
|
||||
>
|
||||
<button
|
||||
type="button"
|
||||
onClick={handleClick}
|
||||
aria-label={ariaLabel}
|
||||
title="Open source workspace's activity feed"
|
||||
className={`px-2 py-0.5 rounded-full bg-zinc-900/95 border ${accent} ${accentText} text-[10px] font-medium shadow-md shadow-black/40 backdrop-blur-sm hover:bg-zinc-800 hover:border-opacity-100 transition-colors cursor-pointer`}
|
||||
>
|
||||
{labelText}
|
||||
</button>
|
||||
</div>
|
||||
</EdgeLabelRenderer>
|
||||
)}
|
||||
</>
|
||||
);
|
||||
}
|
||||
|
||||
export const A2AEdge = memo(A2AEdgeImpl);
|
||||
165
canvas/src/components/canvas/OrgCancelButton.tsx
Normal file
165
canvas/src/components/canvas/OrgCancelButton.tsx
Normal file
@ -0,0 +1,165 @@
|
||||
"use client";
|
||||
|
||||
import { useState } from "react";
|
||||
import { api } from "@/lib/api";
|
||||
import { useCanvasStore } from "@/store/canvas";
|
||||
import { showToast } from "@/components/Toaster";
|
||||
|
||||
interface Props {
  /** Root workspace of the org being deployed. The cancel action
   * cascades delete through workspace-server's existing recursive
   * delete handler, so we only need the root id. */
  rootId: string;
  /** Human-readable name of the root workspace; used in the toast
   * and in the button's aria-label. */
  rootName: string;
  /** Count rendered in the pill label; updated live as children
   * come online (the useOrgDeployState hook recomputes on every
   * status change). */
  workspaceCount: number;
}
|
||||
|
||||
/**
|
||||
* Cancel-deployment pill attached to the root of a deploying org.
|
||||
* One click → confirm dialog → DELETE /workspaces/:rootId?confirm=true
|
||||
* which cascades through every descendant server-side.
|
||||
*
|
||||
* Rendered inside the root's WorkspaceNode card via an absolute-
|
||||
* positioned overlay so it sits visually ON the card and moves with
|
||||
* drag. `className="nodrag"` stops React Flow from interpreting
|
||||
* clicks here as the start of a drag gesture.
|
||||
*
|
||||
* Deliberately uses only `.mol-deploy-cancel*` classes for styling —
|
||||
* every color / easing comes from theme-tokens.css, so a future
|
||||
* light-theme (or tenant-branded theme) inherits automatically.
|
||||
*/
|
||||
export function OrgCancelButton({ rootId, rootName, workspaceCount }: Props) {
  // Two-step UI state: `confirming` swaps the pill for an inline
  // Yes/No dialog; `submitting` disables both buttons during the
  // network round-trip.
  const [confirming, setConfirming] = useState(false);
  const [submitting, setSubmitting] = useState(false);

  const handleCancel = async () => {
    setSubmitting(true);
    // Populate deletingIds with the subtree so every descendant
    // (and the root) locks into the dim + non-draggable state for
    // the duration of the network round-trip + server cascade —
    // same treatment the regular delete gives. Otherwise the org
    // looks interactive for the several seconds between click and
    // the first WORKSPACE_REMOVED event.
    const preState = useCanvasStore.getState();
    const subtreeIds = new Set<string>();
    const walkStack = [rootId];
    // Iterative DFS over parentId links — avoids recursion on deep orgs.
    while (walkStack.length) {
      const nid = walkStack.pop()!;
      subtreeIds.add(nid);
      for (const n of preState.nodes) {
        if (n.data.parentId === nid) walkStack.push(n.id);
      }
    }
    preState.beginDelete(subtreeIds);
    try {
      await api.del<{ status: string }>(
        `/workspaces/${rootId}?confirm=true`,
      );
      showToast(`Cancelled deployment of "${rootName}"`, "success");
      // Optimistic local removal — workspace-server broadcasts
      // WORKSPACE_REMOVED per node but the WS may lag; strip the
      // subtree now so the user sees immediate feedback. Re-read
      // the store AFTER the await: children may have landed (or
      // already been removed by WS events) during the network
      // round-trip. If the WS_REMOVED handler already dropped the
      // root during the network call, bail out — the subtree walk
      // would miss any now-orphaned descendants (handleCanvasEvent
      // reparents children of a removed node upward, so they no
      // longer share the original root's id as parentId).
      const postDeleteState = useCanvasStore.getState();
      if (!postDeleteState.nodes.some((n) => n.id === rootId)) {
        // Early return still runs the `finally` block below, so the
        // lock and the submitting/confirming flags are reset.
        return;
      }
      // Re-walk the subtree against the POST-await snapshot (not
      // subtreeIds from before the call) so children that arrived
      // mid-flight are also stripped.
      const subtree = new Set<string>();
      const stack = [rootId];
      while (stack.length) {
        const id = stack.pop()!;
        subtree.add(id);
        for (const n of postDeleteState.nodes) {
          if (n.data.parentId === id) stack.push(n.id);
        }
      }
      // Drop every node in the subtree and any edge touching it.
      useCanvasStore.setState({
        nodes: postDeleteState.nodes.filter((n) => !subtree.has(n.id)),
        edges: postDeleteState.edges.filter(
          (e) => !subtree.has(e.source) && !subtree.has(e.target),
        ),
      });
    } catch (e) {
      // Undo the lock so the user can try again / interact with the
      // still-deploying subtree.
      // NOTE(review): on this error path endDelete runs here AND in
      // `finally` with the same id set — relies on endDelete being
      // idempotent; confirm against the store implementation.
      useCanvasStore.getState().endDelete(subtreeIds);
      showToast(
        e instanceof Error ? `Cancel failed: ${e.message}` : "Cancel failed",
        "error",
      );
    } finally {
      // Success path's endDelete is covered implicitly — every node
      // in the subtree is stripped by the optimistic local removal
      // above, and any stragglers are removed by WORKSPACE_REMOVED
      // WS events whose handler is a no-op on already-missing ids.
      // The deletingIds set will naturally empty as endDelete runs
      // in both paths below.
      useCanvasStore.getState().endDelete(subtreeIds);
      setSubmitting(false);
      setConfirming(false);
    }
  };

  // Inline confirm dialog replaces the pill while `confirming`.
  if (confirming) {
    return (
      <div
        className="nodrag absolute -top-10 right-0 z-20 flex items-center gap-1.5 rounded-lg bg-zinc-900/95 px-2 py-1 shadow-lg border border-red-800/60"
        onClick={(e) => e.stopPropagation()}
      >
        <span className="text-[10px] text-zinc-300">
          Delete {workspaceCount} workspace{workspaceCount === 1 ? "" : "s"}?
        </span>
        <button
          type="button"
          onClick={handleCancel}
          disabled={submitting}
          className="mol-deploy-cancel px-2 py-0.5 rounded text-[10px] font-semibold"
        >
          {submitting ? "Deleting…" : "Yes"}
        </button>
        <button
          type="button"
          onClick={() => setConfirming(false)}
          disabled={submitting}
          className="px-2 py-0.5 rounded bg-zinc-700/80 hover:bg-zinc-600 text-[10px] text-zinc-200"
        >
          No
        </button>
      </div>
    );
  }

  // Default state: the pulsing Cancel pill.
  return (
    <button
      type="button"
      onClick={(e) => {
        // Stop the click from bubbling to React Flow (selects the
        // node) — the Cancel pill is a UI surface, not a node
        // activation.
        e.stopPropagation();
        setConfirming(true);
      }}
      className="nodrag mol-deploy-cancel mol-deploy-cancel-pulse absolute -top-7 right-1 z-20 flex items-center gap-1 rounded-full px-2.5 py-0.5 text-[10px] font-semibold shadow-md"
      aria-label={`Cancel deployment of ${rootName}`}
    >
      {/* Inline × glyph — avoids an icon-library dependency. */}
      <svg width="10" height="10" viewBox="0 0 16 16" aria-hidden="true">
        <path
          d="M4 4l8 8M12 4l-8 8"
          stroke="currentColor"
          strokeWidth="2"
          strokeLinecap="round"
        />
      </svg>
      <span>Cancel ({workspaceCount})</span>
    </button>
  );
}
|
||||
@ -0,0 +1,53 @@
|
||||
import { describe, it, expect } from "vitest";
|
||||
import { shouldFitGrowing } from "../useCanvasViewport";
|
||||
|
||||
// Tests cover the auto-fit gate in isolation. The hook itself is
|
||||
// effects + refs + React Flow handles, awkward to exercise directly —
|
||||
// extracting the pure decision into shouldFitGrowing(...) lets us
|
||||
// pin down the regression-prone logic with unit tests instead.
|
||||
|
||||
describe("shouldFitGrowing", () => {
  it("fits the very first time (no prior snapshot)", () => {
    // undefined snapshot = no fit has ever run for this root.
    expect(shouldFitGrowing(["a"], undefined, null, 0)).toBe(true);
  });

  it("fits when the prior snapshot is empty", () => {
    expect(shouldFitGrowing(["a", "b"], new Set(), null, 0)).toBe(true);
  });

  it("fits when a brand-new id has been added since the last fit", () => {
    const prev = new Set(["root", "a", "b"]);
    // "c" is new — growth overrides any pan gate.
    expect(shouldFitGrowing(["root", "a", "b", "c"], prev, null, 0)).toBe(true);
  });

  it("respects user pan when the subtree hasn't grown", () => {
    const prev = new Set(["root", "a", "b"]);
    // Status update on existing node — same membership.
    expect(shouldFitGrowing(["root", "a", "b"], prev, 5_000, 1_000)).toBe(false);
  });

  it("fits when the subtree hasn't grown but the user never panned", () => {
    const prev = new Set(["root", "a", "b"]);
    expect(shouldFitGrowing(["root", "a", "b"], prev, null, 1_000)).toBe(true);
  });

  it("fits when the subtree hasn't grown and the user panned BEFORE the last fit", () => {
    const prev = new Set(["root", "a", "b"]);
    // Pan at t=500 predates the fit at t=1000 — the fit already
    // superseded that gesture, so it doesn't block the next one.
    expect(shouldFitGrowing(["root", "a", "b"], prev, 500, 1_000)).toBe(true);
  });

  it("forces fit on delete-then-add even when the count is unchanged", () => {
    // Subtree was [root, a, b, c, d]. Then `d` got removed and a
    // sibling `e` arrived. Same length, different membership — a
    // length-only check would skip the fit and leave `e` off-screen.
    const prev = new Set(["root", "a", "b", "c", "d"]);
    expect(
      shouldFitGrowing(["root", "a", "b", "c", "e"], prev, 5_000, 1_000),
    ).toBe(true);
  });

  it("does NOT fit on shrink-only when the user has panned (deletion alone shouldn't override exploration)", () => {
    // Current subtree is a strict subset of the snapshot — no new ids.
    const prev = new Set(["root", "a", "b", "c"]);
    expect(shouldFitGrowing(["root", "a", "b"], prev, 5_000, 1_000)).toBe(false);
  });
});
|
||||
@ -3,11 +3,43 @@
|
||||
import { useCallback, useEffect, useRef } from "react";
|
||||
import { useReactFlow } from "@xyflow/react";
|
||||
import { useCanvasStore } from "@/store/canvas";
|
||||
import { appendClass, removeClass } from "@/store/classNames";
|
||||
import {
|
||||
CHILD_DEFAULT_HEIGHT,
|
||||
CHILD_DEFAULT_WIDTH,
|
||||
} from "@/store/canvas-topology";
|
||||
|
||||
/**
|
||||
* Decide whether the deploy-time auto-fit should run. Pure function so
|
||||
* the gate logic is unit-testable in isolation — the surrounding
|
||||
* useEffect tangle of refs, timers, and React Flow handles is awkward
|
||||
* to exercise directly.
|
||||
*
|
||||
* Returns true when the auto-fit SHOULD fire:
|
||||
* - the subtree contains an id that wasn't in the previous snapshot
|
||||
* (a new node arrived → user has lost context, force the fit
|
||||
* through regardless of any user-pan in between), OR
|
||||
* - the user has not panned since the last successful fit (so the
|
||||
* auto-fit isn't fighting their override).
|
||||
*
|
||||
* `prevSubtreeIds === undefined` means no fit has ever run for this
|
||||
* root — treat every id as "new" and fit. `userPannedAt === null`
|
||||
* means the user has never panned at all in this session — fit.
|
||||
*/
|
||||
export function shouldFitGrowing(
|
||||
currentSubtreeIds: readonly string[],
|
||||
prevSubtreeIds: ReadonlySet<string> | undefined,
|
||||
userPannedAt: number | null,
|
||||
lastAutoFitAt: number,
|
||||
): boolean {
|
||||
if (!prevSubtreeIds || prevSubtreeIds.size === 0) return true;
|
||||
for (const id of currentSubtreeIds) {
|
||||
if (!prevSubtreeIds.has(id)) return true;
|
||||
}
|
||||
if (userPannedAt === null) return true;
|
||||
return userPannedAt <= lastAutoFitAt;
|
||||
}
|
||||
|
||||
/**
|
||||
* Wires the two canvas-wide CustomEvent listeners and the viewport
|
||||
* save/restore bookkeeping so Canvas.tsx doesn't have to.
|
||||
@ -25,17 +57,79 @@ export function useCanvasViewport() {
|
||||
const saveViewport = useCanvasStore((s) => s.saveViewport);
|
||||
const saveTimerRef = useRef<ReturnType<typeof setTimeout>>(undefined);
|
||||
const panTimerRef = useRef<ReturnType<typeof setTimeout>>(undefined);
|
||||
const autoFitTimerRef = useRef<ReturnType<typeof setTimeout>>(undefined);
|
||||
// Two distinct fit timers — DO NOT collapse to one.
|
||||
// - settleFitTimerRef: 1200ms one-shot run by the
|
||||
// "transition from any-provisioning to none" effect (the deploy
|
||||
// just finished — settle on the whole org once).
|
||||
// - trackingFitTimerRef: 500ms debounced by the per-arrival
|
||||
// molecule:fit-deploying-org event handler (track the org's
|
||||
// bounds as children land during the deploy).
|
||||
// They MUST NOT share a ref: the two effects fire interleaved
|
||||
// (every WS event during a deploy resets the tracking timer; the
|
||||
// settle timer arms the moment provisioning hits zero), and a
|
||||
// shared ref made each effect silently clearTimeout the other's
|
||||
// pending fit. Today's behavior happened to land in the right
|
||||
// order out of luck; splitting the refs makes ordering independent
|
||||
// of fire sequence.
|
||||
const settleFitTimerRef = useRef<ReturnType<typeof setTimeout>>(undefined);
|
||||
const trackingFitTimerRef = useRef<ReturnType<typeof setTimeout>>(undefined);
|
||||
// Tracks whether any workspace was provisioning on the previous
|
||||
// render so we can detect the boundary when the last one finishes
|
||||
// and auto-fit the viewport around the whole tree.
|
||||
const hadProvisioningRef = useRef(false);
|
||||
// Respect-user-pan gate for the deploy-time auto-fit. Earlier
|
||||
// revisions tried to detect user pans via `onMoveEnd`, but React
|
||||
// Flow v12 fires that callback with a truthy event at the END of
|
||||
// a programmatic fitView animation — so the first auto-fit we
|
||||
// triggered would immediately look like a user pan and block
|
||||
// every subsequent fit for the rest of the deploy, leaving the
|
||||
// viewport stuck wherever the first fit landed. Now we stamp
|
||||
// this ref ONLY on wheel / pointerdown / touchstart on the
|
||||
// React Flow pane itself (see the effect below), which are
|
||||
// unambiguous user-gesture signals.
|
||||
const userPannedAtRef = useRef<number | null>(null);
|
||||
const lastAutoFitAtRef = useRef(0);
|
||||
|
||||
useEffect(() => {
|
||||
return () => {
|
||||
clearTimeout(saveTimerRef.current);
|
||||
clearTimeout(panTimerRef.current);
|
||||
clearTimeout(autoFitTimerRef.current);
|
||||
clearTimeout(settleFitTimerRef.current);
|
||||
clearTimeout(trackingFitTimerRef.current);
|
||||
};
|
||||
}, []);
|
||||
|
||||
// User-gesture listeners for the respect-user-pan gate. Listens on
|
||||
// `document` with capture phase and filters to events whose target
|
||||
// lies inside the React Flow pane — this avoids a mount-order race
|
||||
// (`.react-flow__pane` may not exist when the hook first runs if
|
||||
// RF is behind a Suspense boundary) AND keeps clicks on the
|
||||
// toolbar / modals / side panel from stamping user-pan-intent.
|
||||
// Capture phase runs before target-phase `stopPropagation` so a
|
||||
// handler elsewhere can't swallow the signal.
|
||||
//
|
||||
// Wheel only — NOT pointerdown. A pointerdown on the pane fires for
|
||||
// ordinary clicks (deselect, click-near-a-card, modal-close-bubble)
|
||||
// as well as the start of a drag-pan. Treating every pointerdown as
|
||||
// "user wants to override auto-fit" meant a single accidental click
|
||||
// before/during an org import locked out every subsequent fit, so
|
||||
// the viewport stuck at whatever the first fit landed on while
|
||||
// children kept materialising off-screen. Wheel is the canonical
|
||||
// unambiguous gesture: scroll-to-pan and pinch-zoom both surface as
|
||||
// wheel events. Drag-pans without an accompanying wheel are rare
|
||||
// enough that letting them be overridden by a follow-up auto-fit is
|
||||
// the right tradeoff.
|
||||
useEffect(() => {
|
||||
if (typeof window === "undefined") return;
|
||||
const stamp = (e: Event) => {
|
||||
const target = e.target as HTMLElement | null;
|
||||
if (!target?.closest?.(".react-flow__pane")) return;
|
||||
userPannedAtRef.current = Date.now();
|
||||
};
|
||||
const opts: AddEventListenerOptions = { passive: true, capture: true };
|
||||
document.addEventListener("wheel", stamp, opts);
|
||||
return () => {
|
||||
document.removeEventListener("wheel", stamp, opts);
|
||||
};
|
||||
}, []);
|
||||
|
||||
@ -55,20 +149,64 @@ export function useCanvasViewport() {
|
||||
hadProvisioningRef.current = hasProvisioning;
|
||||
|
||||
if (wasProvisioning && !hasProvisioning && nodeCount > 0) {
|
||||
clearTimeout(autoFitTimerRef.current);
|
||||
// Root-complete moment — every root that has children just
|
||||
// finished deploying. Pop + glow once (mol-deploy-root-complete)
|
||||
// then auto-fit the viewport around the whole org. Leaf-only
|
||||
// roots (single workspaces with no children) are skipped so the
|
||||
// effect reads as "your org landed" not "random card flickered".
|
||||
const state = useCanvasStore.getState();
|
||||
const rootsWithChildren = new Set<string>();
|
||||
for (const n of state.nodes) {
|
||||
if (n.data.parentId) continue;
|
||||
if (state.nodes.some((c) => c.data.parentId === n.id)) {
|
||||
rootsWithChildren.add(n.id);
|
||||
}
|
||||
}
|
||||
if (rootsWithChildren.size > 0) {
|
||||
useCanvasStore.setState({
|
||||
nodes: state.nodes.map((n) =>
|
||||
rootsWithChildren.has(n.id)
|
||||
? { ...n, className: appendClass(n.className, "mol-deploy-root-complete") }
|
||||
: n,
|
||||
),
|
||||
});
|
||||
// Strip the one-shot class after the keyframe ends so a later
|
||||
// deploy on the same node can fire it again.
|
||||
window.setTimeout(() => {
|
||||
const s = useCanvasStore.getState();
|
||||
useCanvasStore.setState({
|
||||
nodes: s.nodes.map((n) =>
|
||||
rootsWithChildren.has(n.id)
|
||||
? { ...n, className: removeClass(n.className, "mol-deploy-root-complete") }
|
||||
: n,
|
||||
),
|
||||
});
|
||||
}, 800);
|
||||
}
|
||||
|
||||
clearTimeout(settleFitTimerRef.current);
|
||||
// 1200ms settle delay: lets React Flow's DOM measurement pass
|
||||
// resize newly-online parents before we compute bounds.
|
||||
// Measuring too early gives us the pre-render skeleton bbox and
|
||||
// fitView zooms to that smaller-than-real rectangle.
|
||||
autoFitTimerRef.current = setTimeout(() => {
|
||||
settleFitTimerRef.current = setTimeout(() => {
|
||||
fitView({
|
||||
// Deliberately SLOWER than the in-flight tracking fits
|
||||
// (400ms). The asymmetry reads as "settling" on the
|
||||
// finished org rather than "tracking" another arrival,
|
||||
// which is the intended UX for the "deploy done" moment.
|
||||
// Don't normalize these two durations to the same value.
|
||||
duration: 1200,
|
||||
padding: 0.25,
|
||||
// Match the deploy-time fit padding (0.45) so end-state
|
||||
// and in-flight state use the same framing — otherwise
|
||||
// the final zoom-out "jumps" relative to the intermediate
|
||||
// fits and looks like a mis-layout.
|
||||
padding: 0.45,
|
||||
// Cap zoom-in: a small tree (2-3 nodes) would otherwise end
|
||||
// up at the 2x maxZoom, visually implying "something is
|
||||
// wrong". 0.8 reads like "here's your whole org" even when
|
||||
// the tree is small.
|
||||
maxZoom: 0.8,
|
||||
// wrong". 0.65 reads like "here's your whole org" even when
|
||||
// the tree is small — matches deploy-time cap.
|
||||
maxZoom: 0.65,
|
||||
// Cap zoom-out: fitView would fall back to the component's
|
||||
// minZoom=0.1 on a sparse/outlier layout, leaving the user
|
||||
// staring at a postage-stamp canvas. 0.25 is the floor.
|
||||
@ -92,6 +230,115 @@ export function useCanvasViewport() {
|
||||
return () => window.removeEventListener("molecule:pan-to-node", handler);
|
||||
}, [fitView]);
|
||||
|
||||
// Auto pan+zoom to the whole deploying org after each child
|
||||
// arrival — DEBOUNCED. Firing fitView on every event with a
|
||||
// 600ms animation meant rapid sibling arrivals (server paces 2s
|
||||
// apart, HMR bursts can land faster) made the viewport lurch
|
||||
// continuously, which the user read as "parent flashing around".
|
||||
// We now wait until the arrivals GO QUIET for 500ms, then run
|
||||
// exactly one fit. The rootId we captured on the most recent
|
||||
// event drives the fit bounds. Respect-user-pan still short-
|
||||
// circuits: if the user moved after our last auto-fit, we never
|
||||
// fit again this deploy.
|
||||
const pendingFitRootRef = useRef<string | null>(null);
|
||||
// Membership snapshot of the subtree at the moment of the last
|
||||
// successful auto-fit, keyed by root id. When a new event arrives,
|
||||
// we compute growth as "any id in the current subtree that wasn't
|
||||
// in the snapshot". An id-set rather than just a count handles the
|
||||
// delete-then-add case correctly: subtree of 6 → delete one → 5 →
|
||||
// a different child arrives → 6 again. A length-only comparison
|
||||
// would call this "no growth" and skip the fit even though a
|
||||
// brand-new node landed off-screen. The id-set sees the new id
|
||||
// wasn't in the snapshot and forces the fit.
|
||||
//
|
||||
// Map is keyed by root id and never pruned. Acceptable today because
|
||||
// org roots are UUIDs (no collisions on retry / template re-import),
|
||||
// canvas sessions are per-tab, and entries are tiny. Worth a sweep
|
||||
// if long-lived sessions ever start importing hundreds of orgs.
|
||||
const lastFitSubtreeIdsRef = useRef<Map<string, Set<string>>>(new Map());
|
||||
useEffect(() => {
|
||||
const runFit = () => {
|
||||
const rootCandidate = pendingFitRootRef.current;
|
||||
pendingFitRootRef.current = null;
|
||||
if (!rootCandidate) return;
|
||||
const state = useCanvasStore.getState();
|
||||
// Climb to the true root — the event's rootId is the just-
|
||||
// landed child's direct parent, which may itself be nested.
|
||||
let topId = rootCandidate;
|
||||
let cursor = state.nodes.find((n) => n.id === topId);
|
||||
while (cursor?.data.parentId) {
|
||||
const up = state.nodes.find((n) => n.id === cursor!.data.parentId);
|
||||
if (!up) break;
|
||||
cursor = up;
|
||||
topId = up.id;
|
||||
}
|
||||
const subtree: string[] = [];
|
||||
const stack = [topId];
|
||||
while (stack.length) {
|
||||
const id = stack.pop()!;
|
||||
subtree.push(id);
|
||||
for (const n of state.nodes) {
|
||||
if (n.data.parentId === id) stack.push(n.id);
|
||||
}
|
||||
}
|
||||
if (subtree.length === 0) return;
|
||||
|
||||
// Growth check: did any id in the current subtree NOT appear
|
||||
// in the snapshot from the last fit? If yes, fit through
|
||||
// regardless of the user-pan timestamp — the user has lost
|
||||
// context, the new arrival is off-screen, and the deploy is
|
||||
// the primary thing they want to watch. If no, fall back to
|
||||
// the user-pan respect gate so post-deploy exploration isn't
|
||||
// yanked back.
|
||||
if (!shouldFitGrowing(
|
||||
subtree,
|
||||
lastFitSubtreeIdsRef.current.get(topId),
|
||||
userPannedAtRef.current,
|
||||
lastAutoFitAtRef.current,
|
||||
)) {
|
||||
return;
|
||||
}
|
||||
fitView({
|
||||
nodes: subtree.map((id) => ({ id })),
|
||||
// Short animation — server paces children ~2s apart, so a
|
||||
// 400ms fit animation reads as "smoothly tracked" rather
|
||||
// than "constantly lurching". Longer durations (the earlier
|
||||
// 600ms) start to overlap if the user re-triggers deploys.
|
||||
duration: 400,
|
||||
// Generous padding so the right-hand Communications panel,
|
||||
// bottom-left Legend, and bottom-right "New Workspace"
|
||||
// button don't cover the outer cards. React Flow padding
|
||||
// is a fraction of viewport dims, so 0.45 ≈ ~430px of
|
||||
// margin on a 960-wide canvas — enough clearance for the
|
||||
// two side panels (~300px + ~280px).
|
||||
padding: 0.45,
|
||||
// Lower maxZoom so small orgs (2-3 cards) still zoom out
|
||||
// enough to show the parent frame + children clearly with
|
||||
// the padded margins. 0.65 reads as "here's the whole org"
|
||||
// without getting dragged to the maxZoom by fitView's
|
||||
// "fill the viewport" default.
|
||||
maxZoom: 0.65,
|
||||
minZoom: 0.25,
|
||||
});
|
||||
lastAutoFitAtRef.current = Date.now();
|
||||
lastFitSubtreeIdsRef.current.set(topId, new Set(subtree));
|
||||
};
|
||||
const handler = (e: Event) => {
|
||||
const { rootId } = (e as CustomEvent<{ rootId: string }>).detail;
|
||||
// Keep the most recently-requested root. Back-to-back imports
|
||||
// on two different orgs (rare — user would have to click
|
||||
// Import twice within 500ms) "later wins" the viewport rather
|
||||
// than ping-ponging between them. If this becomes a real
|
||||
// pattern we'd flush the pending fit synchronously when
|
||||
// `rootId` changes, rather than resetting the timer.
|
||||
pendingFitRootRef.current = rootId;
|
||||
clearTimeout(trackingFitTimerRef.current);
|
||||
trackingFitTimerRef.current = setTimeout(runFit, 500);
|
||||
};
|
||||
window.addEventListener("molecule:fit-deploying-org", handler);
|
||||
return () => window.removeEventListener("molecule:fit-deploying-org", handler);
|
||||
}, [fitView]);
|
||||
|
||||
// Zoom to a team: fit the parent + its direct children in view.
|
||||
useEffect(() => {
|
||||
const handler = (e: Event) => {
|
||||
@ -129,6 +376,11 @@ export function useCanvasViewport() {
|
||||
|
||||
const onMoveEnd = useCallback(
|
||||
(_event: unknown, vp: { x: number; y: number; zoom: number }) => {
|
||||
// User-pan detection moved to the wheel/pointerdown listener
|
||||
// above — onMoveEnd fires for programmatic fitView too, which
|
||||
// made this callback an unreliable source for user-intent
|
||||
// tracking. This now only handles the debounced viewport
|
||||
// save so a reload lands the user back where they were.
|
||||
clearTimeout(saveTimerRef.current);
|
||||
saveTimerRef.current = setTimeout(() => {
|
||||
saveViewport(vp.x, vp.y, vp.zoom);
|
||||
|
||||
@ -113,6 +113,18 @@ export function useDragHandlers(): DragHandlers {
|
||||
|
||||
const onNodeDragStart: OnNodeDrag<WorkspaceNode> = useCallback(
|
||||
(event, node) => {
|
||||
// Belt-and-braces drag-lock: the primary mechanism is the
|
||||
// `draggable: false` projection in Canvas.tsx — React Flow
|
||||
// won't invoke this callback for locked nodes. But a future
|
||||
// change to the projection that forgets a locked subtree
|
||||
// would silently allow dragging, and locked drags mid-deploy
|
||||
// corrupt the spawn animation. Fall through to a state-based
|
||||
// check here so the invariant stays enforced in both places.
|
||||
if (node.draggable === false) {
|
||||
dragStartStateRef.current = null;
|
||||
return;
|
||||
}
|
||||
|
||||
dragModifiersRef.current = {
|
||||
alt: event.altKey,
|
||||
meta: event.metaKey || event.ctrlKey,
|
||||
|
||||
152
canvas/src/components/canvas/useOrgDeployState.ts
Normal file
152
canvas/src/components/canvas/useOrgDeployState.ts
Normal file
@ -0,0 +1,152 @@
|
||||
"use client";
|
||||
|
||||
import { useMemo } from "react";
|
||||
import { useCanvasStore } from "@/store/canvas";
|
||||
|
||||
/**
|
||||
* Org-deploy state for a single workspace node. Computed from the
|
||||
* current canvas store snapshot — no per-org status field on the
|
||||
* backend is required (a root "is deploying" iff any descendant in
|
||||
* its subtree still reports status === "provisioning").
|
||||
*
|
||||
* Performance note: the first version of this hook walked the entire
|
||||
* nodes array per node render — O(n²) for a 50-node org. The current
|
||||
* implementation computes ONE map of derived state for the whole
|
||||
* canvas per nodes-array change, then each call site looks up its
|
||||
* own id. The map is built inside useMemo against a cheap projection
|
||||
* (id + parentId + status tuples via useShallow) so unrelated store
|
||||
* mutations (drag, selection, viewport) don't re-run the walk.
|
||||
*/
|
||||
export interface OrgDeployState {
  /** NOTE(review): presumably true when this node's own status is
   * "provisioning" — the assignment site is below this chunk, confirm
   * against buildDeployMap's output loop. */
  isActivelyProvisioning: boolean;
  /** True for a root whose subtree still has provisioning nodes
   * (`provCount > 0` for the node's root in buildDeployMap). */
  isDeployingRoot: boolean;
  /** Dim + non-draggable lock; per the comment in buildDeployMap,
   * deleting nodes share this lock with deploying children. */
  isLockedChild: boolean;
  /** Number of status === "provisioning" nodes in this node's root
   * subtree (see countProvisioning). */
  descendantProvisioningCount: number;
}
|
||||
|
||||
// Zero-value OrgDeployState: every flag false, no provisioning
// descendants. NOTE(review): presumably the fallback for node ids
// missing from the deploy map — the lookup site is outside this
// chunk, confirm there.
const EMPTY: OrgDeployState = {
  isActivelyProvisioning: false,
  isDeployingRoot: false,
  isLockedChild: false,
  descendantProvisioningCount: 0,
};
|
||||
|
||||
/** Projection used to drive the deploy-state computation. Shallow-
|
||||
* compared so re-renders only happen when one of these fields
|
||||
* actually changes across any node. */
|
||||
interface NodeProjection {
  /** Workspace node id. */
  id: string;
  /** Parent workspace id, or null for a root node. */
  parentId: string | null;
  /** Raw workspace status string; "provisioning" is the only value
   * the deploy walk keys on (see countProvisioning). */
  status: string;
}
|
||||
|
||||
function buildDeployMap(
|
||||
projections: NodeProjection[],
|
||||
deletingIds: ReadonlySet<string>,
|
||||
): Map<string, OrgDeployState> {
|
||||
const byId = new Map<string, NodeProjection>();
|
||||
const childrenBy = new Map<string, string[]>();
|
||||
for (const p of projections) {
|
||||
byId.set(p.id, p);
|
||||
if (p.parentId) {
|
||||
const arr = childrenBy.get(p.parentId) ?? [];
|
||||
arr.push(p.id);
|
||||
childrenBy.set(p.parentId, arr);
|
||||
}
|
||||
}
|
||||
|
||||
// Walk once from each node up to its root, memoising the root id.
|
||||
// `rootOf.get(id)` short-circuits further walks on the same chain.
|
||||
const rootOf = new Map<string, string>();
|
||||
const findRoot = (id: string): string => {
|
||||
const cached = rootOf.get(id);
|
||||
if (cached) return cached;
|
||||
let cursor: NodeProjection | undefined = byId.get(id);
|
||||
let rootId = id;
|
||||
while (cursor && cursor.parentId) {
|
||||
const parent = byId.get(cursor.parentId);
|
||||
if (!parent) break;
|
||||
cursor = parent;
|
||||
rootId = parent.id;
|
||||
const alreadyKnown = rootOf.get(rootId);
|
||||
if (alreadyKnown) {
|
||||
rootId = alreadyKnown;
|
||||
break;
|
||||
}
|
||||
}
|
||||
rootOf.set(id, rootId);
|
||||
return rootId;
|
||||
};
|
||||
|
||||
// Count provisioning descendants per node. Also walk once per root
|
||||
// using an iterative DFS so we don't stack-overflow on deep trees.
|
||||
const countProvisioning = (rootId: string): number => {
|
||||
let count = 0;
|
||||
const stack = [rootId];
|
||||
while (stack.length) {
|
||||
const id = stack.pop()!;
|
||||
const node = byId.get(id);
|
||||
if (!node) continue;
|
||||
if (node.status === "provisioning") count++;
|
||||
const kids = childrenBy.get(id);
|
||||
if (kids) stack.push(...kids);
|
||||
}
|
||||
return count;
|
||||
};
|
||||
|
||||
// Per-root cache of subtree count so every descendant resolves in O(1).
|
||||
const rootCount = new Map<string, number>();
|
||||
|
||||
const out = new Map<string, OrgDeployState>();
|
||||
for (const p of projections) {
|
||||
const rootId = findRoot(p.id);
|
||||
let provCount = rootCount.get(rootId);
|
||||
if (provCount === undefined) {
|
||||
provCount = countProvisioning(rootId);
|
||||
rootCount.set(rootId, provCount);
|
||||
}
|
||||
const rootIsDeploying = provCount > 0;
|
||||
// A node being deleted gets the same visual + interaction lock
|
||||
// as a deploying child. "The system owns this node right now,
|
||||
// don't touch it" is the shared semantic — the user only cares
|
||||
// that the card is dim and won't drag; they don't need to know
|
||||
// whether it's coming up or going down.
|
||||
const deleting = deletingIds.has(p.id);
|
||||
out.set(p.id, {
|
||||
isActivelyProvisioning: p.status === "provisioning",
|
||||
isDeployingRoot: p.id === rootId && rootIsDeploying,
|
||||
isLockedChild: deleting || (p.id !== rootId && rootIsDeploying),
|
||||
descendantProvisioningCount:
|
||||
p.id === rootId ? provCount : 0, // only roots display the count
|
||||
});
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
/** Store-wide derived map. Recomputed whenever the `nodes` array
|
||||
* reference changes — which is on every store mutation that touches
|
||||
* nodes, including pure position tweens. The map build is O(n) so
|
||||
* a 50-node canvas costs ~50μs per tween frame; that's cheap enough
|
||||
* to not need a projection layer. (An earlier attempt to narrow the
|
||||
* subscription via `useShallow((s) => s.nodes.map(...))` triggered
|
||||
* React 18's "getSnapshot should be cached" loop because the
|
||||
* projection creates fresh object references each call — shallow
|
||||
* equality always sees "changed", which re-renders, which re-runs
|
||||
* the selector, ad infinitum.) */
|
||||
function useDeployMap(): Map<string, OrgDeployState> {
|
||||
const nodes = useCanvasStore((s) => s.nodes);
|
||||
const deletingIds = useCanvasStore((s) => s.deletingIds);
|
||||
return useMemo(() => {
|
||||
const projections = nodes.map((n) => ({
|
||||
id: n.id,
|
||||
parentId: n.data.parentId,
|
||||
status: n.data.status,
|
||||
}));
|
||||
return buildDeployMap(projections, deletingIds);
|
||||
}, [nodes, deletingIds]);
|
||||
}
|
||||
|
||||
export function useOrgDeployState(nodeId: string): OrgDeployState {
|
||||
const map = useDeployMap();
|
||||
return map.get(nodeId) ?? EMPTY;
|
||||
}
|
||||
@ -5,6 +5,7 @@ import { api } from "@/lib/api";
|
||||
import { ConversationTraceModal } from "@/components/ConversationTraceModal";
|
||||
import { type ActivityEntry } from "@/types/activity";
|
||||
import { useWorkspaceName } from "@/hooks/useWorkspaceName";
|
||||
import { inferA2AErrorHint } from "./chat/a2aErrorHint";
|
||||
|
||||
interface Props {
|
||||
workspaceId: string;
|
||||
@ -286,6 +287,26 @@ function ActivityRow({
|
||||
);
|
||||
}
|
||||
|
||||
// Machine prefix the agent runtime prepends to failed-delivery replies.
const A2A_ERROR_PREFIX = "[A2A_ERROR]";

/** Render a [A2A_ERROR]-prefixed response as a structured error block
 * with a stripped detail line + a cause hint. The previous raw render
 * ("[A2A_ERROR] " literal in the response area) gave the user no
 * signal to act on.
 *
 * Assumes `raw` starts with A2A_ERROR_PREFIX — the caller checks
 * `startsWith` before routing here. */
function A2AErrorPreview({ label, raw }: { label: string; raw: string }) {
  // Strip the prefix; an empty remainder still renders a placeholder
  // so the error block never shows blank.
  const detail = raw.slice(A2A_ERROR_PREFIX.length).trim() || "(no detail provided)";
  const hint = inferA2AErrorHint(detail);
  return (
    <div>
      <div className="text-[8px] text-red-400/80 uppercase tracking-wider mb-1">{label} — delivery failed</div>
      <div className="text-[10px] text-red-300 bg-red-950/30 border border-red-800/40 rounded p-2 space-y-1.5">
        <div className="font-mono whitespace-pre-wrap break-words max-h-32 overflow-y-auto">{detail}</div>
        <div className="text-[9px] text-red-300/70 leading-relaxed border-t border-red-800/30 pt-1.5">{hint}</div>
      </div>
    </div>
  );
}
|
||||
|
||||
/** Extract human-readable text from A2A request/response JSON */
|
||||
function MessagePreview({ label, body }: { label: string; body: Record<string, unknown> }) {
|
||||
// Try to extract text from A2A message parts
|
||||
@ -295,6 +316,14 @@ function MessagePreview({ label, body }: { label: string; body: Record<string, u
|
||||
if (body.task && typeof body.task === "string") { text = body.task; }
|
||||
if (!text && body.result && typeof body.result === "string") { text = body.result; }
|
||||
if (text) {
|
||||
// [A2A_ERROR]-prefixed responses get the structured error
|
||||
// treatment. Bare text fallthrough renders a bland gray block
|
||||
// — fine for normal replies, terrible for "[A2A_ERROR] " with
|
||||
// no further context. Detect at the top of the rendering path
|
||||
// so it short-circuits before the generic preview kicks in.
|
||||
if (text.trimStart().startsWith(A2A_ERROR_PREFIX)) {
|
||||
return <A2AErrorPreview label={label} raw={text.trimStart()} />;
|
||||
}
|
||||
return (
|
||||
<div>
|
||||
<div className="text-[8px] text-zinc-500 uppercase tracking-wider mb-1">{label}</div>
|
||||
|
||||
@ -7,9 +7,12 @@ import { api } from "@/lib/api";
|
||||
import { useCanvasStore, type WorkspaceNodeData } from "@/store/canvas";
|
||||
import { WS_URL } from "@/store/socket";
|
||||
import { closeWebSocketGracefully } from "@/lib/ws-close";
|
||||
import { type ChatMessage, createMessage, appendMessageDeduped } from "./chat/types";
|
||||
import { extractResponseText, extractRequestText } from "./chat/message-parser";
|
||||
import { type ChatMessage, type ChatAttachment, createMessage, appendMessageDeduped } from "./chat/types";
|
||||
import { uploadChatFiles, downloadChatFile } from "./chat/uploads";
|
||||
import { AttachmentChip, PendingAttachmentPill } from "./chat/AttachmentViews";
|
||||
import { extractResponseText, extractRequestText, extractFilesFromTask } from "./chat/message-parser";
|
||||
import { AgentCommsPanel } from "./chat/AgentCommsPanel";
|
||||
import { appendActivityLine } from "./chat/activityLog";
|
||||
import { runtimeDisplayName } from "@/lib/runtime-names";
|
||||
import { ConfirmDialog } from "@/components/ConfirmDialog";
|
||||
|
||||
@ -21,10 +24,18 @@ interface Props {
|
||||
type ChatSubTab = "my-chat" | "agent-comms";
|
||||
|
||||
// A2A response shape (subset). The full schema is in @a2a-js/sdk but we only
|
||||
// need parts/artifacts text extraction for the synchronous fallback path.
|
||||
// need parts/artifacts text + file extraction for the synchronous fallback.
|
||||
/** Subset of the A2A file-part payload. A file may arrive by
 * reference (`uri`) or inline (`bytes` — NOTE(review): presumably
 * base64, confirm against the @a2a-js/sdk schema). */
interface A2AFileRef {
  name?: string;
  mimeType?: string;
  uri?: string;
  bytes?: string;
  size?: number;
}
||||
interface A2APart {
|
||||
kind: string;
|
||||
text: string;
|
||||
text?: string;
|
||||
file?: A2AFileRef;
|
||||
}
|
||||
interface A2AResponse {
|
||||
result?: {
|
||||
@ -33,25 +44,81 @@ interface A2AResponse {
|
||||
};
|
||||
}
|
||||
|
||||
/** Detect activity-log rows that the workspace's own runtime fired
|
||||
* against itself but were misclassified as canvas-source. The proper
|
||||
* fix is the X-Workspace-ID header from `self_source_headers()` in
|
||||
* workspace/platform_auth.py, which makes the platform record
|
||||
* source_id = workspace_id. But three failure modes still leak a
|
||||
* self-message into "My Chat":
|
||||
*
|
||||
* 1. Historical rows already in the DB with source_id=NULL.
|
||||
* 2. Workspace containers running pre-fix heartbeat.py / main.py
|
||||
* (the fix only takes effect after an image rebuild + redeploy).
|
||||
* 3. Future internal triggers added without the helper.
|
||||
*
|
||||
* This client-side filter recognises the heartbeat trigger by its
|
||||
* exact prefix — the heartbeat assembles
|
||||
*
|
||||
* "Delegation results are ready. Review them and take appropriate
|
||||
* action:\n" + summary_lines + report_instruction
|
||||
*
|
||||
* in workspace/heartbeat.py. The prefix is template-fixed so a
|
||||
* string match is reliable. If the heartbeat copy ever changes,
|
||||
* update this constant in the same commit.
|
||||
*
|
||||
* This is a backstop, not the primary defence — the X-Workspace-ID
|
||||
* header is. Filtering content is fragile to copy edits, so keep
|
||||
* the list narrow. */
|
||||
const INTERNAL_SELF_MESSAGE_PREFIXES = [
|
||||
"Delegation results are ready. Review them and take appropriate action",
|
||||
];
|
||||
|
||||
function isInternalSelfMessage(text: string): boolean {
|
||||
return INTERNAL_SELF_MESSAGE_PREFIXES.some((p) => text.startsWith(p));
|
||||
}
|
||||
|
||||
// extractReplyText pulls the agent's text reply out of an A2A response.
|
||||
// Mirrors the Go-side extractReplyText in workspace-server/internal/channels/manager.go.
|
||||
// Concatenates ALL text parts (joined with "\n") rather than returning
|
||||
// just the first. Claude Code and other runtimes commonly emit multi-
|
||||
// part text replies for long content (markdown tables, code blocks),
|
||||
// and the prior "first part wins" implementation silently truncated
|
||||
// the rest — observed on a 15k-char Wave 1 brief that rendered only
|
||||
// the table header. Mirrors extractTextsFromParts in message-parser.ts.
|
||||
//
|
||||
// Server-side counterpart in workspace-server/internal/channels/
|
||||
// manager.go has the same single-part bug; fix that too if/when a
|
||||
// channel-delivered reply (Slack, Lark, etc.) gets truncated.
|
||||
function extractReplyText(resp: A2AResponse): string {
|
||||
const collect = (parts: A2APart[] | undefined): string => {
|
||||
if (!parts) return "";
|
||||
return parts
|
||||
.filter((p) => p.kind === "text")
|
||||
.map((p) => p.text ?? "")
|
||||
.filter(Boolean)
|
||||
.join("\n");
|
||||
};
|
||||
const result = resp?.result;
|
||||
if (result?.parts) {
|
||||
for (const p of result.parts) {
|
||||
if (p.kind === "text") return p.text;
|
||||
}
|
||||
}
|
||||
const collected: string[] = [];
|
||||
const fromParts = collect(result?.parts);
|
||||
if (fromParts) collected.push(fromParts);
|
||||
// Walk artifacts even if parts had text — some producers (Hermes
|
||||
// tool calls) emit a summary in parts AND details in artifacts.
|
||||
// Returning early on parts dropped the artifact body silently.
|
||||
if (result?.artifacts) {
|
||||
for (const a of result.artifacts) {
|
||||
for (const p of a.parts || []) {
|
||||
if (p.kind === "text") return p.text;
|
||||
}
|
||||
const t = collect(a.parts);
|
||||
if (t) collected.push(t);
|
||||
}
|
||||
}
|
||||
return "";
|
||||
return collected.join("\n");
|
||||
}
|
||||
|
||||
// Agent-returned files live on the same response shape as text —
|
||||
// delegated to extractFilesFromTask in message-parser.ts, which also
|
||||
// walks status.message.parts (that ChatTab's legacy text extractor
|
||||
// doesn't). Single source of truth for file-part parsing across
|
||||
// live chat, activity log replay, and any future consumers.
|
||||
|
||||
/**
|
||||
* Load chat history from the activity_logs database via the platform API.
|
||||
* Uses source=canvas to only get user-initiated messages (not agent-to-agent).
|
||||
@ -71,16 +138,23 @@ async function loadMessagesFromDB(workspaceId: string): Promise<{ messages: Chat
|
||||
for (const a of [...activities].reverse()) {
|
||||
// Extract user message from request_body
|
||||
const userText = extractRequestText(a.request_body);
|
||||
if (userText) {
|
||||
if (userText && !isInternalSelfMessage(userText)) {
|
||||
messages.push(createMessage("user", userText));
|
||||
}
|
||||
|
||||
// Extract agent response
|
||||
// Extract agent response — text AND any file attachments so a
|
||||
// chat reload surfaces historical download chips, not just plain
|
||||
// text. `result` is nested on successful A2A responses; some
|
||||
// older rows stored the raw `result` payload at the top level,
|
||||
// so fall back to the body itself when `.result` is absent.
|
||||
if (a.response_body) {
|
||||
const text = extractResponseText(a.response_body);
|
||||
if (text) {
|
||||
const attachments = extractFilesFromTask(
|
||||
(a.response_body.result ?? a.response_body) as Record<string, unknown>,
|
||||
);
|
||||
if (text || attachments.length > 0) {
|
||||
const role = a.status === "error" || text.toLowerCase().startsWith("agent error") ? "system" : "agent";
|
||||
messages.push({ ...createMessage(role, text), timestamp: a.created_at });
|
||||
messages.push({ ...createMessage(role, text, attachments), timestamp: a.created_at });
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -178,7 +252,16 @@ export function ChatTab({ workspaceId, data }: Props) {
|
||||
function MyChatPanel({ workspaceId, data }: Props) {
|
||||
const [messages, setMessages] = useState<ChatMessage[]>([]);
|
||||
const [input, setInput] = useState("");
|
||||
const [sending, setSending] = useState(!!data.currentTask);
|
||||
// `sending` is strictly the "this tab kicked off a send and hasn't
|
||||
// seen the reply yet" signal. Previously this was initialized from
|
||||
// data.currentTask to pick up in-flight agent work on mount, but
|
||||
// that conflated agent-busy (workspace heartbeat) with user-
|
||||
// in-flight (local send): when the WS dropped a TASK_COMPLETE event,
|
||||
// currentTask lingered, the component re-mounted with sending=true,
|
||||
// and the Send button stayed disabled forever even though nothing
|
||||
// local was in flight. For the "agent is busy, show spinner" UX,
|
||||
// use data.currentTask directly in the render path.
|
||||
const [sending, setSending] = useState(false);
|
||||
const [thinkingElapsed, setThinkingElapsed] = useState(0);
|
||||
const [activityLog, setActivityLog] = useState<string[]>([]);
|
||||
const [loading, setLoading] = useState(true);
|
||||
@ -189,6 +272,17 @@ function MyChatPanel({ workspaceId, data }: Props) {
|
||||
const [error, setError] = useState<string | null>(null);
|
||||
const [confirmRestart, setConfirmRestart] = useState(false);
|
||||
const bottomRef = useRef<HTMLDivElement>(null);
|
||||
// Files the user has picked but not yet sent. Cleared on send
|
||||
// (upload success) or by the × on each pill.
|
||||
const [pendingFiles, setPendingFiles] = useState<File[]>([]);
|
||||
const [uploading, setUploading] = useState(false);
|
||||
const fileInputRef = useRef<HTMLInputElement>(null);
|
||||
// Guard against a double-click during the upload phase: React
|
||||
// state updates from the click that started the upload haven't
|
||||
// flushed yet, so the disabled-button logic sees `uploading=false`
|
||||
// from the closure and lets a second `sendMessage` enter. A ref
|
||||
// observes the latest value synchronously.
|
||||
const sendInFlightRef = useRef(false);
|
||||
|
||||
// Load chat history from database on mount
|
||||
useEffect(() => {
|
||||
@ -231,8 +325,10 @@ function MyChatPanel({ workspaceId, data }: Props) {
|
||||
// Dedupe in case the agent proactively pushed the same text the
|
||||
// HTTP /a2a response already delivered (observed with the Hermes
|
||||
// runtime, which emits both a reply body and a send_message_to_user
|
||||
// push for the same content).
|
||||
setMessages((prev) => appendMessageDeduped(prev, createMessage("agent", m.content)));
|
||||
// push for the same content). Attachments ride along with the
|
||||
// message so files returned by the A2A_RESPONSE WS path render
|
||||
// their download chips.
|
||||
setMessages((prev) => appendMessageDeduped(prev, createMessage("agent", m.content, m.attachments)));
|
||||
}
|
||||
if (sendingFromAPIRef.current && msgs.length > 0) {
|
||||
setSending(false);
|
||||
@ -277,12 +373,21 @@ function MyChatPanel({ workspaceId, data }: Props) {
|
||||
try {
|
||||
const msg = JSON.parse(event.data);
|
||||
if (msg.event === "ACTIVITY_LOGGED") {
|
||||
// Filter to events for THIS workspace. The platform's
|
||||
// BroadcastOnly fires to every connected client, and
|
||||
// without this guard a sibling workspace's a2a_send would
|
||||
// surface as "→ Delegating to X..." inside the wrong
|
||||
// chat panel. (workspace_id on the WS envelope is the
|
||||
// workspace whose activity_log row we just wrote.)
|
||||
if (msg.workspace_id !== workspaceId) return;
|
||||
|
||||
const p = msg.payload || {};
|
||||
const type = p.activity_type as string;
|
||||
const method = (p.method as string) || "";
|
||||
const status = (p.status as string) || "";
|
||||
const targetId = (p.target_id as string) || "";
|
||||
const durationMs = p.duration_ms as number | undefined;
|
||||
const summary = (p.summary as string) || "";
|
||||
|
||||
let line = "";
|
||||
if (type === "a2a_receive" && method === "message/send") {
|
||||
@ -313,17 +418,23 @@ function MyChatPanel({ workspaceId, data }: Props) {
|
||||
const targetName = resolveWorkspaceName(targetId);
|
||||
line = `→ Delegating to ${targetName}...`;
|
||||
} else if (type === "task_update") {
|
||||
const summary = (p.summary as string) || "";
|
||||
if (summary) line = `⟳ ${summary}`;
|
||||
} else if (type === "agent_log") {
|
||||
// Per-tool-use telemetry from claude_sdk_executor's
|
||||
// _report_tool_use. The summary already carries an icon
|
||||
// + human-readable args (📄 Read /path, ⚡ Bash: …)
|
||||
// so we render it verbatim. No icon prefix here — the
|
||||
// emoji at the start of summary is the visual marker.
|
||||
if (summary) line = summary;
|
||||
}
|
||||
|
||||
if (line) {
|
||||
setActivityLog((prev) => [...prev.slice(-8), line]);
|
||||
setActivityLog((prev) => appendActivityLine(prev, line));
|
||||
}
|
||||
} else if (msg.event === "TASK_UPDATED" && msg.workspace_id === workspaceId) {
|
||||
const task = (msg.payload?.current_task as string) || "";
|
||||
if (task) {
|
||||
setActivityLog((prev) => [...prev.slice(-8), `⟳ ${task}`]);
|
||||
setActivityLog((prev) => appendActivityLine(prev, `⟳ ${task}`));
|
||||
}
|
||||
}
|
||||
// A2A_RESPONSE is already consumed by the store and its text is
|
||||
@ -339,10 +450,35 @@ function MyChatPanel({ workspaceId, data }: Props) {
|
||||
|
||||
const sendMessage = async () => {
|
||||
const text = input.trim();
|
||||
if (!text || !agentReachable || sending) return;
|
||||
const filesToSend = pendingFiles;
|
||||
// Allow sending if EITHER text OR attachments are present — a user
|
||||
// can drop a file with no text and the agent still receives it.
|
||||
if ((!text && filesToSend.length === 0) || !agentReachable || sending || uploading) return;
|
||||
// Synchronous re-entry guard — see sendInFlightRef comment.
|
||||
if (sendInFlightRef.current) return;
|
||||
sendInFlightRef.current = true;
|
||||
|
||||
// Upload attachments first so we can include URIs in the A2A
|
||||
// message parts. Sequential-before-send: a message with references
|
||||
// to files not yet staged would fail agent-side; staging happens
|
||||
// synchronously via /chat/uploads before message/send dispatch.
|
||||
let uploaded: ChatAttachment[] = [];
|
||||
if (filesToSend.length > 0) {
|
||||
setUploading(true);
|
||||
try {
|
||||
uploaded = await uploadChatFiles(workspaceId, filesToSend);
|
||||
} catch (e) {
|
||||
setUploading(false);
|
||||
sendInFlightRef.current = false;
|
||||
setError(e instanceof Error ? `Upload failed: ${e.message}` : "Upload failed");
|
||||
return;
|
||||
}
|
||||
setUploading(false);
|
||||
}
|
||||
|
||||
setInput("");
|
||||
setMessages((prev) => [...prev, createMessage("user", text)]);
|
||||
setPendingFiles([]);
|
||||
setMessages((prev) => [...prev, createMessage("user", text, uploaded)]);
|
||||
setSending(true);
|
||||
sendingFromAPIRef.current = true;
|
||||
setError(null);
|
||||
@ -356,40 +492,228 @@ function MyChatPanel({ workspaceId, data }: Props) {
|
||||
parts: [{ kind: "text", text: m.content }],
|
||||
}));
|
||||
|
||||
// A2A parts: text part (if any) + file parts (per attachment). The
|
||||
// agent sees both in a single turn, matching the A2A spec shape.
|
||||
const parts: A2APart[] = [];
|
||||
if (text) parts.push({ kind: "text", text });
|
||||
for (const att of uploaded) {
|
||||
parts.push({
|
||||
kind: "file",
|
||||
file: {
|
||||
name: att.name,
|
||||
mimeType: att.mimeType,
|
||||
uri: att.uri,
|
||||
size: att.size,
|
||||
},
|
||||
});
|
||||
}
|
||||
|
||||
// A2A calls can legitimately take minutes — LLM latency +
|
||||
// multi-turn tool use is common on slower providers (Hermes+minimax,
|
||||
// Claude Code invoking bash/file tools, etc.). The 15s default
|
||||
// would silently abort the fetch here, leaving the server to
|
||||
// complete the reply and the user staring at
|
||||
// "agent may be unreachable". Match the upload timeout (60s × 2)
|
||||
// for the happy-path ceiling; anything longer is genuinely stuck.
|
||||
api.post<A2AResponse>(`/workspaces/${workspaceId}/a2a`, {
|
||||
method: "message/send",
|
||||
params: {
|
||||
message: {
|
||||
role: "user",
|
||||
messageId: crypto.randomUUID(),
|
||||
parts: [{ kind: "text", text }],
|
||||
parts,
|
||||
},
|
||||
metadata: { history },
|
||||
},
|
||||
})
|
||||
}, { timeoutMs: 120_000 })
|
||||
.then((resp) => {
|
||||
// Skip if the WS A2A_RESPONSE event already handled this response.
|
||||
// Both paths (WS + HTTP) check sendingFromAPIRef — whichever clears
|
||||
// it first wins, the other becomes a no-op (no duplicate messages).
|
||||
if (!sendingFromAPIRef.current) return;
|
||||
const replyText = extractReplyText(resp);
|
||||
if (replyText) {
|
||||
setMessages((prev) => appendMessageDeduped(prev, createMessage("agent", replyText)));
|
||||
const replyFiles = extractFilesFromTask((resp?.result ?? {}) as Record<string, unknown>);
|
||||
if (replyText || replyFiles.length > 0) {
|
||||
setMessages((prev) =>
|
||||
appendMessageDeduped(prev, createMessage("agent", replyText, replyFiles)),
|
||||
);
|
||||
}
|
||||
setSending(false);
|
||||
sendingFromAPIRef.current = false;
|
||||
sendInFlightRef.current = false;
|
||||
})
|
||||
.catch(() => {
|
||||
// Same dedup guard as .then(): if a WS path (pendingAgentMsgs
|
||||
// or ACTIVITY_LOGGED a2a_receive ok) already delivered the
|
||||
// reply, sendingFromAPIRef is already false and there's
|
||||
// nothing to roll back. Surfacing "Failed to send" here would
|
||||
// contradict the agent reply the user is currently reading —
|
||||
// exactly the false-positive observed when the HTTP request
|
||||
// hung up (proxy idle / 502) after WS already won.
|
||||
if (!sendingFromAPIRef.current) {
|
||||
sendInFlightRef.current = false;
|
||||
return;
|
||||
}
|
||||
setSending(false);
|
||||
sendingFromAPIRef.current = false;
|
||||
sendInFlightRef.current = false;
|
||||
setError("Failed to send message — agent may be unreachable");
|
||||
});
|
||||
};
|
||||
|
||||
const onFilesPicked = (fileList: FileList | null) => {
|
||||
if (!fileList) return;
|
||||
const picked = Array.from(fileList);
|
||||
// Deduplicate against current pending set by name+size — user
|
||||
// picking the same file twice shouldn't append it.
|
||||
setPendingFiles((prev) => {
|
||||
const keyed = new Set(prev.map((f) => `${f.name}:${f.size}`));
|
||||
return [...prev, ...picked.filter((f) => !keyed.has(`${f.name}:${f.size}`))];
|
||||
});
|
||||
if (fileInputRef.current) fileInputRef.current.value = "";
|
||||
};
|
||||
|
||||
const removePendingFile = (index: number) =>
|
||||
setPendingFiles((prev) => prev.filter((_, i) => i !== index));
|
||||
|
||||
// Monotonic counter appended to synthetic paste filenames so two paste
// events within the same wall-clock second still produce distinct
// names. Without it, on Firefox (where pasted images have an empty
// `file.name`), two pastes ~100ms apart could yield identical
// synthetic names AND identical sizes, collapsing into one attachment
// via the `name:size` dedup in onFilesPicked.
const pasteCounterRef = useRef(0);
|
||||
|
||||
/** Paste-from-clipboard image attachment.
|
||||
*
|
||||
* Browser clipboard image items arrive as `File`s whose `name` is
|
||||
* often a generic "image.png" (Chrome) or empty (Firefox/Safari),
|
||||
* so two consecutive screenshot pastes collide on the name+size
|
||||
* dedup the file-picker uses. Re-tag each pasted image with a
|
||||
* per-paste unique name so dedup keeps them apart and the upload
|
||||
* pipeline (which expects a non-empty filename) is happy.
|
||||
*
|
||||
* Falls through to onFilesPicked via direct File[] (NOT through
|
||||
* the DataTransfer constructor — that throws on Safari < 14.1
|
||||
* and old Edge, silently aborting the paste).
|
||||
*
|
||||
* Only intercepts the paste when the clipboard has at least one
|
||||
* image; text-only pastes fall through to the textarea's default
|
||||
* behaviour. */
|
||||
const mimeToExt = (mime: string): string => {
|
||||
// Avoid raw `mime.split("/")[1]` — that yields `"svg+xml"`,
|
||||
// `"jpeg"`, `"webp"` etc. which produce ugly filenames and may
|
||||
// trip server-side extension allowlists. Map known types
|
||||
// explicitly; unknown falls back to a safe default.
|
||||
if (mime === "image/svg+xml") return "svg";
|
||||
if (mime === "image/jpeg") return "jpg";
|
||||
if (mime === "image/png") return "png";
|
||||
if (mime === "image/gif") return "gif";
|
||||
if (mime === "image/webp") return "webp";
|
||||
if (mime === "image/heic") return "heic";
|
||||
return "png";
|
||||
};
|
||||
|
||||
const onPasteIntoComposer = (e: React.ClipboardEvent<HTMLTextAreaElement>) => {
|
||||
if (!dropEnabled) return;
|
||||
const items = e.clipboardData?.items;
|
||||
if (!items || items.length === 0) return;
|
||||
const imageFiles: File[] = [];
|
||||
for (let i = 0; i < items.length; i++) {
|
||||
const item = items[i];
|
||||
if (!item.type.startsWith("image/")) continue;
|
||||
const file = item.getAsFile();
|
||||
if (!file) continue;
|
||||
const ext = mimeToExt(file.type);
|
||||
const stamp = new Date()
|
||||
.toISOString()
|
||||
.replace(/[:.]/g, "-")
|
||||
.slice(0, 19);
|
||||
const seq = pasteCounterRef.current++;
|
||||
const fname = `pasted-${stamp}-${seq}-${i}.${ext}`;
|
||||
imageFiles.push(new File([file], fname, { type: file.type }));
|
||||
}
|
||||
if (imageFiles.length === 0) return;
|
||||
e.preventDefault();
|
||||
// Reuse the picker path so file-size guards, dedup, and pending-
|
||||
// list state all run through the same code. Build a synthetic
|
||||
// FileList-like object to avoid the DataTransfer constructor —
|
||||
// that's missing on Safari < 14.1 / old Edge and would silently
|
||||
// throw, leaving the paste a no-op.
|
||||
addPastedFiles(imageFiles);
|
||||
};
|
||||
|
||||
// Variant of onFilesPicked that accepts a File[] directly, sidestepping
|
||||
// the DataTransfer-FileList round-trip. Same dedup + state shape.
|
||||
const addPastedFiles = (files: File[]) => {
|
||||
setPendingFiles((prev) => {
|
||||
const keyed = new Set(prev.map((f) => `${f.name}:${f.size}`));
|
||||
return [...prev, ...files.filter((f) => !keyed.has(`${f.name}:${f.size}`))];
|
||||
});
|
||||
};
|
||||
|
||||
// Drag-and-drop staging state. dragDepthRef counts dragenter vs
// dragleave events so the overlay doesn't flicker when the cursor
// crosses nested children (textarea, buttons) — those events fire at
// every element boundary.
const [dragOver, setDragOver] = useState(false);
const dragDepthRef = useRef(0);
// Drops are accepted only while the agent is reachable and no send or
// upload is in flight.
const dropEnabled = agentReachable && !sending && !uploading;
// True when the drag payload contains OS files (vs text/element drags).
const isFileDrag = (e: React.DragEvent) =>
  Array.from(e.dataTransfer.types || []).includes("Files");
|
||||
|
||||
const onDragEnter = (e: React.DragEvent) => {
|
||||
if (!dropEnabled || !isFileDrag(e)) return;
|
||||
e.preventDefault();
|
||||
dragDepthRef.current += 1;
|
||||
setDragOver(true);
|
||||
};
|
||||
const onDragOver = (e: React.DragEvent) => {
|
||||
if (!dropEnabled || !isFileDrag(e)) return;
|
||||
e.preventDefault();
|
||||
e.dataTransfer.dropEffect = "copy";
|
||||
};
|
||||
const onDragLeave = (e: React.DragEvent) => {
|
||||
if (!dropEnabled || !isFileDrag(e)) return;
|
||||
dragDepthRef.current = Math.max(0, dragDepthRef.current - 1);
|
||||
if (dragDepthRef.current === 0) setDragOver(false);
|
||||
};
|
||||
const onDrop = (e: React.DragEvent) => {
|
||||
if (!dropEnabled || !isFileDrag(e)) return;
|
||||
e.preventDefault();
|
||||
dragDepthRef.current = 0;
|
||||
setDragOver(false);
|
||||
onFilesPicked(e.dataTransfer.files);
|
||||
};
|
||||
|
||||
const downloadAttachment = (att: ChatAttachment) => {
|
||||
// Errors here are rare but user-visible (401 on a revoked token,
|
||||
// 404 if the agent deleted the file). Surface via the inline
|
||||
// error banner — the message list itself stays untouched.
|
||||
downloadChatFile(workspaceId, att).catch((e) => {
|
||||
setError(e instanceof Error ? `Download failed: ${e.message}` : "Download failed");
|
||||
});
|
||||
};
|
||||
|
||||
const isOnline = data.status === "online" || data.status === "degraded";
|
||||
|
||||
return (
|
||||
<div className="flex flex-col h-full">
|
||||
<div
|
||||
className="flex flex-col h-full relative"
|
||||
onDragEnter={onDragEnter}
|
||||
onDragOver={onDragOver}
|
||||
onDragLeave={onDragLeave}
|
||||
onDrop={onDrop}
|
||||
>
|
||||
{dragOver && (
|
||||
<div
|
||||
className="absolute inset-0 z-20 flex items-center justify-center bg-blue-500/10 border-2 border-dashed border-blue-400 rounded pointer-events-none"
|
||||
aria-live="polite"
|
||||
>
|
||||
<div className="bg-zinc-900/90 border border-blue-400/50 rounded-lg px-4 py-2 text-xs text-blue-200">
|
||||
Drop to attach
|
||||
</div>
|
||||
</div>
|
||||
)}
|
||||
{/* Messages */}
|
||||
<div className="flex-1 overflow-y-auto p-3 space-y-3">
|
||||
{loading && (
|
||||
@ -435,9 +759,23 @@ function MyChatPanel({ workspaceId, data }: Props) {
|
||||
: "bg-zinc-800/80 text-zinc-200 border border-zinc-700/30"
|
||||
}`}
|
||||
>
|
||||
<div className="prose prose-sm prose-invert max-w-none [&>p]:mb-1 [&>p:last-child]:mb-0">
|
||||
<ReactMarkdown remarkPlugins={[remarkGfm]}>{msg.content}</ReactMarkdown>
|
||||
</div>
|
||||
{msg.content && (
|
||||
<div className="prose prose-sm prose-invert max-w-none [&>p]:mb-1 [&>p:last-child]:mb-0">
|
||||
<ReactMarkdown remarkPlugins={[remarkGfm]}>{msg.content}</ReactMarkdown>
|
||||
</div>
|
||||
)}
|
||||
{msg.attachments && msg.attachments.length > 0 && (
|
||||
<div className={`flex flex-wrap gap-1 ${msg.content ? "mt-1.5" : ""}`}>
|
||||
{msg.attachments.map((att, i) => (
|
||||
<AttachmentChip
|
||||
key={`${msg.id}-${i}`}
|
||||
attachment={att}
|
||||
onDownload={downloadAttachment}
|
||||
tone={msg.role === "user" ? "user" : "agent"}
|
||||
/>
|
||||
))}
|
||||
</div>
|
||||
)}
|
||||
<div className="text-[9px] text-zinc-500 mt-1">
|
||||
{new Date(msg.timestamp).toLocaleTimeString()}
|
||||
</div>
|
||||
@ -445,8 +783,11 @@ function MyChatPanel({ workspaceId, data }: Props) {
|
||||
</div>
|
||||
))}
|
||||
|
||||
{/* Thinking indicator */}
|
||||
{sending && (
|
||||
{/* Thinking indicator — shows when this tab is awaiting a reply
|
||||
OR when the workspace heartbeat reports an in-flight task
|
||||
(covers the "agent is already busy when I open the tab" case
|
||||
without locking the Send button on a stale currentTask). */}
|
||||
{(sending || !!data.currentTask) && (
|
||||
<div className="flex justify-start">
|
||||
<div className="bg-zinc-800/50 border border-zinc-700/30 rounded-lg px-3 py-2 max-w-[85%]">
|
||||
<div className="flex items-center gap-2 text-xs text-zinc-400">
|
||||
@ -490,7 +831,37 @@ function MyChatPanel({ workspaceId, data }: Props) {
|
||||
|
||||
{/* Input */}
|
||||
<div className="p-3 border-t border-zinc-800">
|
||||
<div className="flex gap-2">
|
||||
{pendingFiles.length > 0 && (
|
||||
<div className="flex flex-wrap gap-1.5 mb-2">
|
||||
{pendingFiles.map((f, i) => (
|
||||
<PendingAttachmentPill
|
||||
key={`${f.name}-${f.size}-${i}`}
|
||||
file={f}
|
||||
onRemove={() => removePendingFile(i)}
|
||||
/>
|
||||
))}
|
||||
</div>
|
||||
)}
|
||||
<div className="flex gap-2 items-end">
|
||||
<input
|
||||
ref={fileInputRef}
|
||||
type="file"
|
||||
multiple
|
||||
className="hidden"
|
||||
onChange={(e) => onFilesPicked(e.target.files)}
|
||||
aria-hidden="true"
|
||||
/>
|
||||
<button
|
||||
onClick={() => fileInputRef.current?.click()}
|
||||
disabled={!agentReachable || sending || uploading}
|
||||
aria-label="Attach file"
|
||||
title="Attach file"
|
||||
className="p-2 bg-zinc-800 hover:bg-zinc-700 border border-zinc-700 rounded-lg text-zinc-400 hover:text-zinc-200 transition-colors shrink-0 disabled:opacity-40"
|
||||
>
|
||||
<svg width="14" height="14" viewBox="0 0 16 16" fill="none" aria-hidden="true">
|
||||
<path d="M11 6.5 7 10.5a2 2 0 1 0 2.8 2.8l4-4a3.5 3.5 0 0 0-5-5l-4.5 4.5a5 5 0 0 0 7 7l4-4" stroke="currentColor" strokeWidth="1.4" strokeLinecap="round" strokeLinejoin="round" />
|
||||
</svg>
|
||||
</button>
|
||||
<textarea
|
||||
aria-label="Message to agent"
|
||||
value={input}
|
||||
@ -501,17 +872,18 @@ function MyChatPanel({ workspaceId, data }: Props) {
|
||||
sendMessage();
|
||||
}
|
||||
}}
|
||||
placeholder={agentReachable ? "Send a message... (Shift+Enter for new line)" : `Agent is ${data.status}`}
|
||||
onPaste={onPasteIntoComposer}
|
||||
placeholder={agentReachable ? "Send a message... (Shift+Enter for new line, paste images to attach)" : `Agent is ${data.status}`}
|
||||
disabled={!agentReachable || sending}
|
||||
rows={1}
|
||||
className="flex-1 bg-zinc-800 border border-zinc-700 rounded-lg px-3 py-2 text-xs text-zinc-200 placeholder-zinc-500 focus:outline-none focus:border-blue-500 resize-none disabled:opacity-50"
|
||||
/>
|
||||
<button
|
||||
onClick={sendMessage}
|
||||
disabled={!input.trim() || !agentReachable || sending}
|
||||
disabled={(!input.trim() && pendingFiles.length === 0) || !agentReachable || sending || uploading}
|
||||
className="px-4 py-2 bg-blue-600 hover:bg-blue-500 text-xs font-medium rounded-lg text-white disabled:opacity-30 transition-colors shrink-0"
|
||||
>
|
||||
Send
|
||||
{uploading ? "Uploading…" : "Send"}
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
@ -105,12 +105,17 @@ interface RuntimeOption {
|
||||
// Fallback used when /templates can't be fetched (offline, older backend).
|
||||
// Keep in sync with manifest.json workspace_templates as a defensive default.
|
||||
// Model + env suggestions only flow when the backend is reachable.
|
||||
//
|
||||
// Runtimes that manage their own config outside the platform's config.yaml
|
||||
// template. For these, a missing config.yaml is expected — the user manages
|
||||
// config via the runtime's own mechanism (e.g. hermes edits
|
||||
// ~/.hermes/config.yaml on the workspace EC2 via the Terminal tab or its
|
||||
// own CLI). Showing a "No config.yaml found" error for these is misleading.
|
||||
const RUNTIMES_WITH_OWN_CONFIG = new Set<string>(["hermes", "external"]);
|
||||
// template. For these, a missing config.yaml is expected and the form
|
||||
// genuinely can't edit the runtime's settings (there's no platform file
|
||||
// to write). Hermes is NOT on this list: it DOES ship a platform
|
||||
// config.yaml via workspace-configs-templates/hermes that controls model,
|
||||
// runtime_config, required_env, etc. Editing it through this form is
|
||||
// exactly the point of the platform adaptor. The deep `~/.hermes/
|
||||
// config.yaml` on the container is a separate runtime-internal file,
|
||||
// not this one.
|
||||
const RUNTIMES_WITH_OWN_CONFIG = new Set<string>(["external"]);
|
||||
|
||||
const FALLBACK_RUNTIME_OPTIONS: RuntimeOption[] = [
|
||||
{ value: "", label: "LangGraph (default)", models: [] },
|
||||
@ -152,9 +157,11 @@ export function ConfigTab({ workspaceId }: Props) {
|
||||
// default `LangGraph`. See GH #1894.
|
||||
let wsMetadataRuntime = "";
|
||||
let wsMetadataModel = "";
|
||||
let wsMetadataTier: number | null = null;
|
||||
try {
|
||||
const ws = await api.get<{ runtime?: string }>(`/workspaces/${workspaceId}`);
|
||||
const ws = await api.get<{ runtime?: string; tier?: number }>(`/workspaces/${workspaceId}`);
|
||||
wsMetadataRuntime = (ws.runtime || "").trim();
|
||||
if (typeof ws.tier === "number") wsMetadataTier = ws.tier;
|
||||
} catch { /* fall back to config.yaml */ }
|
||||
try {
|
||||
const m = await api.get<{ model?: string }>(`/workspaces/${workspaceId}/model`);
|
||||
@ -166,11 +173,15 @@ export function ConfigTab({ workspaceId }: Props) {
|
||||
const parsed = parseYaml(res.content);
|
||||
setOriginalYaml(res.content);
|
||||
setRawDraft(res.content);
|
||||
// Merge: config.yaml wins for fields it declares, but workspace metadata
|
||||
// wins for runtime + model when config.yaml doesn't set them.
|
||||
// Merge: workspace-row metadata is authoritative for the DB-backed
|
||||
// fields (tier, runtime, model). config.yaml often lags — handleSave
|
||||
// PATCHes tier/runtime directly and a template snapshot in the
|
||||
// container can differ from the live row. Show the DB value so the
|
||||
// form doesn't contradict the node badge (issue: badge=T3, form=T2).
|
||||
const merged = { ...DEFAULT_CONFIG, ...parsed } as ConfigData;
|
||||
if (!merged.runtime && wsMetadataRuntime) merged.runtime = wsMetadataRuntime;
|
||||
if (!merged.model && wsMetadataModel) merged.model = wsMetadataModel;
|
||||
if (wsMetadataRuntime) merged.runtime = wsMetadataRuntime;
|
||||
if (wsMetadataModel) merged.model = wsMetadataModel;
|
||||
if (wsMetadataTier !== null) merged.tier = wsMetadataTier;
|
||||
setConfig(merged);
|
||||
} catch {
|
||||
// No platform-managed config.yaml. Some runtimes (hermes, external)
|
||||
@ -185,6 +196,7 @@ export function ConfigTab({ workspaceId }: Props) {
|
||||
...DEFAULT_CONFIG,
|
||||
runtime: wsMetadataRuntime,
|
||||
model: wsMetadataModel,
|
||||
...(wsMetadataTier !== null ? { tier: wsMetadataTier } : {}),
|
||||
} as ConfigData);
|
||||
} finally {
|
||||
setLoading(false);
|
||||
|
||||
@ -36,7 +36,7 @@ export function DetailsTab({ workspaceId, data }: Props) {
|
||||
const [restartError, setRestartError] = useState<string | null>(null);
|
||||
const [consoleOpen, setConsoleOpen] = useState(false);
|
||||
const updateNodeData = useCanvasStore((s) => s.updateNodeData);
|
||||
const removeNode = useCanvasStore((s) => s.removeNode);
|
||||
const removeSubtree = useCanvasStore((s) => s.removeSubtree);
|
||||
const selectNode = useCanvasStore((s) => s.selectNode);
|
||||
// Ref for the "Delete Workspace" trigger — Cancel returns focus here
|
||||
const deleteButtonRef = useRef<HTMLButtonElement>(null);
|
||||
@ -94,7 +94,11 @@ export function DetailsTab({ workspaceId, data }: Props) {
|
||||
setDeleteError(null);
|
||||
try {
|
||||
await api.del(`/workspaces/${workspaceId}?confirm=true`);
|
||||
removeNode(workspaceId);
|
||||
// Mirror the server-side cascade — drop the row + every
|
||||
// descendant locally so the canvas reflects the deletion
|
||||
// immediately, even when the WS is dead and the per-descendant
|
||||
// WORKSPACE_REMOVED events never arrive.
|
||||
removeSubtree(workspaceId);
|
||||
selectNode(null);
|
||||
} catch (e) {
|
||||
setDeleteError(e instanceof Error ? e.message : "Failed to delete");
|
||||
|
||||
@ -6,6 +6,14 @@ import { useCanvasStore, summarizeWorkspaceCapabilities, type WorkspaceNodeData
|
||||
import { showToast } from "../Toaster";
|
||||
|
||||
interface Props {
|
||||
// The workspace's id is NOT a field on WorkspaceNodeData — that
|
||||
// interface is the React Flow `node.data` blob, while the id lives
|
||||
// on `node.id`. Pass it explicitly (matches every other tab in
|
||||
// SidePanel) so the install/uninstall API calls don't end up
|
||||
// POSTing to /workspaces/undefined/plugins. The interface extending
|
||||
// Record<string, unknown> meant TypeScript silently typed
|
||||
// `data.id` as `unknown` instead of erroring — easy to miss.
|
||||
workspaceId: string;
|
||||
data: WorkspaceNodeData;
|
||||
}
|
||||
|
||||
@ -40,7 +48,7 @@ interface SourceSchemesResponse {
|
||||
// Delay before reloading installed plugins after install/uninstall (workspace restarts)
|
||||
const PLUGIN_RELOAD_DELAY_MS = 15_000;
|
||||
|
||||
export function SkillsTab({ data }: Props) {
|
||||
export function SkillsTab({ workspaceId, data }: Props) {
|
||||
const capability = summarizeWorkspaceCapabilities(data);
|
||||
const skills = useMemo(() => extractSkills(data.agentCard), [data.agentCard]);
|
||||
const setPanelTab = useCanvasStore((s) => s.setPanelTab);
|
||||
@ -57,32 +65,115 @@ export function SkillsTab({ data }: Props) {
|
||||
const reloadTimerRef = useRef<ReturnType<typeof setTimeout>>(undefined);
|
||||
|
||||
useEffect(() => {
|
||||
// Re-init `mountedRef.current = true` on every mount. React 18
|
||||
// StrictMode (Next.js dev) double-invokes effects: mount →
|
||||
// cleanup → mount. Without this re-init, the first cleanup sets
|
||||
// mountedRef.current = false, the re-mount runs the effect body
|
||||
// again but never restores the flag, so every subsequent
|
||||
// `if (mountedRef.current) setX(...)` guard skips and the
|
||||
// component appears wedged: fetches complete, state never
|
||||
// updates, "Loading…" sits forever. Production doesn't double-
|
||||
// invoke so the bug only surfaces in dev — but dev is where we
|
||||
// see it, and the cost of being explicit is one assignment.
|
||||
mountedRef.current = true;
|
||||
return () => {
|
||||
mountedRef.current = false;
|
||||
clearTimeout(reloadTimerRef.current);
|
||||
};
|
||||
}, []);
|
||||
|
||||
const workspaceId = data.id;
|
||||
// Tracks whether loadInstalled has completed at least once (success
|
||||
// or empty-array success — NOT failure). Without this the auto-
|
||||
// expand effect below would fire on the initial render where
|
||||
// `installed.length === 0` simply because the fetch hasn't returned
|
||||
// yet, and worse, would also fire if the fetch throws (network
|
||||
// blip, auth failure) — both cases falsely look like "no plugins
|
||||
// installed". Gating on a separate "loaded" flag avoids the false
|
||||
// positive.
|
||||
const [installedLoaded, setInstalledLoaded] = useState(false);
|
||||
|
||||
const loadInstalled = useCallback(async () => {
|
||||
try {
|
||||
const result = await api.get<PluginInfo[]>(`/workspaces/${workspaceId}/plugins`);
|
||||
if (mountedRef.current) setInstalled(Array.isArray(result) ? result : []);
|
||||
if (mountedRef.current) {
|
||||
setInstalled(Array.isArray(result) ? result : []);
|
||||
setInstalledLoaded(true);
|
||||
}
|
||||
} catch (e) {
|
||||
console.warn("SkillsTab: installed plugins load failed", e);
|
||||
}
|
||||
}, [workspaceId]);
|
||||
|
||||
const loadRegistry = useCallback(async () => {
|
||||
// registry-load lifecycle so the UI can show "Loading…" / error /
|
||||
// retry instead of an indistinguishable "No plugins in registry"
|
||||
// banner whether the fetch is in-flight, errored, or genuinely
|
||||
// returned []. The previous silent console.warn-only path made
|
||||
// an auth failure or CORS blip look identical to an empty
|
||||
// registry — exactly the diagnosis dead-end observed when the
|
||||
// server returned 20 plugins via curl but the canvas showed 0.
|
||||
const [registryLoading, setRegistryLoading] = useState(false);
|
||||
const [registryError, setRegistryError] = useState<string | null>(null);
|
||||
|
||||
// Synchronous gate against concurrent loadRegistry runs. Refs survive
|
||||
// Fast Refresh re-renders (ref objects persist across re-runs of
|
||||
// the function body), so a previously-stranded fetch can pin this
|
||||
// ref at true and block every subsequent loadRegistry call. The
|
||||
// `force` parameter on loadRegistry below provides the user-driven
|
||||
// escape hatch for that wedge.
|
||||
const registryFetchInFlight = useRef(false);
|
||||
|
||||
// Reset the in-flight gate on unmount so a Fast Refresh that
|
||||
// tears down + recreates the component without a full page reload
|
||||
// doesn't carry the stuck-true value into the new instance via
|
||||
// dev-server-preserved module state.
|
||||
useEffect(() => {
|
||||
return () => {
|
||||
registryFetchInFlight.current = false;
|
||||
};
|
||||
}, []);
|
||||
|
||||
const loadRegistry = useCallback(async (force = false) => {
|
||||
// Default callers (mount effect, button while not loading) honour
|
||||
// the gate. Explicit force=true callers (Retry button) bypass it
|
||||
// — the user is signalling "forget whatever you thought was in
|
||||
// flight, fetch again now".
|
||||
if (!force && registryFetchInFlight.current) return;
|
||||
registryFetchInFlight.current = true;
|
||||
setRegistryLoading(true);
|
||||
setRegistryError(null);
|
||||
try {
|
||||
const result = await api.get<PluginInfo[]>("/plugins");
|
||||
// 10s timeout — tighter than the 15s default. Plugin registry
|
||||
// is local-disk-backed on the platform host (server reads
|
||||
// pluginsDir entries) so a 10s budget is generous. Without
|
||||
// an explicit timeout the UI's "Loading registry…" can sit
|
||||
// for the full 15s + any browser hop time when a Fast
|
||||
// Refresh strands an in-flight promise.
|
||||
const result = await api.get<PluginInfo[]>("/plugins", { timeoutMs: 10_000 });
|
||||
if (mountedRef.current) setRegistry(Array.isArray(result) ? result : []);
|
||||
} catch (e) {
|
||||
// Registry is the AVAILABLE PLUGINS list. Silent failure here
|
||||
// left the user seeing "No plugins in registry" with no clue
|
||||
// it was a fetch error — log it so devtools shows the cause.
|
||||
console.warn("SkillsTab: registry load failed", e);
|
||||
if (mountedRef.current) {
|
||||
// Detect timeout/abort by DOMException.name first — that's
|
||||
// the canonical signal across browsers. Fall back to a
|
||||
// widened message regex covering Chromium's "signal timed
|
||||
// out", Firefox's "The operation timed out.", Safari's
|
||||
// "Aborted". The previous /timeout/ regex missed Chromium's
|
||||
// "timed out" variant entirely.
|
||||
const name = (e as { name?: string })?.name ?? "";
|
||||
const msg = e instanceof Error ? e.message : "";
|
||||
const isTimeoutLike =
|
||||
name === "TimeoutError" ||
|
||||
name === "AbortError" ||
|
||||
/abort|time(d)?\s*out/i.test(msg);
|
||||
setRegistryError(
|
||||
isTimeoutLike
|
||||
? "Registry fetch timed out (10s). The platform server may be slow or unreachable."
|
||||
: msg || "Failed to load registry",
|
||||
);
|
||||
}
|
||||
} finally {
|
||||
registryFetchInFlight.current = false;
|
||||
if (mountedRef.current) setRegistryLoading(false);
|
||||
}
|
||||
}, []);
|
||||
|
||||
@ -102,17 +193,73 @@ export function SkillsTab({ data }: Props) {
|
||||
loadSourceSchemes();
|
||||
}, [loadInstalled, loadRegistry, loadSourceSchemes]);
|
||||
|
||||
// First-time experience: if the workspace has zero plugins
|
||||
// installed but the platform's registry has options to choose
|
||||
// from, expand the registry by default so the user sees what's
|
||||
// available without an extra click. Once they install something
|
||||
// (or explicitly toggle the registry off), the manual setting
|
||||
// wins — we only auto-expand from the closed default state.
|
||||
const hasAutoExpandedRef = useRef(false);
|
||||
useEffect(() => {
|
||||
if (hasAutoExpandedRef.current) return;
|
||||
if (installedLoaded && installed.length === 0 && registry.length > 0) {
|
||||
setShowRegistry(true);
|
||||
hasAutoExpandedRef.current = true;
|
||||
}
|
||||
}, [installedLoaded, installed.length, registry.length]);
|
||||
|
||||
const installedNames = useMemo(() => new Set(installed.map((p) => p.name)), [installed]);
|
||||
|
||||
// Install always goes through the source-based API. For registry
|
||||
// plugins we build the local:// source on the fly; custom sources
|
||||
// (github://, clawhub://, …) are typed into the input below.
|
||||
const installFromSource = async (source: string, labelOverride?: string) => {
|
||||
//
|
||||
// Optional `optimistic` parameter mirrors the uninstall flow's local
|
||||
// state mutation. Without it, the user sees the button revert from
|
||||
// "Installing..." → "Install" the instant the POST returns, and the
|
||||
// green "Installed" tag doesn't appear for ~15s while we wait out
|
||||
// PLUGIN_RELOAD_DELAY_MS for the workspace restart before refetching.
|
||||
// 15s of staring at the same button feels broken. Pushing the
|
||||
// registry entry into `installed` immediately makes the UI reflect
|
||||
// the install instantly; the delayed loadInstalled() reconciles
|
||||
// anything we got wrong (or any server-side filtering we don't
|
||||
// know about locally).
|
||||
const installFromSource = async (
|
||||
source: string,
|
||||
labelOverride?: string,
|
||||
optimistic?: PluginInfo,
|
||||
) => {
|
||||
const label = labelOverride ?? source;
|
||||
setInstalling(label);
|
||||
try {
|
||||
await api.post(`/workspaces/${workspaceId}/plugins`, { source });
|
||||
showToast(`Installed ${label} — restarting workspace`, "success");
|
||||
if (optimistic && mountedRef.current) {
|
||||
// Push with `supported_on_runtime` left undefined — the
|
||||
// server's ListInstalled annotates the real value (true /
|
||||
// false) at refetch time. Forcing `true` here would hide the
|
||||
// "inert on this runtime" badge for 15s if the user
|
||||
// installed a plugin that doesn't actually support the
|
||||
// workspace's runtime; the badge only renders on `=== false`,
|
||||
// so undefined keeps it neutral until reconciliation arrives.
|
||||
setInstalled((prev) =>
|
||||
prev.some((p) => p.name === optimistic.name)
|
||||
? prev
|
||||
: [...prev, { ...optimistic, supported_on_runtime: undefined }],
|
||||
);
|
||||
// Note: we intentionally do NOT set `installedLoaded` here.
|
||||
// That flag means "the initial GET has succeeded at least
|
||||
// once" and gates the auto-expand-registry effect. A fast
|
||||
// optimistic install BEFORE the initial fetch returns must
|
||||
// not flip the gate, or the auto-expand never fires and a
|
||||
// followup loadInstalled racing with the optimistic write
|
||||
// could overwrite our entry with [] mid-restart.
|
||||
}
|
||||
// Drop any prior reload timer before scheduling a new one —
|
||||
// back-to-back installs within PLUGIN_RELOAD_DELAY_MS would
|
||||
// otherwise queue multiple loadInstalled() calls and the
|
||||
// unmount cleanup only clears the latest handle.
|
||||
clearTimeout(reloadTimerRef.current);
|
||||
reloadTimerRef.current = setTimeout(() => loadInstalled(), PLUGIN_RELOAD_DELAY_MS);
|
||||
} catch (e) {
|
||||
showToast(e instanceof Error ? e.message : "Install failed", "error");
|
||||
@ -121,7 +268,10 @@ export function SkillsTab({ data }: Props) {
|
||||
}
|
||||
};
|
||||
|
||||
const handleInstall = (pluginName: string) => installFromSource(`local://${pluginName}`, pluginName);
|
||||
const handleInstall = (pluginName: string) => {
|
||||
const entry = registry.find((p) => p.name === pluginName);
|
||||
return installFromSource(`local://${pluginName}`, pluginName, entry);
|
||||
};
|
||||
|
||||
const handleInstallCustom = async () => {
|
||||
const source = customSource.trim();
|
||||
@ -133,9 +283,12 @@ export function SkillsTab({ data }: Props) {
|
||||
const handleUninstall = async (pluginName: string) => {
|
||||
setUninstalling(pluginName);
|
||||
try {
|
||||
await api.del(`/workspaces/${data.id}/plugins/${pluginName}`);
|
||||
await api.del(`/workspaces/${workspaceId}/plugins/${pluginName}`);
|
||||
showToast(`Removed ${pluginName} — restarting workspace`, "success");
|
||||
setInstalled((prev) => prev.filter((p) => p.name !== pluginName));
|
||||
// Drop any prior reload timer (see installFromSource for the
|
||||
// back-to-back-action leak rationale).
|
||||
clearTimeout(reloadTimerRef.current);
|
||||
reloadTimerRef.current = setTimeout(() => loadInstalled(), PLUGIN_RELOAD_DELAY_MS);
|
||||
} catch (e) {
|
||||
showToast(e instanceof Error ? e.message : "Uninstall failed", "error");
|
||||
@ -264,9 +417,53 @@ export function SkillsTab({ data }: Props) {
|
||||
Local registry plugins below; paste any scheme URL above for GitHub or other sources.
|
||||
</div>
|
||||
</div>
|
||||
<div className="text-[10px] uppercase tracking-[0.2em] text-zinc-600 mb-2">Available plugins</div>
|
||||
{registry.length === 0 ? (
|
||||
<div className="text-[10px] text-zinc-600">No plugins in registry</div>
|
||||
<div className="flex items-center justify-between mb-2">
|
||||
<div className="text-[10px] uppercase tracking-[0.2em] text-zinc-600">Available plugins</div>
|
||||
{/* Retry visible whenever registry is empty — including
|
||||
the loading state — so a stuck fetch (Fast Refresh
|
||||
stranded promise, slow server, browser quirk) has a
|
||||
user-driven escape hatch. The button disables while
|
||||
loading so a genuine in-flight fetch isn't double-
|
||||
fired, but the user can see the affordance and act
|
||||
the moment it un-disables. */}
|
||||
{registry.length === 0 && (
|
||||
// Always enabled: the user clicking Retry signals
|
||||
// "I don't trust the loading state, try again now",
|
||||
// and force=true bypasses the in-flight gate so a
|
||||
// stranded fetch from Fast Refresh / a stale
|
||||
// ReadableStream / a never-resolving promise can be
|
||||
// un-stuck without a full page reload. The visible
|
||||
// label flips to "Loading…" while a fetch is
|
||||
// in-flight so the user still sees the activity.
|
||||
<button
|
||||
type="button"
|
||||
onClick={() => loadRegistry(true)}
|
||||
className="text-[10px] text-violet-300 hover:text-violet-200 underline-offset-2 hover:underline"
|
||||
>
|
||||
{registryLoading ? "Loading… click to retry" : "Retry"}
|
||||
</button>
|
||||
)}
|
||||
</div>
|
||||
{registryLoading && registry.length === 0 ? (
|
||||
<div className="text-[10px] text-zinc-500">Loading registry…</div>
|
||||
) : registryError ? (
|
||||
<div className="rounded-lg border border-red-800/40 bg-red-950/20 px-2 py-1.5">
|
||||
<div className="text-[10px] text-red-300 font-semibold mb-0.5">
|
||||
Couldn't load the plugin registry
|
||||
</div>
|
||||
<div className="text-[10px] text-red-400/80">{registryError}</div>
|
||||
<div className="mt-1 text-[10px] text-zinc-500">
|
||||
Check the platform server is reachable at /plugins. The Retry button is in the header above.
|
||||
</div>
|
||||
</div>
|
||||
) : registry.length === 0 ? (
|
||||
<div className="rounded-lg border border-zinc-800/40 bg-zinc-950/40 px-2 py-1.5">
|
||||
<div className="text-[10px] text-zinc-400 mb-0.5">Registry returned 0 plugins.</div>
|
||||
<div className="text-[10px] text-zinc-600">
|
||||
This usually means the platform's plugins/ directory is empty.
|
||||
Run scripts/clone-manifest.sh to populate it from the standalone repos.
|
||||
</div>
|
||||
</div>
|
||||
) : (
|
||||
<div className="space-y-1.5">
|
||||
{registry.map((p) => {
|
||||
|
||||
@ -128,7 +128,13 @@ describe("ConfigTab — hermes workspace", () => {
|
||||
});
|
||||
});
|
||||
|
||||
it("shows hermes-specific info banner pointing to Terminal tab (#1894)", async () => {
|
||||
it("does NOT show the hermes-specific info banner (removed in #2061)", async () => {
|
||||
// Banner-text inversion: the multilevel-layout-UX PR drops "hermes"
|
||||
// from RUNTIMES_WITH_OWN_CONFIG (now {"external"} only). Hermes now
|
||||
// shows the normal Config form — the banner "Hermes manages its own
|
||||
// config" is reserved for the "external" runtime, not hermes itself.
|
||||
// If this ever flips back, revisit the banner/error UX before
|
||||
// unpinning this assertion.
|
||||
wireApi({
|
||||
workspaceRuntime: "hermes",
|
||||
configYamlContent: null,
|
||||
@ -137,9 +143,11 @@ describe("ConfigTab — hermes workspace", () => {
|
||||
|
||||
render(<ConfigTab workspaceId="ws-test" />);
|
||||
|
||||
await waitFor(() => {
|
||||
expect(screen.getByText(/Hermes manages its own config/i)).toBeTruthy();
|
||||
});
|
||||
// Wait for the render+loads to settle (template list drives the runtime combobox).
|
||||
await waitFor(() =>
|
||||
screen.getByRole("combobox", { name: /runtime/i }),
|
||||
);
|
||||
expect(screen.queryByText(/Hermes manages its own config/i)).toBeNull();
|
||||
});
|
||||
|
||||
it("DOES show 'No config.yaml found' error for langgraph workspace (default runtime)", async () => {
|
||||
@ -161,14 +169,28 @@ describe("ConfigTab — hermes workspace", () => {
|
||||
});
|
||||
|
||||
describe("ConfigTab — config.yaml on disk", () => {
|
||||
it("config.yaml runtime/model wins when present, workspace metadata is fallback", async () => {
|
||||
// If the workspace DB has runtime=langgraph but config.yaml declares
|
||||
// runtime: crewai, the form should show crewai (config.yaml wins).
|
||||
// Prevents silent runtime drift across reads.
|
||||
it("workspace metadata (DB) wins over config.yaml when both are present (#2061)", async () => {
|
||||
// Priority inversion in #2061: previously config.yaml overrode DB, so
|
||||
// the tier-on-node badge and runtime-in-form could drift when the
|
||||
// user edited config.yaml on disk. The multilevel-layout-UX PR made
|
||||
// the DB authoritative — config.yaml is read for non-DB keys (tools,
|
||||
// MCP server list, etc.) but runtime/model/tier come from the
|
||||
// workspace row so the node badge matches the form.
|
||||
//
|
||||
// Scenario: DB says "hermes", config.yaml says "crewai". The form
|
||||
// must show hermes (DB wins).
|
||||
//
|
||||
// We pick hermes (not langgraph) on the DB side because "langgraph"
|
||||
// is collapsed to the empty-string "LangGraph (default)" option in
|
||||
// the runtime dropdown — so a "langgraph" DB value would render as
|
||||
// the empty-valued option and obscure whether the DB-wins logic
|
||||
// actually fired. Hermes has its own non-empty option value and
|
||||
// gives the assertion a clean signal.
|
||||
wireApi({
|
||||
workspaceRuntime: "langgraph", // DB
|
||||
workspaceRuntime: "hermes", // DB — authoritative
|
||||
configYamlContent: 'runtime: crewai\nmodel: "claude-opus"\n',
|
||||
templates: [
|
||||
{ id: "t-hermes", name: "Hermes", runtime: "hermes", models: [] },
|
||||
{ id: "t-crewai", name: "CrewAI", runtime: "crewai", models: [] },
|
||||
],
|
||||
});
|
||||
@ -176,6 +198,6 @@ describe("ConfigTab — config.yaml on disk", () => {
|
||||
render(<ConfigTab workspaceId="ws-test" />);
|
||||
|
||||
const select = await waitFor(() => screen.getByRole("combobox", { name: /runtime/i }));
|
||||
expect((select as HTMLSelectElement).value).toBe("crewai");
|
||||
expect((select as HTMLSelectElement).value).toBe("hermes");
|
||||
});
|
||||
});
|
||||
|
||||
@ -1,13 +1,17 @@
|
||||
"use client";
|
||||
|
||||
import { useState, useEffect, useRef } from "react";
|
||||
import ReactMarkdown from "react-markdown";
|
||||
import remarkGfm from "remark-gfm";
|
||||
import { api } from "@/lib/api";
|
||||
import { useCanvasStore, type WorkspaceNodeData } from "@/store/canvas";
|
||||
import { WS_URL } from "@/store/socket";
|
||||
import { closeWebSocketGracefully } from "@/lib/ws-close";
|
||||
import { showToast } from "../../Toaster";
|
||||
import { extractResponseText, extractRequestText } from "./message-parser";
|
||||
import { inferA2AErrorHint } from "./a2aErrorHint";
|
||||
|
||||
interface ActivityEntry {
|
||||
export interface ActivityEntry {
|
||||
id: string;
|
||||
activity_type: string;
|
||||
source_id: string | null;
|
||||
@ -22,11 +26,29 @@ interface ActivityEntry {
|
||||
|
||||
interface CommMessage {
|
||||
id: string;
|
||||
direction: "in" | "out";
|
||||
/** UI-facing flow from THIS workspace's point of view:
|
||||
*
|
||||
* "out" — this workspace either initiated the call (a2a_send)
|
||||
* OR self-logged the reply from a peer it had called
|
||||
* (a2a_receive with source_id == workspaceId).
|
||||
* "in" — a peer initiated the call to us (a2a_receive with
|
||||
* source_id != workspaceId).
|
||||
*
|
||||
* Distinct from activity_type because the agent runtime self-
|
||||
* logs its outbound calls' replies as `a2a_receive` rows; without
|
||||
* this normalisation the UI labels would render those as
|
||||
* incoming ("← From X") and right-justify them on the wrong
|
||||
* side, even though from the user's perspective the call WAS
|
||||
* outgoing. See toCommMessage for the resolution rules. */
|
||||
flow: "in" | "out";
|
||||
peerName: string;
|
||||
peerId: string;
|
||||
text: string;
|
||||
responseText: string | null;
|
||||
/** "ok" | "error" — surfaces failed deliveries with their own
|
||||
* visual treatment + recovery actions instead of an opaque
|
||||
* "[A2A_ERROR]" body the user can't act on. */
|
||||
status: string;
|
||||
timestamp: string;
|
||||
}
|
||||
|
||||
@ -36,9 +58,31 @@ function resolveName(id: string): string {
|
||||
return (node?.data as WorkspaceNodeData)?.name || id.slice(0, 8);
|
||||
}
|
||||
|
||||
function toCommMessage(entry: ActivityEntry, workspaceId: string): CommMessage | null {
|
||||
const isOutgoing = entry.activity_type === "a2a_send";
|
||||
const peerId = isOutgoing ? (entry.target_id || "") : (entry.source_id || "");
|
||||
export function toCommMessage(entry: ActivityEntry, workspaceId: string): CommMessage | null {
|
||||
// a2a_receive activity rows come in two shapes:
|
||||
//
|
||||
// 1. Real incoming call (a peer called us): source_id = the peer,
|
||||
// target_id = us. peerId is source_id, flow is "in".
|
||||
//
|
||||
// 2. Self-logged response to an outbound call (the workspace's own
|
||||
// runtime calls report_activity("a2a_receive", ...) after
|
||||
// delegating; see workspace/a2a_tools.py:181). source_id =
|
||||
// our own workspace_id, target_id = the peer that replied.
|
||||
// peerId must come from target_id (otherwise the peer-name
|
||||
// resolves to "us" and Restart would target THIS workspace),
|
||||
// and flow is "out" — from the user's perspective this row
|
||||
// belongs to the outbound thread, not an incoming one.
|
||||
//
|
||||
// a2a_send rows are always outbound from us: source_id = us,
|
||||
// target_id = the peer.
|
||||
const isSendActivity = entry.activity_type === "a2a_send";
|
||||
const isSelfLoggedReceive =
|
||||
entry.activity_type === "a2a_receive" && entry.source_id === workspaceId;
|
||||
const flow: "in" | "out" = isSendActivity || isSelfLoggedReceive ? "out" : "in";
|
||||
const peerId =
|
||||
isSendActivity || isSelfLoggedReceive
|
||||
? entry.target_id || ""
|
||||
: entry.source_id || "";
|
||||
if (!peerId) return null;
|
||||
|
||||
const text = extractRequestText(entry.request_body) || entry.summary || "";
|
||||
@ -46,15 +90,35 @@ function toCommMessage(entry: ActivityEntry, workspaceId: string): CommMessage |
|
||||
|
||||
return {
|
||||
id: entry.id,
|
||||
direction: isOutgoing ? "out" : "in",
|
||||
flow,
|
||||
peerName: resolveName(peerId),
|
||||
peerId,
|
||||
text,
|
||||
responseText,
|
||||
status: entry.status || "ok",
|
||||
timestamp: entry.created_at,
|
||||
};
|
||||
}
|
||||
|
||||
/** Strip the [A2A_ERROR] sentinel prefix the workspace runtime adds
|
||||
* to failed delegation responses, so the UI can render the underlying
|
||||
* message (or fall back to a generic explanation when the inner text
|
||||
* is empty — currently common because httpx exceptions often
|
||||
* stringify as ""). */
|
||||
const A2A_ERROR_PREFIX = "[A2A_ERROR]";
|
||||
|
||||
function unwrapErrorText(raw: string | null): string {
|
||||
if (!raw) return "";
|
||||
const trimmed = raw.trim();
|
||||
if (trimmed.startsWith(A2A_ERROR_PREFIX)) {
|
||||
return trimmed.slice(A2A_ERROR_PREFIX.length).trim();
|
||||
}
|
||||
return trimmed;
|
||||
}
|
||||
|
||||
// inferA2AErrorHint moved to ./a2aErrorHint so the Activity tab and
|
||||
// this panel render identical hints for the same symptom.
|
||||
|
||||
export function AgentCommsPanel({ workspaceId }: { workspaceId: string }) {
|
||||
const [messages, setMessages] = useState<CommMessage[]>([]);
|
||||
const [loading, setLoading] = useState(true);
|
||||
@ -67,22 +131,45 @@ export function AgentCommsPanel({ workspaceId }: { workspaceId: string }) {
|
||||
setLoading(true);
|
||||
api.get<ActivityEntry[]>(`/workspaces/${workspaceId}/activity?source=agent&limit=50`)
|
||||
.then((entries) => {
|
||||
const filtered = entries
|
||||
const filtered = (entries ?? [])
|
||||
.filter((e) => e.activity_type === "a2a_send" || e.activity_type === "a2a_receive")
|
||||
.reverse();
|
||||
const msgs: CommMessage[] = [];
|
||||
for (const e of filtered) {
|
||||
const m = toCommMessage(e, workspaceId);
|
||||
if (m) {
|
||||
const key = `${m.timestamp}:${m.direction}:${m.peerId}`;
|
||||
msgs.push(m);
|
||||
seenKeys.current.add(key);
|
||||
// Per-row try/catch so a single malformed activity row
|
||||
// (e.g. unexpected request_body shape) doesn't kill the
|
||||
// batch — the previous code threw out of the for-loop and
|
||||
// setMessages([3 items]) never ran, leaving the panel
|
||||
// stuck on the empty state with no diagnostic in the
|
||||
// console because the outer .catch silently swallowed
|
||||
// everything.
|
||||
try {
|
||||
const m = toCommMessage(e, workspaceId);
|
||||
if (m) {
|
||||
const key = `${m.timestamp}:${m.flow}:${m.peerId}`;
|
||||
msgs.push(m);
|
||||
seenKeys.current.add(key);
|
||||
}
|
||||
} catch (rowErr) {
|
||||
console.warn(
|
||||
"AgentCommsPanel: failed to map activity row",
|
||||
{ id: e.id, type: e.activity_type, err: rowErr },
|
||||
);
|
||||
}
|
||||
}
|
||||
setMessages(msgs);
|
||||
setLoading(false);
|
||||
})
|
||||
.catch(() => setLoading(false));
|
||||
.catch((err) => {
|
||||
// Surface the failure in the console so a stuck panel is
|
||||
// diagnosable without a debugger. Previous bare
|
||||
// `.catch(() => setLoading(false))` swallowed every load
|
||||
// failure (network errors, JSON parse errors, throws inside
|
||||
// the .then body) — the panel just sat on the empty state
|
||||
// with zero signal.
|
||||
console.warn("AgentCommsPanel: load activity failed", err);
|
||||
setLoading(false);
|
||||
});
|
||||
}, [workspaceId]);
|
||||
|
||||
// Live updates via WebSocket
|
||||
@ -115,7 +202,7 @@ export function AgentCommsPanel({ workspaceId }: { workspaceId: string }) {
|
||||
};
|
||||
const m = toCommMessage(entry, workspaceId);
|
||||
if (m) {
|
||||
const key = `${m.timestamp}:${m.direction}:${m.peerId}`;
|
||||
const key = `${m.timestamp}:${m.flow}:${m.peerId}`;
|
||||
if (seenKeys.current.has(key)) return;
|
||||
seenKeys.current.add(key);
|
||||
setMessages((prev) => [...prev, m]);
|
||||
@ -148,31 +235,177 @@ export function AgentCommsPanel({ workspaceId }: { workspaceId: string }) {
|
||||
|
||||
return (
|
||||
<div className="flex-1 overflow-y-auto p-3 space-y-2">
|
||||
{messages.map((msg) => (
|
||||
<div key={msg.id} className={`flex ${msg.direction === "out" ? "justify-end" : "justify-start"}`}>
|
||||
<div
|
||||
className={`max-w-[85%] rounded-lg px-3 py-2 text-xs ${
|
||||
msg.direction === "out"
|
||||
? "bg-cyan-900/30 text-cyan-100 border border-cyan-700/20"
|
||||
: "bg-zinc-800/80 text-zinc-200 border border-zinc-700/30"
|
||||
}`}
|
||||
>
|
||||
<div className="text-[9px] text-zinc-500 mb-1">
|
||||
{msg.direction === "out" ? `→ To ${msg.peerName}` : `← From ${msg.peerName}`}
|
||||
</div>
|
||||
<div className="text-zinc-300">{msg.text || "(no message text)"}</div>
|
||||
{msg.responseText && (
|
||||
<div className="mt-1.5 pt-1.5 border-t border-zinc-700/30 text-zinc-400">
|
||||
{msg.responseText}
|
||||
</div>
|
||||
)}
|
||||
<div className="text-[9px] text-zinc-500 mt-1">
|
||||
{new Date(msg.timestamp).toLocaleTimeString()}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
))}
|
||||
{messages.map((msg) =>
|
||||
msg.status === "error" ? (
|
||||
<ErrorMessage key={msg.id} msg={msg} />
|
||||
) : (
|
||||
<NormalMessage key={msg.id} msg={msg} />
|
||||
),
|
||||
)}
|
||||
<div ref={bottomRef} />
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
function NormalMessage({ msg }: { msg: CommMessage }) {
|
||||
return (
|
||||
<div className={`flex ${msg.flow === "out" ? "justify-end" : "justify-start"}`}>
|
||||
<div
|
||||
className={`max-w-[85%] rounded-lg px-3 py-2 text-xs ${
|
||||
msg.flow === "out"
|
||||
? "bg-cyan-900/30 text-cyan-100 border border-cyan-700/20"
|
||||
: "bg-zinc-800/80 text-zinc-200 border border-zinc-700/30"
|
||||
}`}
|
||||
>
|
||||
<div className="text-[9px] text-zinc-500 mb-1">
|
||||
{msg.flow === "out" ? `→ To ${msg.peerName}` : `← From ${msg.peerName}`}
|
||||
</div>
|
||||
{msg.text ? (
|
||||
<MarkdownBody className="text-zinc-300">{msg.text}</MarkdownBody>
|
||||
) : (
|
||||
<div className="text-zinc-300">(no message text)</div>
|
||||
)}
|
||||
{msg.responseText && (
|
||||
<MarkdownBody className="mt-1.5 pt-1.5 border-t border-zinc-700/30 text-zinc-400">
|
||||
{msg.responseText}
|
||||
</MarkdownBody>
|
||||
)}
|
||||
<div className="text-[9px] text-zinc-500 mt-1">
|
||||
{new Date(msg.timestamp).toLocaleTimeString()}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
/** Failure-state row. Replaces the unactionable "X failed [A2A_ERROR]"
|
||||
* bubble with: a clear banner naming the peer, the underlying
|
||||
* error text (if any), an inferred cause hint, and recovery
|
||||
* actions — Restart workspace, Open workspace.
|
||||
*
|
||||
* Recovery actions show on BOTH directions because both target the
|
||||
* same peer (toCommMessage now resolves peerId to the peer in
|
||||
* either case): an outbound delivery failure ("we called X and it
|
||||
* errored"), an inbound runtime failure ("X called us and our
|
||||
* reply errored" — rare), or the agent-self-logged "I called X and
|
||||
* got an error back" pattern that is the most common shape. The
|
||||
* user always wants to restart or inspect the failing peer. */
|
||||
function ErrorMessage({ msg }: { msg: CommMessage }) {
|
||||
const selectNode = useCanvasStore((s) => s.selectNode);
|
||||
const [restarting, setRestarting] = useState(false);
|
||||
const errorText = unwrapErrorText(msg.responseText);
|
||||
const hint = inferA2AErrorHint(errorText);
|
||||
|
||||
// Guard against acting on a peer whose workspace has been deleted
|
||||
// since this row was logged. Without the guard, restart 404s
|
||||
// surface as a generic toast and Open silently sets a dangling
|
||||
// selection that renders nothing in the side panel.
|
||||
const peerExists = (): boolean => {
|
||||
return useCanvasStore.getState().nodes.some((n) => n.id === msg.peerId);
|
||||
};
|
||||
|
||||
const handleRestart = async () => {
|
||||
if (restarting) return;
|
||||
if (!peerExists()) {
|
||||
showToast(`${msg.peerName} no longer exists`, "error");
|
||||
return;
|
||||
}
|
||||
setRestarting(true);
|
||||
try {
|
||||
await api.post(`/workspaces/${msg.peerId}/restart`, {});
|
||||
showToast(`Restarting ${msg.peerName}…`, "success");
|
||||
} catch (e) {
|
||||
showToast(
|
||||
`Restart failed: ${e instanceof Error ? e.message : "unknown error"}`,
|
||||
"error",
|
||||
);
|
||||
} finally {
|
||||
setRestarting(false);
|
||||
}
|
||||
};
|
||||
|
||||
const handleOpen = () => {
|
||||
if (!peerExists()) {
|
||||
showToast(`${msg.peerName} no longer exists`, "error");
|
||||
return;
|
||||
}
|
||||
selectNode(msg.peerId);
|
||||
};
|
||||
|
||||
return (
|
||||
<div className={`flex ${msg.flow === "out" ? "justify-end" : "justify-start"}`}>
|
||||
<div className="max-w-[85%] rounded-lg border border-red-800/50 bg-red-950/30 px-3 py-2 text-xs">
|
||||
<div className="flex items-center gap-1.5 text-[10px] text-red-300 font-semibold uppercase tracking-wide mb-1.5">
|
||||
<span aria-hidden="true">⚠</span>
|
||||
{msg.flow === "out"
|
||||
? `Failed to deliver to ${msg.peerName}`
|
||||
: `${msg.peerName} returned an error`}
|
||||
</div>
|
||||
|
||||
{msg.text && (
|
||||
<div className="text-[10px] text-zinc-500 mb-1.5">
|
||||
<span className="uppercase tracking-wide">Task</span>
|
||||
<MarkdownBody className="text-zinc-400">{msg.text}</MarkdownBody>
|
||||
</div>
|
||||
)}
|
||||
|
||||
<div className="rounded bg-zinc-950/60 border border-red-900/40 px-2 py-1.5 mb-1.5">
|
||||
<div className="text-[9px] uppercase tracking-wide text-red-400 mb-0.5">
|
||||
Underlying error
|
||||
</div>
|
||||
<code className="text-[11px] font-mono text-red-200 whitespace-pre-wrap break-words">
|
||||
{errorText || "(no detail returned)"}
|
||||
</code>
|
||||
</div>
|
||||
|
||||
<p className="text-[10px] text-zinc-400 leading-snug mb-2">{hint}</p>
|
||||
|
||||
{msg.peerId && (
|
||||
<div className="flex flex-wrap items-center gap-1.5">
|
||||
<button
|
||||
type="button"
|
||||
onClick={handleRestart}
|
||||
disabled={restarting}
|
||||
className="px-2 py-0.5 rounded bg-red-900/50 hover:bg-red-800/60 border border-red-700/40 text-[10px] text-red-200 disabled:opacity-50 transition-colors"
|
||||
>
|
||||
{restarting ? "Restarting…" : `Restart ${msg.peerName}`}
|
||||
</button>
|
||||
<button
|
||||
type="button"
|
||||
onClick={handleOpen}
|
||||
className="px-2 py-0.5 rounded bg-zinc-800 hover:bg-zinc-700 border border-zinc-700/50 text-[10px] text-zinc-300 transition-colors"
|
||||
>
|
||||
Open {msg.peerName}
|
||||
</button>
|
||||
</div>
|
||||
)}
|
||||
|
||||
<div className="text-[9px] text-zinc-500 mt-1.5">
|
||||
{new Date(msg.timestamp).toLocaleTimeString()}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
/** Tiny markdown wrapper matching ChatTab's My Chat styling. Same
|
||||
* remark-gfm pipeline (tables, strikethrough, task lists) plus the
|
||||
* prose tweaks that keep paragraphs tight inside a small bubble.
|
||||
* Code blocks get an `overflow-x-auto` so a long line of code doesn't
|
||||
* blow out the bubble's max-width — agent-to-agent replies routinely
|
||||
* ship code samples and JSON. */
|
||||
function MarkdownBody({
|
||||
children,
|
||||
className,
|
||||
}: {
|
||||
children: string;
|
||||
className?: string;
|
||||
}) {
|
||||
return (
|
||||
<div
|
||||
className={`prose prose-sm prose-invert max-w-none [&>p]:mb-1 [&>p:last-child]:mb-0 [&_pre]:overflow-x-auto [&_table]:block [&_table]:overflow-x-auto ${className ?? ""}`}
|
||||
>
|
||||
<ReactMarkdown remarkPlugins={[remarkGfm]}>{children}</ReactMarkdown>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
94
canvas/src/components/tabs/chat/AttachmentViews.tsx
Normal file
94
canvas/src/components/tabs/chat/AttachmentViews.tsx
Normal file
@ -0,0 +1,94 @@
|
||||
"use client";
|
||||
|
||||
// Small presentational components for chat attachments. Kept in a
|
||||
// separate file so ChatTab.tsx stays focused on state + send/receive
|
||||
// orchestration. Both variants share the file-icon + name + size
|
||||
// layout; the only difference is the trailing action (remove for
|
||||
// pending, download for completed).
|
||||
|
||||
import type { ChatAttachment } from "./types";
|
||||
|
||||
function formatSize(bytes: number | undefined): string {
|
||||
if (bytes == null) return "";
|
||||
if (bytes < 1024) return `${bytes} B`;
|
||||
if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(0)} KB`;
|
||||
return `${(bytes / (1024 * 1024)).toFixed(1)} MB`;
|
||||
}
|
||||
|
||||
/** Inline pill for a file that the user has picked but not yet sent.
|
||||
* Renders above the textarea; clicking × pops it from the pending
|
||||
* list without uploading. */
|
||||
export function PendingAttachmentPill({
|
||||
file,
|
||||
onRemove,
|
||||
}: {
|
||||
file: File;
|
||||
onRemove: () => void;
|
||||
}) {
|
||||
return (
|
||||
<div className="flex items-center gap-1.5 rounded-md border border-zinc-700/60 bg-zinc-800/80 px-2 py-1 text-[10px] text-zinc-300 max-w-[200px]">
|
||||
<FileGlyph className="text-zinc-400 shrink-0" />
|
||||
<span className="truncate" title={file.name}>{file.name}</span>
|
||||
<span className="text-zinc-500 shrink-0 tabular-nums">{formatSize(file.size)}</span>
|
||||
<button
|
||||
onClick={onRemove}
|
||||
aria-label={`Remove ${file.name}`}
|
||||
className="ml-0.5 text-zinc-500 hover:text-zinc-200 transition-colors shrink-0"
|
||||
>
|
||||
<svg width="10" height="10" viewBox="0 0 16 16" fill="none" aria-hidden="true">
|
||||
<path d="M4 4l8 8M12 4l-8 8" stroke="currentColor" strokeWidth="1.6" strokeLinecap="round" />
|
||||
</svg>
|
||||
</button>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
/** Chip rendered inside a message bubble for a sent/received file.
|
||||
* Clicking triggers the download via the passed onDownload callback
|
||||
* so the parent controls workspace-scoped URL resolution. */
|
||||
export function AttachmentChip({
|
||||
attachment,
|
||||
onDownload,
|
||||
tone,
|
||||
}: {
|
||||
attachment: ChatAttachment;
|
||||
onDownload: (a: ChatAttachment) => void;
|
||||
tone: "user" | "agent";
|
||||
}) {
|
||||
const toneClasses =
|
||||
tone === "user"
|
||||
? "border-blue-400/30 bg-blue-600/20 hover:bg-blue-600/30 text-blue-100"
|
||||
: "border-zinc-600/50 bg-zinc-700/40 hover:bg-zinc-600/50 text-zinc-100";
|
||||
return (
|
||||
<button
|
||||
onClick={() => onDownload(attachment)}
|
||||
title={`Download ${attachment.name}`}
|
||||
className={`flex items-center gap-1.5 rounded-md border px-2 py-1 text-[10px] transition-colors max-w-full ${toneClasses}`}
|
||||
>
|
||||
<FileGlyph className="shrink-0 opacity-70" />
|
||||
<span className="truncate">{attachment.name}</span>
|
||||
{attachment.size != null && (
|
||||
<span className="opacity-60 shrink-0 tabular-nums">{formatSize(attachment.size)}</span>
|
||||
)}
|
||||
<DownloadGlyph className="opacity-70 shrink-0" />
|
||||
</button>
|
||||
);
|
||||
}
|
||||
|
||||
function FileGlyph({ className }: { className?: string }) {
|
||||
return (
|
||||
<svg width="10" height="10" viewBox="0 0 16 16" fill="none" className={className} aria-hidden="true">
|
||||
<path d="M4 2h5l3 3v9a1 1 0 0 1-1 1H4a1 1 0 0 1-1-1V3a1 1 0 0 1 1-1Z" stroke="currentColor" strokeWidth="1.3" strokeLinejoin="round" />
|
||||
<path d="M9 2v3h3" stroke="currentColor" strokeWidth="1.3" strokeLinejoin="round" />
|
||||
</svg>
|
||||
);
|
||||
}
|
||||
|
||||
function DownloadGlyph({ className }: { className?: string }) {
|
||||
return (
|
||||
<svg width="10" height="10" viewBox="0 0 16 16" fill="none" className={className} aria-hidden="true">
|
||||
<path d="M8 2v9M4 7l4 4 4-4" stroke="currentColor" strokeWidth="1.4" strokeLinecap="round" strokeLinejoin="round" />
|
||||
<path d="M3 13h10" stroke="currentColor" strokeWidth="1.4" strokeLinecap="round" />
|
||||
</svg>
|
||||
);
|
||||
}
|
||||
@ -0,0 +1,113 @@
|
||||
// @vitest-environment jsdom
|
||||
import { describe, it, expect, vi } from "vitest";
|
||||
|
||||
// Stub the canvas store before importing the SUT — toCommMessage calls
|
||||
// useCanvasStore.getState() inside resolveName to look up peer names,
|
||||
// which would otherwise hit the real Zustand store.
|
||||
vi.mock("@/store/canvas", () => ({
|
||||
useCanvasStore: {
|
||||
getState: () => ({
|
||||
nodes: [
|
||||
{ id: "ws-self", data: { name: "Self" } },
|
||||
{ id: "ws-peer", data: { name: "Peer Agent" } },
|
||||
],
|
||||
}),
|
||||
},
|
||||
}));
|
||||
|
||||
import { toCommMessage, type ActivityEntry } from "../AgentCommsPanel";
|
||||
|
||||
const SELF = "ws-self";
|
||||
const PEER = "ws-peer";
|
||||
|
||||
function makeEntry(overrides: Partial<ActivityEntry> = {}): ActivityEntry {
|
||||
return {
|
||||
id: "act-1",
|
||||
activity_type: "a2a_send",
|
||||
source_id: SELF,
|
||||
target_id: PEER,
|
||||
method: "message/send",
|
||||
summary: "Delegating to Peer Agent",
|
||||
request_body: null,
|
||||
response_body: null,
|
||||
status: "ok",
|
||||
created_at: "2026-04-25T18:00:00Z",
|
||||
...overrides,
|
||||
};
|
||||
}
|
||||
|
||||
describe("toCommMessage — flow derivation", () => {
|
||||
it("a2a_send is always outbound (flow=out, peer=target)", () => {
|
||||
const m = toCommMessage(
|
||||
makeEntry({ activity_type: "a2a_send", source_id: SELF, target_id: PEER }),
|
||||
SELF,
|
||||
);
|
||||
expect(m).toBeTruthy();
|
||||
expect(m!.flow).toBe("out");
|
||||
expect(m!.peerId).toBe(PEER);
|
||||
expect(m!.peerName).toBe("Peer Agent");
|
||||
});
|
||||
|
||||
it("a2a_receive from a peer (peer-initiated call) is inbound", () => {
|
||||
// Real incoming call: source = peer, target = us.
|
||||
const m = toCommMessage(
|
||||
makeEntry({
|
||||
activity_type: "a2a_receive",
|
||||
source_id: PEER,
|
||||
target_id: SELF,
|
||||
}),
|
||||
SELF,
|
||||
);
|
||||
expect(m!.flow).toBe("in");
|
||||
expect(m!.peerId).toBe(PEER);
|
||||
expect(m!.peerName).toBe("Peer Agent");
|
||||
});
|
||||
|
||||
it("a2a_receive self-logged by our runtime AFTER an outbound call is OUTBOUND from the user's POV", () => {
|
||||
// workspace/a2a_tools.py:181 self-logs an a2a_receive on the
|
||||
// CALLER's workspace_id with source_id=us, target_id=peer.
|
||||
// From the user's perspective this row belongs to the outbound
|
||||
// delegation thread — render flow=out + peer=target so the
|
||||
// bubble right-justifies under "Delegating to peer" and the
|
||||
// Restart button targets the actual peer (NOT us). Regression
|
||||
// for the bug where these rows rendered as "← From Self" with
|
||||
// a Restart button that would have restarted the user's own
|
||||
// workspace.
|
||||
const m = toCommMessage(
|
||||
makeEntry({
|
||||
activity_type: "a2a_receive",
|
||||
source_id: SELF,
|
||||
target_id: PEER,
|
||||
summary: "Peer Agent failed",
|
||||
status: "error",
|
||||
}),
|
||||
SELF,
|
||||
);
|
||||
expect(m!.flow).toBe("out");
|
||||
expect(m!.peerId).toBe(PEER);
|
||||
expect(m!.peerName).toBe("Peer Agent");
|
||||
expect(m!.status).toBe("error");
|
||||
});
|
||||
|
||||
it("returns null when no peer can be resolved", () => {
|
||||
// a2a_receive with both ids null — discard rather than render a
|
||||
// ghost bubble pointing at "Unknown".
|
||||
const m = toCommMessage(
|
||||
makeEntry({
|
||||
activity_type: "a2a_receive",
|
||||
source_id: null,
|
||||
target_id: null,
|
||||
}),
|
||||
SELF,
|
||||
);
|
||||
expect(m).toBeNull();
|
||||
});
|
||||
|
||||
it("propagates status through to the message (drives error rendering)", () => {
|
||||
const m = toCommMessage(
|
||||
makeEntry({ status: "error", activity_type: "a2a_send" }),
|
||||
SELF,
|
||||
);
|
||||
expect(m!.status).toBe("error");
|
||||
});
|
||||
});
|
||||
@ -0,0 +1,67 @@
|
||||
import { describe, it, expect } from "vitest";
|
||||
import { inferA2AErrorHint } from "../a2aErrorHint";
|
||||
|
||||
// Pure logic. Pin every named pattern so a future contributor adding a
|
||||
// new symptom doesn't accidentally collapse the buckets — and so the
|
||||
// "most specific first" ordering can't drift without a test failing.
|
||||
|
||||
describe("inferA2AErrorHint", () => {
|
||||
it("matches the Claude Code SDK init wedge specifically", () => {
|
||||
const hint = inferA2AErrorHint("Control request timeout: initialize");
|
||||
expect(hint).toMatch(/Claude Code SDK is wedged/);
|
||||
});
|
||||
|
||||
it("does NOT misfire on user tasks containing 'initialize' generally", () => {
|
||||
// Regression: an earlier bare-`initialize` pattern would have
|
||||
// false-positived "failed to initialize database" into the SDK
|
||||
// wedge hint. Confirm the full-phrase guard holds.
|
||||
const hint = inferA2AErrorHint("failed to initialize database connection");
|
||||
expect(hint).not.toMatch(/Claude Code SDK/);
|
||||
});
|
||||
|
||||
it("recognises httpx ReadTimeout / ConnectTimeout class names", () => {
|
||||
expect(inferA2AErrorHint("ReadTimeout: timeout")).toMatch(/proxy timeout/);
|
||||
expect(inferA2AErrorHint("ConnectTimeout: ...")).toMatch(/proxy timeout/);
|
||||
});
|
||||
|
||||
it("recognises generic timeout / deadline-exceeded language", () => {
|
||||
expect(inferA2AErrorHint("deadline exceeded after 300s")).toMatch(/proxy timeout/);
|
||||
expect(inferA2AErrorHint("Operation timeout")).toMatch(/proxy timeout/);
|
||||
});
|
||||
|
||||
it("handles connection-reset family (RemoteProtocolError, ConnectionReset, no-message)", () => {
|
||||
expect(inferA2AErrorHint("RemoteProtocolError: ...")).toMatch(/connection.*dropped/);
|
||||
expect(inferA2AErrorHint("ConnectionResetError")).toMatch(/connection.*dropped/);
|
||||
expect(inferA2AErrorHint("connection reset by peer")).toMatch(/connection.*dropped/);
|
||||
expect(inferA2AErrorHint("RemoteProtocolError (no message — likely connection reset)")).toMatch(/connection.*dropped/);
|
||||
});
|
||||
|
||||
it("recognises agent-runtime exceptions", () => {
|
||||
expect(inferA2AErrorHint("Agent error: ValueError raised")).toMatch(/runtime threw an exception/);
|
||||
expect(inferA2AErrorHint("RuntimeException in tool call")).toMatch(/runtime threw an exception/);
|
||||
});
|
||||
|
||||
it("recognises peer-unreachable cases (Activity-tab originals)", () => {
|
||||
expect(inferA2AErrorHint("workspace not found")).toMatch(/can't be reached/);
|
||||
expect(inferA2AErrorHint("not accessible")).toMatch(/can't be reached/);
|
||||
expect(inferA2AErrorHint("workspace is offline")).toMatch(/can't be reached/);
|
||||
});
|
||||
|
||||
it("returns the empty-detail-specific hint when input is exactly empty", () => {
|
||||
expect(inferA2AErrorHint("")).toMatch(/no error detail/);
|
||||
});
|
||||
|
||||
it("returns a generic fallback for unrecognised text", () => {
|
||||
const hint = inferA2AErrorHint("some completely novel error nobody has matched yet");
|
||||
expect(hint).toMatch(/Check the workspace logs|delivery failure/);
|
||||
});
|
||||
|
||||
it("Claude SDK wedge wins over the more general timeout pattern", () => {
|
||||
// Both 'control request timeout' and 'timeout' match the same
|
||||
// input. The SDK wedge hint is more actionable; the ordering in
|
||||
// the function must keep it first. Lock that priority in.
|
||||
const hint = inferA2AErrorHint("Control request timeout: initialize");
|
||||
expect(hint).toMatch(/Claude Code SDK/);
|
||||
expect(hint).not.toMatch(/proxy timeout/);
|
||||
});
|
||||
});
|
||||
@ -0,0 +1,41 @@
|
||||
import { describe, it, expect } from "vitest";
|
||||
import { ACTIVITY_LOG_WINDOW, appendActivityLine } from "../activityLog";
|
||||
|
||||
describe("appendActivityLine", () => {
|
||||
it("appends a fresh line", () => {
|
||||
expect(appendActivityLine([], "📄 Read /a")).toEqual(["📄 Read /a"]);
|
||||
});
|
||||
|
||||
it("collapses an immediate duplicate", () => {
|
||||
const prev = ["📄 Read /a"];
|
||||
// Same exact string twice in a row is noise — the helper should
|
||||
// return the original array reference, not a new one.
|
||||
expect(appendActivityLine(prev, "📄 Read /a")).toBe(prev);
|
||||
});
|
||||
|
||||
it("keeps non-adjacent duplicates", () => {
|
||||
const prev = ["📄 Read /a", "⚡ Bash: ls"];
|
||||
expect(appendActivityLine(prev, "📄 Read /a")).toEqual([
|
||||
"📄 Read /a",
|
||||
"⚡ Bash: ls",
|
||||
"📄 Read /a",
|
||||
]);
|
||||
});
|
||||
|
||||
it("rolls off the oldest line when the window fills", () => {
|
||||
const seed = Array.from({ length: ACTIVITY_LOG_WINDOW }, (_, i) => `line-${i}`);
|
||||
const next = appendActivityLine(seed, "newest");
|
||||
expect(next.length).toBe(ACTIVITY_LOG_WINDOW);
|
||||
expect(next[next.length - 1]).toBe("newest");
|
||||
// Oldest entry is dropped — line-0 is gone.
|
||||
expect(next[0]).toBe("line-1");
|
||||
});
|
||||
|
||||
it("keeps the original array reference when below the window cap", () => {
|
||||
const prev = ["a", "b"];
|
||||
const next = appendActivityLine(prev, "c");
|
||||
// Returned a new array (we appended); must NOT mutate prev.
|
||||
expect(prev).toEqual(["a", "b"]);
|
||||
expect(next).toEqual(["a", "b", "c"]);
|
||||
});
|
||||
});
|
||||
@ -4,6 +4,7 @@ import {
|
||||
extractResponseText,
|
||||
extractAgentText,
|
||||
extractTextsFromParts,
|
||||
extractFilesFromTask,
|
||||
} from "../message-parser";
|
||||
|
||||
describe("extractRequestText", () => {
|
||||
@ -99,6 +100,67 @@ describe("extractResponseText", () => {
|
||||
it("returns empty when result has no parts", () => {
|
||||
expect(extractResponseText({ result: { other: true } })).toBe("");
|
||||
});
|
||||
|
||||
// Regression: Claude Code (and other long-reply runtimes) emits
|
||||
// multi-part text replies. The previous implementation returned
|
||||
// only the first part, silently truncating the rest. Observed
|
||||
// 2026-04-25 on a 15k-char Wave 1 brief that rendered as just the
|
||||
// markdown table header.
|
||||
it("joins all text parts when result.parts has multiple", () => {
|
||||
const body = {
|
||||
result: {
|
||||
parts: [
|
||||
{ kind: "text", text: "# Header" },
|
||||
{ kind: "text", text: "| Col |" },
|
||||
{ kind: "text", text: "| --- |" },
|
||||
{ kind: "text", text: "| Row |" },
|
||||
],
|
||||
},
|
||||
};
|
||||
expect(extractResponseText(body)).toBe("# Header\n| Col |\n| --- |\n| Row |");
|
||||
});
|
||||
|
||||
it("joins all text parts across multiple artifacts", () => {
|
||||
const body = {
|
||||
result: {
|
||||
artifacts: [
|
||||
{ parts: [{ kind: "text", text: "First artifact" }] },
|
||||
{ parts: [{ kind: "text", text: "Second artifact" }] },
|
||||
],
|
||||
},
|
||||
};
|
||||
expect(extractResponseText(body)).toBe("First artifact\nSecond artifact");
|
||||
});
|
||||
|
||||
it("joins all .root.text variants when present", () => {
|
||||
const body = {
|
||||
result: {
|
||||
parts: [
|
||||
{ root: { text: "alpha" } },
|
||||
{ root: { text: "beta" } },
|
||||
],
|
||||
},
|
||||
};
|
||||
expect(extractResponseText(body)).toBe("alpha\nbeta");
|
||||
});
|
||||
|
||||
// Regression: when a response carries BOTH parts and artifacts
|
||||
// (Hermes tool-call replies do this — summary in parts, detail in
|
||||
// artifacts), the early-return-on-parts implementation silently
|
||||
// dropped the artifacts body. The collected-from-every-source
|
||||
// implementation must surface both.
|
||||
it("collects text from BOTH result.parts AND result.artifacts when both present", () => {
|
||||
const body = {
|
||||
result: {
|
||||
parts: [{ kind: "text", text: "Summary" }],
|
||||
artifacts: [
|
||||
{ parts: [{ kind: "text", text: "Detail block one" }] },
|
||||
{ parts: [{ kind: "text", text: "Detail block two" }] },
|
||||
],
|
||||
},
|
||||
};
|
||||
expect(extractResponseText(body)).toBe("Summary\nDetail block one\nDetail block two");
|
||||
});
|
||||
});
|
||||
|
||||
describe("extractTextsFromParts", () => {
|
||||
@ -133,3 +195,71 @@ describe("extractTextsFromParts", () => {
|
||||
expect(extractTextsFromParts(parts)).toBe("Only text");
|
||||
});
|
||||
});
|
||||
|
||||
describe("extractFilesFromTask", () => {
|
||||
it("pulls A2A file parts out of a result", () => {
|
||||
const task = {
|
||||
parts: [
|
||||
{ kind: "text", text: "here's the report" },
|
||||
{
|
||||
kind: "file",
|
||||
file: { name: "report.pdf", mimeType: "application/pdf", uri: "workspace:/reports/report.pdf", size: 4096 },
|
||||
},
|
||||
],
|
||||
};
|
||||
const files = extractFilesFromTask(task);
|
||||
expect(files).toEqual([
|
||||
{ name: "report.pdf", mimeType: "application/pdf", uri: "workspace:/reports/report.pdf", size: 4096 },
|
||||
]);
|
||||
});
|
||||
|
||||
it("recovers a filename from the URI when `name` is absent", () => {
|
||||
const task = {
|
||||
parts: [
|
||||
{ kind: "file", file: { uri: "workspace:/workspace/out/graph.png" } },
|
||||
],
|
||||
};
|
||||
const files = extractFilesFromTask(task);
|
||||
expect(files[0].name).toBe("graph.png");
|
||||
});
|
||||
|
||||
it("skips file parts without a URI (inline bytes are not supported yet)", () => {
|
||||
const task = {
|
||||
parts: [
|
||||
{ kind: "file", file: { name: "inline.bin", bytes: "AAA=" } },
|
||||
],
|
||||
};
|
||||
expect(extractFilesFromTask(task)).toEqual([]);
|
||||
});
|
||||
|
||||
it("walks artifacts[] so file parts nested inside artifact envelopes are found", () => {
|
||||
const task = {
|
||||
artifacts: [
|
||||
{
|
||||
parts: [
|
||||
{ kind: "file", file: { name: "trace.log", uri: "workspace:/logs/trace.log" } },
|
||||
],
|
||||
},
|
||||
],
|
||||
};
|
||||
const files = extractFilesFromTask(task);
|
||||
expect(files[0]).toMatchObject({ name: "trace.log", uri: "workspace:/logs/trace.log" });
|
||||
});
|
||||
|
||||
it("returns [] on malformed input rather than throwing", () => {
|
||||
expect(extractFilesFromTask({})).toEqual([]);
|
||||
expect(extractFilesFromTask({ parts: "not-an-array" } as unknown as Record<string, unknown>)).toEqual([]);
|
||||
});
|
||||
|
||||
it("walks result.message.parts — the non-task reply shape some A2A servers use", () => {
|
||||
const task = {
|
||||
message: {
|
||||
parts: [
|
||||
{ kind: "file", file: { name: "out.txt", uri: "workspace:/workspace/out.txt" } },
|
||||
],
|
||||
},
|
||||
};
|
||||
const files = extractFilesFromTask(task);
|
||||
expect(files[0]).toMatchObject({ name: "out.txt", uri: "workspace:/workspace/out.txt" });
|
||||
});
|
||||
});
|
||||
|
||||
41
canvas/src/components/tabs/chat/__tests__/uploads.test.ts
Normal file
41
canvas/src/components/tabs/chat/__tests__/uploads.test.ts
Normal file
@ -0,0 +1,41 @@
|
||||
import { describe, it, expect } from "vitest";
|
||||
import { resolveAttachmentHref } from "../uploads";
|
||||
|
||||
// Pure-function coverage for uploads.ts URI resolution — no fetch
// mocking needed: resolveAttachmentHref never touches the network.
describe("resolveAttachmentHref — URI scheme normalisation", () => {
  const wsId = "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee";

  it("rewrites the canonical workspace:<path> scheme to /chat/download", () => {
    const url = resolveAttachmentHref(wsId, "workspace:/workspace/report.pdf");
    expect(url).toContain(`/workspaces/${wsId}/chat/download`);
    // The container path travels as a query param, so it must be URL-encoded.
    expect(url).toContain(encodeURIComponent("/workspace/report.pdf"));
  });

  it("accepts bare absolute container paths (some agents omit the scheme)", () => {
    const url = resolveAttachmentHref(wsId, "/workspace/report.pdf");
    expect(url).toContain(`/workspaces/${wsId}/chat/download`);
    expect(url).toContain(encodeURIComponent("/workspace/report.pdf"));
  });

  it("accepts file:/// URIs pointing into an allowed root", () => {
    const url = resolveAttachmentHref(wsId, "file:///workspace/report.pdf");
    expect(url).toContain(`/workspaces/${wsId}/chat/download`);
    expect(url).toContain(encodeURIComponent("/workspace/report.pdf"));
  });

  it("passes through HTTP(S) URIs unchanged so off-platform artefacts still render", () => {
    const external = "https://example.com/static/report.pdf";
    expect(resolveAttachmentHref(wsId, external)).toBe(external);
  });

  it("passes through container paths that are not under any allowed root", () => {
    // /etc/passwd looks like a path but isn't one of the allowed
    // roots — falling back to raw passthrough forces the caller into
    // the external-URL branch, which opens a new tab and lets the
    // browser refuse. Rewriting would 400 anyway server-side.
    expect(resolveAttachmentHref(wsId, "/etc/passwd")).toBe("/etc/passwd");
  });

  it("passes through unknown schemes unchanged", () => {
    expect(resolveAttachmentHref(wsId, "s3://bucket/key")).toBe("s3://bucket/key");
  });
});
|
||||
54
canvas/src/components/tabs/chat/a2aErrorHint.ts
Normal file
54
canvas/src/components/tabs/chat/a2aErrorHint.ts
Normal file
@ -0,0 +1,54 @@
|
||||
/**
|
||||
* Maps an A2A delivery-failure detail string (the bit AFTER stripping
|
||||
* the [A2A_ERROR] sentinel prefix) to a one-line operator-actionable
|
||||
* hint. Pattern matches are lowercase substring checks, ordered most-
|
||||
* specific first so the right hint wins when multiple patterns
|
||||
* overlap (e.g. "control request timeout" wins over generic "timeout").
|
||||
*
|
||||
* Used by both the chat Agent Comms panel and the Activity tab so the
|
||||
* same symptom reads identically across surfaces. Two prior copies
|
||||
* had already drifted (Activity tab gained `not found`/`offline`
|
||||
* cases AgentCommsPanel never picked up) — this module is the merged
|
||||
* superset and the only place hint text should change.
|
||||
*/
|
||||
export function inferA2AErrorHint(detail: string): string {
|
||||
const t = detail.toLowerCase();
|
||||
|
||||
// "control request timeout" is the specific Claude Code SDK init
|
||||
// wedge symptom. Pattern on the full phrase, not bare "initialize"
|
||||
// — a user task containing "failed to initialize database" would
|
||||
// false-positive into the SDK-wedge hint.
|
||||
if (t.includes("control request timeout")) {
|
||||
return "The remote agent's Claude Code SDK is wedged on initialization (often after a long idle period or OAuth refresh). A workspace restart usually clears it.";
|
||||
}
|
||||
if (
|
||||
t.includes("readtimeout") ||
|
||||
t.includes("connecttimeout") ||
|
||||
t.includes("deadline exceeded") ||
|
||||
t.includes("timeout")
|
||||
) {
|
||||
return "The remote agent didn't respond within the proxy timeout. It may be busy with a long task, or the runtime is stuck — restart the workspace if this repeats.";
|
||||
}
|
||||
if (
|
||||
t.includes("connectionreset") ||
|
||||
t.includes("remoteprotocolerror") ||
|
||||
t.includes("connection reset") ||
|
||||
t.includes("no message")
|
||||
) {
|
||||
return "The connection to the remote agent dropped before a reply arrived. Usually a transient network blip — retry once. If it repeats, the remote container may have crashed mid-request; check its logs.";
|
||||
}
|
||||
if (t.includes("agent error") || t.includes("exception")) {
|
||||
return "The remote agent's runtime threw an exception. Check the workspace's container logs for the traceback. Restart usually clears transient runtime crashes.";
|
||||
}
|
||||
if (
|
||||
t.includes("not found") ||
|
||||
t.includes("not accessible") ||
|
||||
t.includes("offline")
|
||||
) {
|
||||
return "The remote workspace can't be reached — it may be stopped, removed, or outside the access control list. Verify the peer is online before retrying.";
|
||||
}
|
||||
if (detail === "") {
|
||||
return "The remote agent returned no error detail (the underlying httpx exception had an empty message — typically a connection-reset or silent timeout). A workspace restart is the safe first move.";
|
||||
}
|
||||
return "The remote agent reported a delivery failure. Check the workspace logs or try restarting.";
|
||||
}
|
||||
23
canvas/src/components/tabs/chat/activityLog.ts
Normal file
23
canvas/src/components/tabs/chat/activityLog.ts
Normal file
@ -0,0 +1,23 @@
|
||||
/**
|
||||
* Sliding-window log for the in-chat activity feed (the live progress
|
||||
* lines under the spinner while a chat reply is in flight).
|
||||
*
|
||||
* Sized to fit the spinner area without forcing a scroll; per-tool-use
|
||||
* rows from the workspace's _report_tool_use can fire dozens per turn
|
||||
* (Read 5 files + Grep + Bash + Edits + delegations), so a too-small
|
||||
* window flushes useful early context before the user can read it.
|
||||
*
|
||||
* Consecutive identical lines collapse to a single entry — the same
|
||||
* tool repeated on the same target (e.g. Read of the same file twice
|
||||
* within a turn) is noise, not new progress.
|
||||
*/
|
||||
export const ACTIVITY_LOG_WINDOW = 20;
|
||||
|
||||
export function appendActivityLine(prev: string[], line: string): string[] {
|
||||
if (prev[prev.length - 1] === line) return prev; // collapse duplicates
|
||||
const next =
|
||||
prev.length >= ACTIVITY_LOG_WINDOW
|
||||
? prev.slice(-(ACTIVITY_LOG_WINDOW - 1))
|
||||
: prev;
|
||||
return [...next, line];
|
||||
}
|
||||
@ -32,6 +32,64 @@ export function extractTextsFromParts(parts: unknown): string | null {
|
||||
return texts.length > 0 ? texts.join("\n") : null;
|
||||
}
|
||||
|
||||
export interface ParsedFilePart {
|
||||
name: string;
|
||||
uri: string;
|
||||
mimeType?: string;
|
||||
size?: number;
|
||||
}
|
||||
|
||||
/** Extract file parts from an A2A response. Walks parts[] + artifacts[].
|
||||
* Per the A2A spec a file part looks like:
|
||||
* { kind: "file", file: { name, mimeType, uri | bytes } }
|
||||
* We only surface parts that carry a `uri` — inline bytes would
|
||||
* require a different renderer (data URL) and are out of scope for
|
||||
* MVP. Names fall back to the URI's basename when absent. */
|
||||
export function extractFilesFromTask(task: Record<string, unknown>): ParsedFilePart[] {
|
||||
const out: ParsedFilePart[] = [];
|
||||
const pushFromParts = (parts: unknown) => {
|
||||
if (!Array.isArray(parts)) return;
|
||||
for (const raw of parts as Array<Record<string, unknown>>) {
|
||||
if (raw.kind !== "file" && raw.type !== "file") continue;
|
||||
const file = (raw.file ?? raw) as Record<string, unknown>;
|
||||
const uri = typeof file.uri === "string" ? file.uri : "";
|
||||
if (!uri) continue;
|
||||
const name = (typeof file.name === "string" && file.name) || basename(uri);
|
||||
out.push({
|
||||
name,
|
||||
uri,
|
||||
mimeType: typeof file.mimeType === "string" ? file.mimeType : undefined,
|
||||
size: typeof file.size === "number" ? file.size : undefined,
|
||||
});
|
||||
}
|
||||
};
|
||||
try {
|
||||
pushFromParts(task.parts);
|
||||
const artifacts = task.artifacts as Array<Record<string, unknown>> | undefined;
|
||||
if (artifacts) for (const a of artifacts) pushFromParts(a.parts);
|
||||
const status = task.status as Record<string, unknown> | undefined;
|
||||
if (status?.message) {
|
||||
const msg = status.message as Record<string, unknown>;
|
||||
pushFromParts(msg.parts);
|
||||
}
|
||||
// Some A2A servers wrap a non-task reply as
|
||||
// {result: {message: {parts: [...]}}} rather than {result: {parts}}.
|
||||
// Without this branch we'd silently drop file parts returned by
|
||||
// third-party implementations.
|
||||
const message = task.message as Record<string, unknown> | undefined;
|
||||
if (message) pushFromParts(message.parts);
|
||||
} catch {
|
||||
/* tolerate malformed shapes — chat falls through to text-only */
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
function basename(uri: string): string {
|
||||
const cleaned = uri.replace(/^workspace:/, "").replace(/^https?:\/\//, "");
|
||||
const slash = cleaned.lastIndexOf("/");
|
||||
return slash >= 0 ? cleaned.slice(slash + 1) : cleaned || "file";
|
||||
}
|
||||
|
||||
/** Extract user message text from an activity log request_body */
|
||||
export function extractRequestText(body: Record<string, unknown> | null): string {
|
||||
if (!body) return "";
|
||||
@ -41,22 +99,54 @@ export function extractRequestText(body: Record<string, unknown> | null): string
|
||||
return (parts?.[0]?.text as string) || "";
|
||||
}
|
||||
|
||||
/** Extract text from an activity log response_body (multiple possible formats) */
|
||||
/** Extract text from an activity log response_body (multiple possible formats).
|
||||
*
|
||||
* Collects from EVERY source — top-level `parts[].text`, `parts[].root.text`
|
||||
* (older nested shape), and `artifacts[].parts[].text` (task-shaped
|
||||
* replies) — and joins them with "\n". Two reasons to collect rather
|
||||
* than early-return:
|
||||
*
|
||||
* 1. Claude Code and other long-reply runtimes emit multiple text
|
||||
* parts in a single `parts` array. Returning just the first
|
||||
* silently truncates 15k-char briefs to their leading line
|
||||
* (observed UX A/B Lab Wave 1, 2026-04-25).
|
||||
*
|
||||
* 2. Some producers emit a summary in `parts[].text` AND details in
|
||||
* `artifacts[].parts[].text` (Hermes does this for tool calls).
|
||||
* The previous "first source wins" returned only the summary;
|
||||
* artifacts dropped silently. */
|
||||
export function extractResponseText(body: Record<string, unknown>): string {
|
||||
try {
|
||||
// {result: "text"} — from MCP server delegation logs
|
||||
if (typeof body.result === "string") return body.result;
|
||||
|
||||
// A2A JSON-RPC response: {result: {parts: [{kind: "text", text: "..."}]}}
|
||||
const result = body.result as Record<string, unknown> | undefined;
|
||||
if (result) {
|
||||
const collected: string[] = [];
|
||||
|
||||
// A2A JSON-RPC: {result: {parts: [{kind: "text", text: "..."}]}}
|
||||
const fromParts = extractTextsFromParts(result.parts);
|
||||
if (fromParts) collected.push(fromParts);
|
||||
|
||||
// Older nested shape: {parts: [{root: {text: "..."}}]}
|
||||
const parts = (result.parts || []) as Array<Record<string, unknown>>;
|
||||
const rootTexts: string[] = [];
|
||||
for (const p of parts) {
|
||||
const t = (p.text as string) || "";
|
||||
if (t) return t;
|
||||
const root = p.root as Record<string, unknown> | undefined;
|
||||
if (root?.text) return root.text as string;
|
||||
if (root?.text) rootTexts.push(root.text as string);
|
||||
}
|
||||
if (rootTexts.length > 0) collected.push(rootTexts.join("\n"));
|
||||
|
||||
// Task shape: {result: {artifacts: [{parts: [...]}]}}
|
||||
const artifacts = result.artifacts as Array<Record<string, unknown>> | undefined;
|
||||
if (artifacts) {
|
||||
for (const a of artifacts) {
|
||||
const t = extractTextsFromParts(a.parts);
|
||||
if (t) collected.push(t);
|
||||
}
|
||||
}
|
||||
|
||||
if (collected.length > 0) return collected.join("\n");
|
||||
}
|
||||
|
||||
// {task: "text"} — request body format, shouldn't be in response but handle it
|
||||
|
||||
@ -1,12 +1,38 @@
|
||||
/** One file attached to a chat message. Shared shape for both
|
||||
* directions: when a user attaches a file the UI uploads it and
|
||||
* stashes the returned metadata here; when an agent returns a
|
||||
* `kind: file` part in an A2A response, the parser populates the
|
||||
* same fields. `uri` uses the `workspace:<abs-path>` scheme the
|
||||
* server returns — the renderer translates that to a download
|
||||
* request against GET /workspaces/:id/chat/download. */
|
||||
export interface ChatAttachment {
|
||||
name: string;
|
||||
uri: string;
|
||||
mimeType?: string;
|
||||
size?: number;
|
||||
}
|
||||
|
||||
export interface ChatMessage {
|
||||
id: string;
|
||||
role: "user" | "agent" | "system";
|
||||
content: string;
|
||||
/** Attachments sent with or returned alongside this message. */
|
||||
attachments?: ChatAttachment[];
|
||||
timestamp: string; // ISO string for serialization
|
||||
}
|
||||
|
||||
export function createMessage(role: ChatMessage["role"], content: string): ChatMessage {
|
||||
return { id: crypto.randomUUID(), role, content, timestamp: new Date().toISOString() };
|
||||
export function createMessage(
|
||||
role: ChatMessage["role"],
|
||||
content: string,
|
||||
attachments?: ChatAttachment[],
|
||||
): ChatMessage {
|
||||
return {
|
||||
id: crypto.randomUUID(),
|
||||
role,
|
||||
content,
|
||||
attachments: attachments && attachments.length > 0 ? attachments : undefined,
|
||||
timestamp: new Date().toISOString(),
|
||||
};
|
||||
}
|
||||
|
||||
// appendMessageDeduped adds a ChatMessage to `prev` unless the tail
|
||||
@ -25,11 +51,23 @@ export function createMessage(role: ChatMessage["role"], content: string): ChatM
|
||||
// messages ("hi", "hi") from a real user/agent still render.
|
||||
export function appendMessageDeduped(prev: ChatMessage[], msg: ChatMessage, dedupeWindowMs = 3000): ChatMessage[] {
|
||||
const cutoff = Date.now() - dedupeWindowMs;
|
||||
const sig = attachmentSignature(msg.attachments);
|
||||
const alreadyThere = prev.some((m) => {
|
||||
if (m.role !== msg.role || m.content !== msg.content) return false;
|
||||
// Attachments participate in the dedupe key so a text-only push
|
||||
// doesn't shadow the file-carrying HTTP response (and vice versa).
|
||||
// When both carry the same text AND the same files, collapse.
|
||||
if (attachmentSignature(m.attachments) !== sig) return false;
|
||||
const t = Date.parse(m.timestamp);
|
||||
return !Number.isNaN(t) && t >= cutoff;
|
||||
});
|
||||
if (alreadyThere) return prev;
|
||||
return [...prev, msg];
|
||||
}
|
||||
|
||||
function attachmentSignature(atts: ChatAttachment[] | undefined): string {
|
||||
if (!atts || atts.length === 0) return "";
|
||||
// URI is the stable identity — name can differ across delivery
|
||||
// paths (agent vs our parser's basename fallback).
|
||||
return atts.map((a) => a.uri).sort().join("|");
|
||||
}
|
||||
|
||||
135
canvas/src/components/tabs/chat/uploads.ts
Normal file
135
canvas/src/components/tabs/chat/uploads.ts
Normal file
@ -0,0 +1,135 @@
|
||||
import { PLATFORM_URL } from "@/lib/api";
|
||||
import { getTenantSlug } from "@/lib/tenant";
|
||||
import type { ChatAttachment } from "./types";
|
||||
|
||||
/**
 * Upload chat attachment files to the workspace's /chat/uploads
 * endpoint and return the server's attachment metadata.
 *
 * Chat attachments are intentionally uploaded via a direct fetch()
 * instead of the `api.post` helper — `api.post` JSON-stringifies the
 * body, which would 500 on a Blob. Mirrors the header plumbing
 * (tenant slug, admin token, credentials) so SaaS + self-hosted
 * callers work the same way.
 *
 * @param workspaceId Target workspace id (becomes a URL path segment).
 * @param files Browser File objects from the picker/drop handler.
 * @returns The `files` array echoed back by the server; empty when
 *   `files` is empty (no network call is made in that case).
 * @throws Error on a non-2xx response, carrying the status code plus
 *   whatever body text could be read.
 */
export async function uploadChatFiles(
  workspaceId: string,
  files: File[],
): Promise<ChatAttachment[]> {
  if (files.length === 0) return [];

  const form = new FormData();
  // Pass the original filename explicitly so the server-side part name
  // survives regardless of the browser's default.
  for (const f of files) form.append("files", f, f.name);

  const headers: Record<string, string> = {};
  const slug = getTenantSlug();
  if (slug) headers["X-Molecule-Org-Slug"] = slug;
  const adminToken = process.env.NEXT_PUBLIC_ADMIN_TOKEN;
  if (adminToken) headers["Authorization"] = `Bearer ${adminToken}`;

  // Uploads legitimately take a while on cold cache (tar write +
  // docker cp into the container). 60s is comfortable for the 25MB/
  // 50MB caps the server enforces.
  const res = await fetch(`${PLATFORM_URL}/workspaces/${workspaceId}/chat/uploads`, {
    method: "POST",
    headers,
    body: form,
    credentials: "include",
    signal: AbortSignal.timeout(60_000),
  });
  if (!res.ok) {
    // Best-effort body read: a failed text() must not mask the status.
    const text = await res.text().catch(() => "");
    throw new Error(`upload failed: ${res.status} ${text}`);
  }
  const json = (await res.json()) as { files: ChatAttachment[] };
  return json.files ?? [];
}
|
||||
|
||||
/** Resolve a file URI into a browser-downloadable URL. Accepts:
|
||||
* - `workspace:<abs-path>` (our canonical form)
|
||||
* - `file:///workspace/...` (some agents emit this)
|
||||
* - `/workspace/...` (bare absolute path inside the container)
|
||||
* Everything that looks like an allowed-root container path is
|
||||
* rewritten to the authenticated /chat/download endpoint. HTTP(S)
|
||||
* URIs pass through unchanged so we can also render links to
|
||||
* artefacts hosted off-platform. Unknown schemes fall back to the
|
||||
* raw URI — the caller gets to decide how to render it. */
|
||||
export function resolveAttachmentHref(
|
||||
workspaceId: string,
|
||||
uri: string,
|
||||
): string {
|
||||
const containerPath = normalizeWorkspaceUri(uri);
|
||||
if (containerPath) {
|
||||
return `${PLATFORM_URL}/workspaces/${workspaceId}/chat/download?path=${encodeURIComponent(containerPath)}`;
|
||||
}
|
||||
return uri;
|
||||
}
|
||||
|
||||
/** Extracts the absolute container path from a workspace-scoped URI,
|
||||
* or null if the URI isn't a container path. The matching roots
|
||||
* mirror the server's `allowedRoots` allowlist. */
|
||||
const ALLOWED_CONTAINER_ROOTS = ["/configs", "/workspace", "/home", "/plugins"];
|
||||
|
||||
function normalizeWorkspaceUri(uri: string): string | null {
|
||||
let path: string | null = null;
|
||||
if (uri.startsWith("workspace:")) {
|
||||
path = uri.slice("workspace:".length);
|
||||
} else if (uri.startsWith("file:///")) {
|
||||
path = uri.slice("file://".length); // keep the leading slash
|
||||
} else if (uri.startsWith("/")) {
|
||||
path = uri;
|
||||
}
|
||||
if (!path) return null;
|
||||
// Only rewrite when the path lands in an allowed root; otherwise
|
||||
// return null so the caller falls through to raw-URI handling
|
||||
// (which will open a new tab for HTTP-ish schemes).
|
||||
for (const root of ALLOWED_CONTAINER_ROOTS) {
|
||||
if (path === root || path.startsWith(root + "/")) return path;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/** Trigger a browser download for an attachment. Uses fetch+blob
 * rather than an anchor navigation because the download endpoint
 * requires workspace auth — and the browser won't attach
 * `Authorization: Bearer` or `X-Molecule-Org-Slug` to a bare anchor
 * click. A 25MB per-file cap server-side keeps the blob buffer
 * bounded. HTTP(S) URIs skip the fetch path and open directly
 * since they're off-platform artefacts that we don't own auth for.
 *
 * @param workspaceId Workspace whose /chat/download endpoint is used.
 * @param attachment The attachment whose `uri` is resolved and fetched.
 * @throws Error when the authenticated download responds non-2xx.
 */
export async function downloadChatFile(
  workspaceId: string,
  attachment: ChatAttachment,
): Promise<void> {
  const href = resolveAttachmentHref(workspaceId, attachment.uri);
  // Re-run the normalizer to classify the URI: resolveAttachmentHref
  // returns the raw URI unchanged for anything that isn't an
  // allowed-root container path.
  const isContainerPath = normalizeWorkspaceUri(attachment.uri) !== null;
  if (!isContainerPath) {
    // External URL — let the browser navigate. Opens in new tab so
    // the canvas context survives a navigation. `href` here is the
    // raw URI (http(s), or anything else the agent sent back).
    window.open(href, "_blank", "noopener,noreferrer");
    return;
  }

  const headers: Record<string, string> = {};
  const slug = getTenantSlug();
  if (slug) headers["X-Molecule-Org-Slug"] = slug;
  const adminToken = process.env.NEXT_PUBLIC_ADMIN_TOKEN;
  if (adminToken) headers["Authorization"] = `Bearer ${adminToken}`;

  const res = await fetch(href, {
    headers,
    credentials: "include",
    signal: AbortSignal.timeout(60_000),
  });
  if (!res.ok) {
    throw new Error(`download failed: ${res.status}`);
  }
  const blob = await res.blob();
  // Revoke the object URL after the click — browsers hold the blob
  // until the URL is either revoked or the document unloads. 30s is
  // plenty of headroom for the click → save dialog round-trip.
  const url = URL.createObjectURL(blob);
  const a = document.createElement("a");
  a.href = url;
  a.download = attachment.name;
  a.rel = "noopener";
  // The anchor must be in the DOM for click() to trigger a download
  // in all browsers; it is removed immediately after.
  document.body.appendChild(a);
  a.click();
  a.remove();
  setTimeout(() => URL.revokeObjectURL(url), 30_000);
}
|
||||
170
canvas/src/hooks/useTemplateDeploy.tsx
Normal file
170
canvas/src/hooks/useTemplateDeploy.tsx
Normal file
@ -0,0 +1,170 @@
|
||||
"use client";
|
||||
|
||||
import { useCallback, useState, type ReactNode } from "react";
|
||||
import { api } from "@/lib/api";
|
||||
import {
|
||||
checkDeploySecrets,
|
||||
resolveRuntime,
|
||||
type PreflightResult,
|
||||
type Template,
|
||||
} from "@/lib/deploy-preflight";
|
||||
import { MissingKeysModal } from "@/components/MissingKeysModal";
|
||||
|
||||
/**
 * useTemplateDeploy — shared preflight + POST + modal wiring for
 * every surface that deploys a workspace from a template.
 *
 * Owns: `checkDeploySecrets` call, `MissingKeysModal` render, the
 * `POST /workspaces` that follows, and per-template `deploying`
 * state. Returns `modal` as a `ReactNode` ready to place inline.
 *
 * Why a hook rather than two copies: the runtime-fallback table
 * (`resolveRuntime`) and the preflight wiring were previously
 * copy-pasted between TemplatePalette and EmptyState. When the
 * copies drifted (palette had the full id-to-runtime map,
 * empty-state had only the `-default` strip), the two surfaces
 * could silently disagree on future templates that need a
 * non-identity mapping. Single owner closes the drift surface.
 */
export interface UseTemplateDeployOptions {
  /** Compute canvas coords for the new workspace. Called once per
   * successful deploy. Defaults to random coords in the [100, 500] ×
   * [100, 400] band, matching the sidebar palette's historical
   * placement. Override for surfaces that want deterministic
   * placement (e.g. EmptyState's first-deploy "center-ish" target). */
  canvasCoords?: () => { x: number; y: number };

  /** Optional post-deploy side effect — passed the id of the new
   * workspace. EmptyState uses this to auto-select the node and
   * flip the side panel to Chat so a fresh tenant sees something
   * useful. */
  onDeployed?: (workspaceId: string) => void;
}

/** Paired template + preflight result carried through the "user
 * clicked deploy → modal opens → keys saved → retry" loop. Named
 * (and deliberately unexported — it's internal hook state) so the
 * `useState` generic and any future signature change have a single
 * place to track. */
interface MissingKeysInfo {
  template: Template;
  preflight: PreflightResult;
}

export interface UseTemplateDeployResult {
  /** Template id currently being deployed (incl. the preflight
   * network call), or null when idle. Callers pass this to disable
   * the relevant button and show a spinner. */
  deploying: string | null;

  /** Last deploy error message, or null. Cleared on next `deploy`
   * call. */
  error: string | null;

  /** Kick off a deploy. Opens the missing-keys modal if preflight
   * returns not-ok; otherwise fires POST /workspaces directly. */
  deploy: (template: Template) => Promise<void>;

  /** The missing-keys modal, ready to place inline. Always non-null
   * (the underlying component self-gates on `open`), so the caller
   * can drop `{modal}` anywhere without conditionals. */
  modal: ReactNode;
}
|
||||
|
||||
/** Shared deploy-workflow hook — see the module doc above for the
 * full rationale. Runs preflight, then POST /workspaces, and owns
 * the missing-keys modal plus per-template `deploying` state. */
export function useTemplateDeploy(
  options: UseTemplateDeployOptions = {},
): UseTemplateDeployResult {
  const [deploying, setDeploying] = useState<string | null>(null);
  const [error, setError] = useState<string | null>(null);
  const [missingKeysInfo, setMissingKeysInfo] = useState<MissingKeysInfo | null>(null);

  const { canvasCoords, onDeployed } = options;

  /** Actually execute the POST /workspaces call. Split from `deploy`
   * so the "modal → keys added → retry" path can reuse it without
   * re-running preflight (the user just proved the keys are now set). */
  const executeDeploy = useCallback(
    async (template: Template) => {
      setDeploying(template.id);
      setError(null);
      try {
        // Default placement: random coords in the [100, 500] × [100, 400]
        // band (matches the documented fallback on UseTemplateDeployOptions).
        const coords = canvasCoords
          ? canvasCoords()
          : {
              x: Math.random() * 400 + 100,
              y: Math.random() * 300 + 100,
            };
        const ws = await api.post<{ id: string }>("/workspaces", {
          name: template.name,
          template: template.id,
          tier: template.tier,
          canvas: coords,
        });
        onDeployed?.(ws.id);
      } catch (e) {
        setError(e instanceof Error ? e.message : "Deploy failed");
      } finally {
        // Always clear the spinner, success or failure.
        setDeploying(null);
      }
    },
    [canvasCoords, onDeployed],
  );

  const deploy = useCallback(
    async (template: Template) => {
      setDeploying(template.id);
      setError(null);
      let preflight: PreflightResult;
      try {
        // Prefer the server-declared runtime; fall back to the legacy
        // id-to-runtime mapping for older /templates responses.
        const runtime = template.runtime ?? resolveRuntime(template.id);
        preflight = await checkDeploySecrets({
          runtime,
          models: template.models,
          required_env: template.required_env,
        });
      } catch (e) {
        // Preflight network failure used to strand `deploying` — the
        // button stayed disabled forever because the throw bypassed
        // the setDeploying(null) in the non-ok branch below. Any
        // future refactor that drops this try block will regress the
        // same way; keep it narrow around just the preflight call
        // so a successful preflight still lets executeDeploy own
        // its own error path.
        setError(e instanceof Error ? e.message : "Preflight check failed");
        setDeploying(null);
        return;
      }
      if (!preflight.ok) {
        // Missing keys: hand off to the modal and stop the spinner.
        setMissingKeysInfo({ template, preflight });
        setDeploying(null);
        return;
      }
      await executeDeploy(template);
    },
    [executeDeploy],
  );

  // No useCallback here — consumers call this on every render anyway
  // (it's placed inline in JSX), and useCallback's deps would
  // invalidate on every state change, making the memoisation a wash.
  // Plain ReactNode is simpler and equally performant.
  const modal: ReactNode = (
    <MissingKeysModal
      open={!!missingKeysInfo}
      missingKeys={missingKeysInfo?.preflight.missingKeys ?? []}
      providers={missingKeysInfo?.preflight.providers ?? []}
      runtime={missingKeysInfo?.preflight.runtime ?? ""}
      onKeysAdded={() => {
        if (missingKeysInfo) {
          const template = missingKeysInfo.template;
          setMissingKeysInfo(null);
          // Intentional fire-and-forget — executeDeploy manages
          // its own error state via setError.
          void executeDeploy(template);
        }
      }}
      onCancel={() => setMissingKeysInfo(null)}
    />
  );

  return { deploying, error, deploy, modal };
}
|
||||
@ -7,7 +7,7 @@ import { describe, it, expect, beforeEach, afterEach, vi } from "vitest";
|
||||
const mockFetch = vi.fn();
|
||||
globalThis.fetch = mockFetch;
|
||||
|
||||
import { api } from "../api";
|
||||
import { api, PlatformUnavailableError } from "../api";
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Helpers
|
||||
@ -380,3 +380,99 @@ describe("api – request timeout signal", () => {
|
||||
expect(sigA).not.toBe(sigB);
|
||||
});
|
||||
});
|
||||
|
||||
// ---------------------------------------------------------------------------
// PlatformUnavailableError classification
// ---------------------------------------------------------------------------
//
// When the platform's wsauth middleware can't reach Postgres/Redis to
// validate a token, it returns 503 + {error, code:"platform_unavailable"}.
// api.ts must surface that as a typed error so the page-level renderer
// can show a dedicated diagnostic instead of a generic 5xx toast.

describe("PlatformUnavailableError classification", () => {
  beforeEach(() => {
    mockFetch.mockReset();
  });

  // Simulates the structured 503 body the middleware emits when its
  // datastore lookup fails. `json()` rejects on purpose — the error
  // path in api.ts reads `text()`, never `json()`.
  function mock503Platform(detail = "platform datastore unavailable — retry shortly") {
    const body = JSON.stringify({ error: detail, code: "platform_unavailable" });
    mockFetch.mockResolvedValueOnce({
      ok: false,
      status: 503,
      json: () => Promise.reject(new Error("not used")),
      text: () => Promise.resolve(body),
    } as unknown as Response);
  }

  it("throws PlatformUnavailableError on 503 + code=platform_unavailable", async () => {
    mock503Platform();
    let thrown: unknown;
    try {
      await api.get("/workspaces");
    } catch (e) {
      thrown = e;
    }
    expect(thrown).toBeInstanceOf(PlatformUnavailableError);
    expect((thrown as PlatformUnavailableError).code).toBe("platform_unavailable");
  });

  it("preserves the server-provided error string as the Error message", async () => {
    mock503Platform("Postgres unreachable");
    try {
      await api.get("/workspaces");
    } catch (e) {
      expect(e).toBeInstanceOf(PlatformUnavailableError);
      expect((e as Error).message).toBe("Postgres unreachable");
      return;
    }
    throw new Error("expected to throw");
  });

  it("does NOT classify a generic 503 (no platform_unavailable code) as PlatformUnavailableError", async () => {
    // Generic upstream-busy 503 — should keep the legacy generic-Error
    // path so existing busy-retry UX isn't disrupted.
    mockFetch.mockResolvedValueOnce({
      ok: false,
      status: 503,
      json: () => Promise.reject(new Error("not used")),
      text: () => Promise.resolve(JSON.stringify({ error: "upstream busy" })),
    } as unknown as Response);
    try {
      await api.get("/workspaces/x/a2a");
    } catch (e) {
      expect(e).not.toBeInstanceOf(PlatformUnavailableError);
      expect((e as Error).message).toContain("503");
      return;
    }
    throw new Error("expected to throw");
  });

  it("does NOT classify on 500 (server kept legacy 500 for true internal errors)", async () => {
    mockFailure(500, "boom");
    try {
      await api.get("/workspaces");
    } catch (e) {
      expect(e).not.toBeInstanceOf(PlatformUnavailableError);
      return;
    }
    throw new Error("expected to throw");
  });

  it("falls back to generic Error when 503 body isn't JSON", async () => {
    // Plain-text 503 (e.g. a reverse proxy) must not crash the
    // classifier — JSON.parse failure falls through to generic Error.
    mockFetch.mockResolvedValueOnce({
      ok: false,
      status: 503,
      json: () => Promise.reject(new Error("not used")),
      text: () => Promise.resolve("Service Unavailable"),
    } as unknown as Response);
    try {
      await api.get("/workspaces");
    } catch (e) {
      expect(e).not.toBeInstanceOf(PlatformUnavailableError);
      expect((e as Error).message).toContain("503");
      return;
    }
    throw new Error("expected to throw");
  });
});
|
||||
|
||||
@ -107,11 +107,39 @@ async function request<T>(
|
||||
}
|
||||
if (!res.ok) {
|
||||
const text = await res.text();
|
||||
// Recognise the platform's structured "datastore unreachable"
|
||||
// shape (returned by wsauth_middleware.abortAuthLookupError when
|
||||
// Postgres/Redis is down). Surface as a typed error so callers
|
||||
// can render a dedicated diagnostic instead of a generic toast.
|
||||
if (res.status === 503 && text) {
|
||||
try {
|
||||
const parsed = JSON.parse(text) as { code?: string; error?: string };
|
||||
if (parsed.code === "platform_unavailable") {
|
||||
throw new PlatformUnavailableError(parsed.error || "platform datastore unavailable");
|
||||
}
|
||||
} catch (err) {
|
||||
// Re-throw the typed error if that's what we just constructed.
|
||||
// JSON.parse failures fall through to the generic Error below.
|
||||
if (err instanceof PlatformUnavailableError) throw err;
|
||||
}
|
||||
}
|
||||
throw new Error(`API ${method} ${path}: ${res.status} ${text}`);
|
||||
}
|
||||
return res.json();
|
||||
}
|
||||
|
||||
/** Thrown when the platform reports its datastore (Postgres/Redis) is
|
||||
* unreachable. Surface with a dedicated diagnostic UI rather than a
|
||||
* generic API-error toast — the user's next action is to check local
|
||||
* services, not to retry the API call. */
|
||||
export class PlatformUnavailableError extends Error {
|
||||
readonly code = "platform_unavailable" as const;
|
||||
constructor(message: string) {
|
||||
super(message);
|
||||
this.name = "PlatformUnavailableError";
|
||||
}
|
||||
}
|
||||
|
||||
export const api = {
|
||||
get: <T>(path: string, options?: RequestOptions) => request<T>("GET", path, undefined, 0, options),
|
||||
post: <T>(path: string, body?: unknown, options?: RequestOptions) => request<T>("POST", path, body, 0, options),
|
||||
|
||||
@ -33,6 +33,46 @@ export interface TemplateLike {
|
||||
required_env?: string[];
|
||||
}
|
||||
|
||||
/** Full /templates response shape shared by TemplatePalette (sidebar)
|
||||
* and EmptyState (welcome grid). Was previously re-declared in each
|
||||
* with subtly different fields — EmptyState's narrower shape silently
|
||||
* dropped `runtime`, `models`, and `required_env`, so the preflight
|
||||
* couldn't see provider alternatives the template declared. Keep this
|
||||
* the single source of truth. */
|
||||
export interface Template extends TemplateLike {
|
||||
id: string;
|
||||
name: string;
|
||||
description: string;
|
||||
tier: number;
|
||||
model: string;
|
||||
skills: string[];
|
||||
skill_count: number;
|
||||
}
|
||||
|
||||
/** Map from a template id to the runtime name the per-workspace
|
||||
* preflight expects. Used only when the server's `/templates`
|
||||
* response predates the `runtime` field on the summary (legacy
|
||||
* installs) — modern responses carry it verbatim. Strip `-default`
|
||||
* for the claude-code template and identity-map everything else
|
||||
* that matches our current runtime registry.
|
||||
*
|
||||
* Lives in the preflight module (not TemplatePalette) so EmptyState
|
||||
* uses the SAME fallback table. A previous duplication in both call
|
||||
* sites left EmptyState with only the `-default` suffix strip, which
|
||||
* would silently disagree with TemplatePalette on templates whose
|
||||
* id needs a non-identity mapping. */
|
||||
export function resolveRuntime(templateId: string): string {
|
||||
const runtimeMap: Record<string, string> = {
|
||||
langgraph: "langgraph",
|
||||
"claude-code-default": "claude-code",
|
||||
openclaw: "openclaw",
|
||||
deepagents: "deepagents",
|
||||
crewai: "crewai",
|
||||
autogen: "autogen",
|
||||
};
|
||||
return runtimeMap[templateId] ?? templateId.replace(/-default$/, "");
|
||||
}
|
||||
|
||||
export interface SecretEntry {
|
||||
key: string;
|
||||
has_value: boolean;
|
||||
|
||||
@ -5,27 +5,34 @@ import { describe, it, expect, beforeEach, vi } from "vitest";
|
||||
global.fetch = vi.fn();
|
||||
|
||||
import { useCanvasStore } from "../canvas";
|
||||
import type { WorkspaceData } from "../socket";
|
||||
import type { WorkspaceNodeData } from "../canvas";
|
||||
|
||||
function makeWS(overrides: Partial<WorkspaceData> & { id: string }): WorkspaceData {
|
||||
function makeWS(
|
||||
overrides: Partial<WorkspaceNodeData> & { id: string },
|
||||
): WorkspaceNodeData {
|
||||
// makeWS builds a minimal WorkspaceNodeData for tests that set state
|
||||
// directly on the store (bypassing hydrate). The `id` override is
|
||||
// ignored — node IDs live on the outer Node<> wrapper, not inside
|
||||
// `data`. It's accepted here so callers can keep their existing
|
||||
// `makeWS({ id: "ws-foo" })` call sites even though the id is only
|
||||
// used on the Node<> wrapper at the call site.
|
||||
void overrides.id;
|
||||
return {
|
||||
name: "WS",
|
||||
role: "agent",
|
||||
tier: 1,
|
||||
status: "online",
|
||||
agent_card: null,
|
||||
agentCard: null,
|
||||
url: "http://localhost:9000",
|
||||
parent_id: null,
|
||||
active_tasks: 0,
|
||||
last_error_rate: 0,
|
||||
last_sample_error: "",
|
||||
uptime_seconds: 60,
|
||||
current_task: "",
|
||||
x: 0,
|
||||
y: 0,
|
||||
parentId: null,
|
||||
activeTasks: 0,
|
||||
lastErrorRate: 0,
|
||||
lastSampleError: "",
|
||||
currentTask: "",
|
||||
collapsed: false,
|
||||
runtime: "",
|
||||
budget_limit: null,
|
||||
needsRestart: false,
|
||||
budgetLimit: null,
|
||||
...overrides,
|
||||
};
|
||||
}
|
||||
@ -148,13 +155,13 @@ describe("batchRestart — partial failure", () => {
|
||||
id: "ws-ok",
|
||||
type: "workspace",
|
||||
position: { x: 0, y: 0 },
|
||||
data: { ...makeWS({ id: "ws-ok" }), needsRestart: true } as WorkspaceData & { needsRestart: boolean },
|
||||
data: { ...makeWS({ id: "ws-ok" }), needsRestart: true } as WorkspaceNodeData,
|
||||
},
|
||||
{
|
||||
id: "ws-fail",
|
||||
type: "workspace",
|
||||
position: { x: 0, y: 0 },
|
||||
data: { ...makeWS({ id: "ws-fail" }), needsRestart: true } as WorkspaceData & { needsRestart: boolean },
|
||||
data: { ...makeWS({ id: "ws-fail" }), needsRestart: true } as WorkspaceNodeData,
|
||||
},
|
||||
],
|
||||
selectedNodeIds: new Set(["ws-ok", "ws-fail"]),
|
||||
@ -166,7 +173,7 @@ describe("batchRestart — partial failure", () => {
|
||||
});
|
||||
|
||||
const byId = Object.fromEntries(
|
||||
useCanvasStore.getState().nodes.map((n) => [n.id, n.data as WorkspaceData & { needsRestart?: boolean }])
|
||||
useCanvasStore.getState().nodes.map((n) => [n.id, n.data as WorkspaceNodeData])
|
||||
);
|
||||
expect(byId["ws-ok"].needsRestart).toBe(false);
|
||||
expect(byId["ws-fail"].needsRestart).toBe(true);
|
||||
@ -179,7 +186,7 @@ describe("batchRestart — partial failure", () => {
|
||||
id: "ws-fail",
|
||||
type: "workspace",
|
||||
position: { x: 0, y: 0 },
|
||||
data: { ...makeWS({ id: "ws-fail" }), needsRestart: true } as WorkspaceData & { needsRestart: boolean },
|
||||
data: { ...makeWS({ id: "ws-fail" }), needsRestart: true } as WorkspaceNodeData,
|
||||
},
|
||||
],
|
||||
selectedNodeIds: new Set(["ws-fail"]),
|
||||
|
||||
@ -67,7 +67,19 @@ describe("canvas-events – molecule:pan-to-node dispatch", () => {
|
||||
vi.restoreAllMocks();
|
||||
});
|
||||
|
||||
it("dispatches molecule:pan-to-node with the new nodeId for a NEW provision", () => {
|
||||
it("dispatches both molecule:pan-to-node AND molecule:fit-deploying-org for a NEW root-level provision", () => {
|
||||
// Two custom events are dispatched on NEW root-level provision:
|
||||
// 1. molecule:fit-deploying-org — tells useCanvasViewport to
|
||||
// frame the whole deploying subtree. Fires for root nodes
|
||||
// too (commit 5adc8a74) so the canvas centers the just-
|
||||
// landed root immediately instead of waiting for the
|
||||
// first child to arrive.
|
||||
// 2. molecule:pan-to-node — pans/zooms to the single node;
|
||||
// only for standalone creates (no parent), so org-import
|
||||
// children don't chase the spawn animation.
|
||||
// A previous version of this test expected only #2 and failed
|
||||
// when #1 was added for roots. If only one of these ever fires
|
||||
// again, this test should flag the regression.
|
||||
const { get, set } = makeStore([]);
|
||||
const dispatched: Event[] = [];
|
||||
const spy = vi.spyOn(window, "dispatchEvent").mockImplementation((e) => {
|
||||
@ -81,9 +93,15 @@ describe("canvas-events – molecule:pan-to-node dispatch", () => {
|
||||
set
|
||||
);
|
||||
|
||||
expect(dispatched).toHaveLength(1);
|
||||
expect(dispatched[0].type).toBe("molecule:pan-to-node");
|
||||
expect((dispatched[0] as CustomEvent).detail?.nodeId).toBe("ws-new");
|
||||
expect(dispatched).toHaveLength(2);
|
||||
const panEvent = dispatched.find((e) => e.type === "molecule:pan-to-node");
|
||||
const fitEvent = dispatched.find((e) => e.type === "molecule:fit-deploying-org");
|
||||
expect(panEvent, "molecule:pan-to-node should fire for standalone create").toBeDefined();
|
||||
expect(fitEvent, "molecule:fit-deploying-org should fire so the viewport frames the root").toBeDefined();
|
||||
expect((panEvent as CustomEvent).detail?.nodeId).toBe("ws-new");
|
||||
expect((fitEvent as CustomEvent).detail?.rootId).toBe("ws-new");
|
||||
|
||||
spy.mockRestore();
|
||||
});
|
||||
|
||||
it("does NOT dispatch molecule:pan-to-node when restarting an existing node", () => {
|
||||
|
||||
@ -149,6 +149,75 @@ describe("buildNodesAndEdges – parent + child workspaces", () => {
|
||||
});
|
||||
});
|
||||
|
||||
describe("buildNodesAndEdges – auto-rescue respects live grown parent size", () => {
|
||||
// Regression: child the user dragged into a user-grown area was
|
||||
// false-rescued by every periodic rehydrate (socket health check
|
||||
// every 30s) because the rescue heuristic used the initial
|
||||
// grid-derived parent bbox, not the currently-grown size. Result:
|
||||
// child snapped to a stale grid slot, then settled back ~1 frame
|
||||
// later when growParentsToFitChildren re-ran. Observed 2026-04-25
|
||||
// as "child jumps to weird location, then 30s later it's fine".
|
||||
|
||||
it("does NOT rescue a child placed inside the user-grown parent area", () => {
|
||||
// Parent's initial grid-derived size is small; user has since grown it
|
||||
// to 800×600. Child sits at relative (700, 400) — inside the grown
|
||||
// bbox but outside the initial bbox. Without currentParentSizes,
|
||||
// the rescue would re-place the child into a default grid slot.
|
||||
const parentAbs = { x: 100, y: 100 };
|
||||
const childAbs = { x: parentAbs.x + 700, y: parentAbs.y + 400 };
|
||||
const workspaces = [
|
||||
makeWS({ id: "parent", x: parentAbs.x, y: parentAbs.y }),
|
||||
makeWS({ id: "child", parent_id: "parent", x: childAbs.x, y: childAbs.y }),
|
||||
];
|
||||
const grownDims = new Map([
|
||||
["parent", { width: 800, height: 600 }],
|
||||
]);
|
||||
|
||||
const { nodes } = buildNodesAndEdges(workspaces, new Map(), grownDims);
|
||||
const child = nodes.find((n) => n.id === "child")!;
|
||||
// Child's relative position should match what we passed in.
|
||||
expect(child.position).toEqual({ x: 700, y: 400 });
|
||||
});
|
||||
|
||||
it("DOES rescue a child whose stored position is outside even the grown parent", () => {
|
||||
// Same parent but child is way outside (relative 5000, 5000).
|
||||
// The rescue must still fire — the heuristic isn't "always trust
|
||||
// the user", it's "trust the user up to the current parent bbox".
|
||||
const parentAbs = { x: 100, y: 100 };
|
||||
const childAbs = { x: parentAbs.x + 5000, y: parentAbs.y + 5000 };
|
||||
const workspaces = [
|
||||
makeWS({ id: "parent", x: parentAbs.x, y: parentAbs.y }),
|
||||
makeWS({ id: "child", parent_id: "parent", x: childAbs.x, y: childAbs.y }),
|
||||
];
|
||||
const grownDims = new Map([
|
||||
["parent", { width: 800, height: 600 }],
|
||||
]);
|
||||
|
||||
const { nodes } = buildNodesAndEdges(workspaces, new Map(), grownDims);
|
||||
const child = nodes.find((n) => n.id === "child")!;
|
||||
// Rescued: NOT the original (5000, 5000); some grid slot instead.
|
||||
expect(child.position.x).toBeLessThan(5000);
|
||||
expect(child.position.y).toBeLessThan(5000);
|
||||
});
|
||||
|
||||
it("falls back to initial-min bbox when no live size is provided (preserves legacy behavior)", () => {
|
||||
// Empty currentParentSizes — first hydrate or test without store
|
||||
// priming. Child outside the initial bbox should still be rescued.
|
||||
const parentAbs = { x: 100, y: 100 };
|
||||
const childAbs = { x: parentAbs.x + 700, y: parentAbs.y + 400 };
|
||||
const workspaces = [
|
||||
makeWS({ id: "parent", x: parentAbs.x, y: parentAbs.y }),
|
||||
makeWS({ id: "child", parent_id: "parent", x: childAbs.x, y: childAbs.y }),
|
||||
];
|
||||
|
||||
const { nodes } = buildNodesAndEdges(workspaces);
|
||||
const child = nodes.find((n) => n.id === "child")!;
|
||||
// Without a live size hint, the initial bbox applies — rescue
|
||||
// fires, child gets a fresh slot, NOT the user-supplied (700,400).
|
||||
expect(child.position).not.toEqual({ x: 700, y: 400 });
|
||||
});
|
||||
});
|
||||
|
||||
describe("buildNodesAndEdges – deeply nested hierarchy", () => {
|
||||
it("handles three levels of nesting", () => {
|
||||
const workspaces = [
|
||||
|
||||
@ -484,6 +484,70 @@ describe("removeNode", () => {
|
||||
});
|
||||
});
|
||||
|
||||
// ---------- removeSubtree ----------
|
||||
|
||||
describe("removeSubtree", () => {
|
||||
beforeEach(() => {
|
||||
useCanvasStore.getState().hydrate([
|
||||
makeWS({ id: "root" }),
|
||||
makeWS({ id: "mid", parent_id: "root" }),
|
||||
makeWS({ id: "leaf", parent_id: "mid" }),
|
||||
makeWS({ id: "sibling", parent_id: "root" }),
|
||||
makeWS({ id: "unrelated" }), // separate root
|
||||
]);
|
||||
});
|
||||
|
||||
it("removes the root and every descendant in one shot", () => {
|
||||
useCanvasStore.getState().removeSubtree("root");
|
||||
const ids = useCanvasStore
|
||||
.getState()
|
||||
.nodes.map((n) => n.id)
|
||||
.sort();
|
||||
expect(ids).toEqual(["unrelated"]);
|
||||
});
|
||||
|
||||
it("removes a mid-level node and its descendants but leaves siblings + ancestors", () => {
|
||||
useCanvasStore.getState().removeSubtree("mid");
|
||||
const ids = useCanvasStore
|
||||
.getState()
|
||||
.nodes.map((n) => n.id)
|
||||
.sort();
|
||||
expect(ids).toEqual(["root", "sibling", "unrelated"]);
|
||||
});
|
||||
|
||||
it("removing a leaf is a no-op cascade (just drops the leaf)", () => {
|
||||
useCanvasStore.getState().removeSubtree("leaf");
|
||||
const ids = useCanvasStore
|
||||
.getState()
|
||||
.nodes.map((n) => n.id)
|
||||
.sort();
|
||||
expect(ids).toEqual(["mid", "root", "sibling", "unrelated"]);
|
||||
});
|
||||
|
||||
it("clears selection when the selected node is anywhere in the removed subtree", () => {
|
||||
useCanvasStore.getState().selectNode("leaf");
|
||||
useCanvasStore.getState().removeSubtree("root");
|
||||
expect(useCanvasStore.getState().selectedNodeId).toBeNull();
|
||||
});
|
||||
|
||||
it("preserves selection when the selected node is outside the removed subtree", () => {
|
||||
useCanvasStore.getState().selectNode("unrelated");
|
||||
useCanvasStore.getState().removeSubtree("root");
|
||||
expect(useCanvasStore.getState().selectedNodeId).toBe("unrelated");
|
||||
});
|
||||
|
||||
it("drops edges incident to any removed node", () => {
|
||||
// The hydrate-built edges connect parent → child. After removing
|
||||
// `root`, no edge involving root/mid/leaf/sibling should remain.
|
||||
useCanvasStore.getState().removeSubtree("root");
|
||||
const remaining = useCanvasStore.getState().edges;
|
||||
for (const e of remaining) {
|
||||
expect(["root", "mid", "leaf", "sibling"]).not.toContain(e.source);
|
||||
expect(["root", "mid", "leaf", "sibling"]).not.toContain(e.target);
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
// ---------- isDescendant ----------
|
||||
|
||||
describe("isDescendant", () => {
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
import { describe, it, expect, beforeEach, afterEach, vi } from "vitest";
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Mock the canvas store before importing socket.ts
|
||||
// Mock the canvas store and api before importing socket.ts
|
||||
// ---------------------------------------------------------------------------
|
||||
vi.mock("../canvas", () => ({
|
||||
useCanvasStore: {
|
||||
@ -13,6 +13,7 @@ vi.mock("../canvas", () => ({
|
||||
},
|
||||
}));
|
||||
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Mock WebSocket
|
||||
// ---------------------------------------------------------------------------
|
||||
@ -76,7 +77,6 @@ function getLastWS(): MockWebSocket {
|
||||
beforeEach(() => {
|
||||
MockWebSocket.instances = [];
|
||||
vi.useFakeTimers();
|
||||
|
||||
// Reset mocked store state
|
||||
vi.mocked(useCanvasStore.getState).mockReturnValue({
|
||||
applyEvent: vi.fn(),
|
||||
@ -263,13 +263,59 @@ describe("WebSocket onclose – auto-reconnect", () => {
|
||||
const ws = getLastWS();
|
||||
ws.triggerClose();
|
||||
|
||||
// Fast-forward timers to trigger the reconnect
|
||||
vi.runAllTimers();
|
||||
// First reconnect attempt is scheduled at 1s (Math.min(1000 * 2^0,
|
||||
// 30000)). Advance just past that — vi.runAllTimers() would
|
||||
// additionally re-fire the fallback poll setInterval forever and
|
||||
// hit the 10000-timer abort.
|
||||
vi.advanceTimersByTime(1100);
|
||||
|
||||
expect(MockWebSocket.instances.length).toBeGreaterThan(1);
|
||||
});
|
||||
});
|
||||
|
||||
describe("HTTP fallback poll while WS unhealthy", () => {
|
||||
it("starts a setInterval after onclose so /workspaces stays fresh", () => {
|
||||
const setIntervalSpy = vi.spyOn(globalThis, "setInterval");
|
||||
connectSocket();
|
||||
const ws = getLastWS();
|
||||
ws.triggerClose();
|
||||
// The fallback poll runs at 10s; the reconnect uses setTimeout, so
|
||||
// any setInterval registered between connect and close must be the
|
||||
// fallback poll.
|
||||
const fallbackCalls = setIntervalSpy.mock.calls.filter(
|
||||
([, delay]) => delay === 10_000,
|
||||
);
|
||||
expect(fallbackCalls.length).toBeGreaterThan(0);
|
||||
setIntervalSpy.mockRestore();
|
||||
});
|
||||
|
||||
it("clears the fallback poll once the WS reconnects (onopen)", () => {
|
||||
const clearIntervalSpy = vi.spyOn(globalThis, "clearInterval");
|
||||
connectSocket();
|
||||
const ws = getLastWS();
|
||||
ws.triggerClose(); // starts fallback poll
|
||||
clearIntervalSpy.mockClear();
|
||||
// Advance past the first reconnect delay so a fresh ws exists,
|
||||
// then trigger its open.
|
||||
vi.advanceTimersByTime(1100);
|
||||
const ws2 = getLastWS();
|
||||
ws2.triggerOpen();
|
||||
expect(clearIntervalSpy).toHaveBeenCalled();
|
||||
clearIntervalSpy.mockRestore();
|
||||
});
|
||||
|
||||
it("clears the fallback poll on disconnect", () => {
|
||||
const clearIntervalSpy = vi.spyOn(globalThis, "clearInterval");
|
||||
connectSocket();
|
||||
const ws = getLastWS();
|
||||
ws.triggerClose(); // starts fallback poll
|
||||
clearIntervalSpy.mockClear();
|
||||
disconnectSocket();
|
||||
expect(clearIntervalSpy).toHaveBeenCalled();
|
||||
clearIntervalSpy.mockRestore();
|
||||
});
|
||||
});
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// onerror handler
|
||||
// ---------------------------------------------------------------------------
|
||||
@ -328,3 +374,45 @@ describe("health check", () => {
|
||||
clearIntervalSpy.mockRestore();
|
||||
});
|
||||
});
|
||||
|
||||
// Rehydrate dedup logic itself is exercised by `RehydrateDedup` unit
|
||||
// tests in this file (below). End-to-end coupling through the
|
||||
// dynamic-imported `@/lib/api` was non-trivial under our existing
|
||||
// fake-timer setup; isolating the gate in a pure helper keeps
|
||||
// regression coverage without that mocking complexity.
|
||||
|
||||
import { RehydrateDedup } from "../socket";
|
||||
|
||||
describe("RehydrateDedup", () => {
|
||||
it("first call passes the gate (no prior fetch)", () => {
|
||||
const d = new RehydrateDedup(1500);
|
||||
expect(d.shouldSkip(0)).toBe(false);
|
||||
});
|
||||
|
||||
it("blocks while a fetch is in flight", () => {
|
||||
const d = new RehydrateDedup(1500);
|
||||
d.beginFetch();
|
||||
expect(d.shouldSkip(100)).toBe(true);
|
||||
});
|
||||
|
||||
it("blocks within the post-completion window", () => {
|
||||
const d = new RehydrateDedup(1500);
|
||||
d.beginFetch();
|
||||
d.completeFetch(1_000);
|
||||
// 1100 - 1000 = 100 < 1500 → skip
|
||||
expect(d.shouldSkip(1_100)).toBe(true);
|
||||
// 2600 - 1000 = 1600 > 1500 → allow
|
||||
expect(d.shouldSkip(2_600)).toBe(false);
|
||||
});
|
||||
|
||||
it("a completed fetch followed by another beginFetch blocks for the new in-flight", () => {
|
||||
const d = new RehydrateDedup(1500);
|
||||
d.beginFetch();
|
||||
d.completeFetch(1_000);
|
||||
// First wait out the dedup window
|
||||
expect(d.shouldSkip(2_600)).toBe(false);
|
||||
d.beginFetch();
|
||||
// Now a second fetch is in flight; further calls block again
|
||||
expect(d.shouldSkip(2_700)).toBe(true);
|
||||
});
|
||||
});
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
import type { Node, Edge } from "@xyflow/react";
|
||||
import type { WSMessage } from "./socket";
|
||||
import type { WorkspaceNodeData } from "./canvas";
|
||||
import { extractResponseText } from "@/components/tabs/chat/message-parser";
|
||||
import { extractResponseText, extractFilesFromTask } from "@/components/tabs/chat/message-parser";
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Monotonically increasing counter used to assign grid positions.
|
||||
@ -21,13 +21,46 @@ import { extractResponseText } from "@/components/tabs/chat/message-parser";
|
||||
//
|
||||
// A monotonic counter is immune to deletions: it only ever increases.
|
||||
// ---------------------------------------------------------------------------
|
||||
import { appendClass, removeClass, scheduleNodeClassRemoval } from "./classNames";
|
||||
|
||||
let _provisioningSequence = 0;
|
||||
|
||||
/** Reset the sequence counter — exposed for test teardown only. */
|
||||
export function resetProvisioningSequence(): void {
|
||||
_provisioningSequence = 0;
|
||||
_pendingOnline.clear();
|
||||
}
|
||||
|
||||
/** WORKSPACE_ONLINE events that arrived BEFORE the matching
|
||||
* WORKSPACE_PROVISIONING — buffered here so the late-arriving
|
||||
* provision event can immediately flip to the correct status
|
||||
* instead of leaving the node stuck as "provisioning" forever.
|
||||
* Cleared when applied, or on module reset (tests). */
|
||||
const _pendingOnline = new Set<string>();
|
||||
|
||||
/** Debounced parent-grow. Each child arrival schedules this; the
|
||||
* timer keeps resetting as more siblings land, so the actual
|
||||
* width/height update runs ONCE after arrivals go quiet. Avoids
|
||||
* the visible size-pulse that happened when growParentsToFitChildren
|
||||
* ran per event. */
|
||||
let _growTimer: ReturnType<typeof setTimeout> | null = null;
|
||||
function scheduleParentGrow(): void {
|
||||
if (typeof window === "undefined") return;
|
||||
if (_growTimer) clearTimeout(_growTimer);
|
||||
_growTimer = setTimeout(() => {
|
||||
_growTimer = null;
|
||||
import("./canvas").then(({ useCanvasStore }) => {
|
||||
useCanvasStore.getState().growParentsToFitChildren?.();
|
||||
});
|
||||
}, 300);
|
||||
}
|
||||
|
||||
// (absoluteNodePosition was used by an earlier "spawn from parent"
|
||||
// revision that subtracted parent absolute coords from server-sent
|
||||
// absolute child coords. The server now ships parent-relative coords
|
||||
// directly, so the walk is no longer needed. Deleted rather than
|
||||
// kept as dead code.)
|
||||
|
||||
/**
|
||||
* Standalone event handler extracted from the canvas store.
|
||||
* Applies a single WebSocket event to the current node/edge state.
|
||||
@ -38,7 +71,7 @@ export function handleCanvasEvent(
|
||||
nodes: Node<WorkspaceNodeData>[];
|
||||
edges: Edge[];
|
||||
selectedNodeId: string | null;
|
||||
agentMessages: Record<string, Array<{ id: string; content: string; timestamp: string }>>;
|
||||
agentMessages: Record<string, Array<{ id: string; content: string; timestamp: string; attachments?: Array<{ name: string; uri: string; mimeType?: string; size?: number }> }>>;
|
||||
},
|
||||
set: (partial: Record<string, unknown>) => void,
|
||||
): void {
|
||||
@ -47,14 +80,44 @@ export function handleCanvasEvent(
|
||||
switch (msg.event) {
|
||||
case "WORKSPACE_ONLINE": {
|
||||
const existing = nodes.find((n) => n.id === msg.workspace_id);
|
||||
if (existing) {
|
||||
set({
|
||||
nodes: nodes.map((n) =>
|
||||
n.id === msg.workspace_id
|
||||
? { ...n, data: { ...n.data, status: "online" } }
|
||||
: n
|
||||
),
|
||||
});
|
||||
if (!existing) {
|
||||
// PROVISIONING event hasn't been applied yet (WS reorder or
|
||||
// this tab joined mid-deploy). Buffer so the later PROVISIONING
|
||||
// handler can flip status in one pass instead of leaving the
|
||||
// node stuck in "provisioning" forever.
|
||||
_pendingOnline.add(msg.workspace_id);
|
||||
break;
|
||||
}
|
||||
// Flip incoming edge from blueprint → laser so the link is
|
||||
// drawn solid the moment this child is live. The laser class
|
||||
// plays the stroke-dashoffset keyframe once; after ~500ms the
|
||||
// edge falls back to the default solid style (see
|
||||
// org-deploy.css and the follow-up setTimeout below).
|
||||
const updatedEdges = edges.map((e) =>
|
||||
e.target === msg.workspace_id && e.className?.includes("mol-deploy-edge-blueprint")
|
||||
? { ...e, className: "mol-deploy-edge-laser" }
|
||||
: e,
|
||||
);
|
||||
set({
|
||||
edges: updatedEdges,
|
||||
nodes: nodes.map((n) =>
|
||||
n.id === msg.workspace_id
|
||||
? { ...n, data: { ...n.data, status: "online" } }
|
||||
: n,
|
||||
),
|
||||
});
|
||||
// Remove the laser class after its keyframe ends so the edge
|
||||
// settles into the app's default solid styling. Fire-and-forget.
|
||||
if (typeof window !== "undefined") {
|
||||
const targetEdgeId = `${existing.data.parentId ?? ""}-${msg.workspace_id}`;
|
||||
window.setTimeout(() => {
|
||||
const s = get();
|
||||
set({
|
||||
edges: s.edges.map((e) =>
|
||||
e.id === targetEdgeId ? { ...e, className: undefined } : e,
|
||||
),
|
||||
});
|
||||
}, 600);
|
||||
}
|
||||
break;
|
||||
}
|
||||
@ -113,25 +176,73 @@ export function handleCanvasEvent(
|
||||
),
|
||||
});
|
||||
} else {
|
||||
// Spread new nodes in a grid so they don't stack at the viewport origin.
|
||||
// Use the monotonic _provisioningSequence counter (not nodes.length) so
|
||||
// deletions never cause two live nodes to share a grid slot.
|
||||
const GRID_COLS = 4;
|
||||
const COL_SPACING = 320;
|
||||
const ROW_SPACING = 160;
|
||||
const GRID_ORIGIN_X = 100;
|
||||
const GRID_ORIGIN_Y = 100;
|
||||
const idx = _provisioningSequence++;
|
||||
const x = GRID_ORIGIN_X + (idx % GRID_COLS) * COL_SPACING;
|
||||
const y = GRID_ORIGIN_Y + Math.floor(idx / GRID_COLS) * ROW_SPACING;
|
||||
// Payload may carry parent_id + final x/y (org import broadcasts
|
||||
// these so the canvas can animate the "spawn from parent" motion).
|
||||
// Standalone workspace creates still omit them — fall back to the
|
||||
// grid-slot behaviour that handled that case historically.
|
||||
const parentIdRaw = (msg.payload.parent_id as string | undefined) ?? null;
|
||||
const finalX = msg.payload.x as number | undefined;
|
||||
const finalY = msg.payload.y as number | undefined;
|
||||
|
||||
let spawnX: number;
|
||||
let spawnY: number;
|
||||
let targetX: number;
|
||||
let targetY: number;
|
||||
let parentId: string | null = null;
|
||||
|
||||
// Place the node at its final slot immediately — no
|
||||
// spring-from-parent motion. The earlier "materialize from
|
||||
// parent then tween to target" was expensive (two set()
|
||||
// calls + rAF) and produced wrong offsets because the
|
||||
// server sends absolute coords computed against the template's
|
||||
// own coord system while the client had placed the parent at
|
||||
// a grid slot, so the target math always landed off-grid.
|
||||
// Now: server coords are parent-relative (see org_import.go),
|
||||
// we trust them verbatim.
|
||||
const parentInStore = parentIdRaw
|
||||
? nodes.find((n) => n.id === parentIdRaw)
|
||||
: undefined;
|
||||
if (parentIdRaw && parentInStore && finalX !== undefined && finalY !== undefined) {
|
||||
targetX = finalX;
|
||||
targetY = finalY;
|
||||
parentId = parentIdRaw;
|
||||
} else {
|
||||
// Standalone create OR org-child whose parent hasn't arrived
|
||||
// yet (rare WS reorder) — monotonic-grid placement. The
|
||||
// follow-up hydrate pass reconciles parent_id + the correct
|
||||
// nested position if parent lands later.
|
||||
const GRID_COLS = 4;
|
||||
const COL_SPACING = 320;
|
||||
const ROW_SPACING = 160;
|
||||
const GRID_ORIGIN_X = 100;
|
||||
const GRID_ORIGIN_Y = 100;
|
||||
const idx = _provisioningSequence++;
|
||||
targetX = GRID_ORIGIN_X + (idx % GRID_COLS) * COL_SPACING;
|
||||
targetY = GRID_ORIGIN_Y + Math.floor(idx / GRID_COLS) * ROW_SPACING;
|
||||
}
|
||||
spawnX = targetX;
|
||||
spawnY = targetY;
|
||||
|
||||
// Parent→child relationship is already visible via React
|
||||
// Flow's nested rendering (the child card sits INSIDE the
|
||||
// parent container). An explicit edge on top of that was
|
||||
// visual double-counting and made the canvas look busy;
|
||||
// removed per demo feedback. A2A edges (showA2AEdges) still
|
||||
// render when enabled — those represent runtime traffic,
|
||||
// which nesting doesn't express.
|
||||
set({
|
||||
nodes: [
|
||||
...nodes,
|
||||
{
|
||||
id: msg.workspace_id,
|
||||
type: "workspaceNode",
|
||||
position: { x, y },
|
||||
position: { x: spawnX, y: spawnY },
|
||||
// React Flow's parentId (distinct from data.parentId)
|
||||
// triggers parent-relative positioning. Set it when the
|
||||
// server told us this is an org-import child so the
|
||||
// node renders nested inside the parent container.
|
||||
...(parentId ? { parentId } : {}),
|
||||
className: "mol-deploy-spawn",
|
||||
data: {
|
||||
name: (msg.payload.name as string) ?? "New Workspace",
|
||||
status: "provisioning",
|
||||
@ -143,7 +254,7 @@ export function handleCanvasEvent(
|
||||
lastErrorRate: 0,
|
||||
lastSampleError: "",
|
||||
url: "",
|
||||
parentId: null,
|
||||
parentId, // data.parentId mirrors React Flow's parentId
|
||||
currentTask: "",
|
||||
runtime: (msg.payload.runtime as string) ?? "",
|
||||
needsRestart: false,
|
||||
@ -152,8 +263,76 @@ export function handleCanvasEvent(
|
||||
],
|
||||
});
|
||||
|
||||
// Pan the canvas to the new node
|
||||
// Grow the parent to fit the just-landed child. DEBOUNCED
|
||||
// across rapid sibling arrivals — firing width/height updates
|
||||
// on every child made the parent card visibly pulse in size
|
||||
// as each kid landed, which read as the parent "flashing
|
||||
// around". One grow pass ~300ms after the last arrival
|
||||
// coalesces the whole burst into a single layout change.
|
||||
if (parentId && typeof window !== "undefined") {
|
||||
scheduleParentGrow();
|
||||
}
|
||||
// Parent-border pulse removed per demo feedback — the soft
|
||||
// box-shadow ring on each arrival compounded with the size
|
||||
// grow to make the whole parent card look unstable. The
|
||||
// dim-light signal on the provisioning child is sufficient
|
||||
// acknowledgement that something is happening.
|
||||
|
||||
// Remove the one-shot spawn class after the keyframe ends so
|
||||
// future re-renders don't replay it.
|
||||
scheduleNodeClassRemoval(msg.workspace_id, "mol-deploy-spawn", 400, get, set);
|
||||
|
||||
// Auto-pan+zoom to the whole deploying org after each
|
||||
// arrival so the user always sees the full picture — unless
|
||||
// they've panned themselves (handled by the viewport hook,
|
||||
// which aborts the fit when the user moved after the last
|
||||
// auto-fit). Event name matches the existing handler in
|
||||
// useCanvasViewport that knows how to compute subtree bounds.
|
||||
//
|
||||
// Fire for roots too (not just children) so the canvas
|
||||
// centers on the just-landed root immediately instead of
|
||||
// waiting for the first child to arrive ~2s later. The
|
||||
// viewport hook walks UP to find the true root, so passing
|
||||
// the node's own id when there's no parent is equivalent
|
||||
// to passing the root.
|
||||
if (typeof window !== "undefined") {
|
||||
window.dispatchEvent(
|
||||
new CustomEvent("molecule:fit-deploying-org", {
|
||||
detail: { rootId: parentIdRaw ?? msg.workspace_id },
|
||||
}),
|
||||
);
|
||||
}
|
||||
|
||||
// Race handling: if a WORKSPACE_ONLINE event beat the
|
||||
// matching PROVISIONING to this tab, the online flag was
|
||||
// buffered in _pendingOnline. Apply it now so the node
|
||||
// doesn't stay stuck as "provisioning" forever.
|
||||
//
|
||||
// Only flip to "online" if the current status is still
|
||||
// "provisioning" at drain time. Otherwise a WORKSPACE_DEGRADED
|
||||
// / FAILED / PAUSED that arrived between the set() above and
|
||||
// the scheduled drain would be silently clobbered — the
|
||||
// buffered ONLINE is stale by then.
|
||||
if (_pendingOnline.has(msg.workspace_id)) {
|
||||
_pendingOnline.delete(msg.workspace_id);
|
||||
if (typeof window !== "undefined") {
|
||||
window.setTimeout(() => {
|
||||
const s = get();
|
||||
set({
|
||||
nodes: s.nodes.map((n) =>
|
||||
n.id === msg.workspace_id && n.data.status === "provisioning"
|
||||
? { ...n, data: { ...n.data, status: "online" } }
|
||||
: n,
|
||||
),
|
||||
});
|
||||
}, 0);
|
||||
}
|
||||
}
|
||||
|
||||
// Pan the canvas to the new node (standalone create only —
|
||||
// during an org import, zooming to every child chases the
|
||||
// spawn animation around the viewport which is jarring).
|
||||
if (!parentIdRaw && typeof window !== "undefined") {
|
||||
window.dispatchEvent(
|
||||
new CustomEvent("molecule:pan-to-node", {
|
||||
detail: { nodeId: msg.workspace_id },
|
||||
@ -252,12 +431,19 @@ export function handleCanvasEvent(
|
||||
}
|
||||
|
||||
case "A2A_RESPONSE": {
|
||||
// A2A proxy completed — extract response text and store as agent message.
|
||||
// This gives the ChatTab instant response delivery via WebSocket instead of polling.
|
||||
// A2A proxy completed — extract response text AND any `kind: file`
|
||||
// parts. Without the file extraction, agent-returned attachments
|
||||
// delivered via this WebSocket path would disappear (the canvas
|
||||
// would render a text-only message while the HTTP fallback
|
||||
// rendered the same reply with download chips, depending on
|
||||
// which delivery path raced to completion first).
|
||||
const responseBody = msg.payload.response_body as Record<string, unknown> | undefined;
|
||||
if (responseBody) {
|
||||
const text = extractResponseText(responseBody);
|
||||
if (text) {
|
||||
const attachments = extractFilesFromTask(
|
||||
(responseBody.result ?? responseBody) as Record<string, unknown>,
|
||||
);
|
||||
if (text || attachments.length > 0) {
|
||||
const { agentMessages } = get();
|
||||
const existing = agentMessages[msg.workspace_id] || [];
|
||||
set({
|
||||
@ -265,7 +451,12 @@ export function handleCanvasEvent(
|
||||
...agentMessages,
|
||||
[msg.workspace_id]: [
|
||||
...existing,
|
||||
{ id: crypto.randomUUID(), content: text, timestamp: new Date().toISOString() },
|
||||
{
|
||||
id: crypto.randomUUID(),
|
||||
content: text,
|
||||
timestamp: new Date().toISOString(),
|
||||
attachments: attachments.length > 0 ? attachments : undefined,
|
||||
},
|
||||
],
|
||||
},
|
||||
});
|
||||
|
||||
@ -280,6 +280,15 @@ export function computeAutoLayout(
|
||||
* Accepts an optional layoutOverrides map (from computeAutoLayout) to override
|
||||
* positions for workspaces that were at 0,0.
|
||||
*
|
||||
* `currentParentSizes` carries the LIVE measured/grown dimensions of parent
|
||||
* nodes from the existing client store. The auto-rescue heuristic below
|
||||
* (line ~445) compares each child's stored relative position against its
|
||||
* parent's bbox; without the live size, the bbox is whatever the
|
||||
* grid-derived initial min-size formula produced. That falsely rescued
|
||||
* children dragged into the user-grown area on every periodic rehydrate
|
||||
* (socket.ts:87 fires every 30s if no WS events seen) — observed
|
||||
* 2026-04-25 as "child jumps to weird location, then settles 30s later".
|
||||
*
|
||||
* Parent/child rendering model: every workspace is a first-class React Flow
|
||||
* node (full card). When a workspace has parent_id set, its RF `parentId` is
|
||||
* set to the parent's id and its position is stored RELATIVE to the parent
|
||||
@ -290,7 +299,8 @@ export function computeAutoLayout(
|
||||
*/
|
||||
export function buildNodesAndEdges(
|
||||
workspaces: WorkspaceData[],
|
||||
layoutOverrides: Map<string, { x: number; y: number }> = new Map()
|
||||
layoutOverrides: Map<string, { x: number; y: number }> = new Map(),
|
||||
currentParentSizes: Map<string, { width: number; height: number }> = new Map(),
|
||||
): {
|
||||
nodes: Node<WorkspaceNodeData>[];
|
||||
edges: Edge[];
|
||||
@ -439,7 +449,23 @@ export function buildNodesAndEdges(
|
||||
// child.left = 500 < parent.right = 800 → overlaps → kept
|
||||
// legacy huge positive (position.x = 50000):
|
||||
// child.left = 50000 >= parent.right → no overlap → rescued
|
||||
const psize = parentSize.get(ws.parent_id!)!;
|
||||
const initialPsize = parentSize.get(ws.parent_id!)!;
|
||||
// Use the larger of (initial min, currently grown) for the bbox
|
||||
// test. Without this, a child the user dragged into the grown
|
||||
// area appears "outside" the (smaller) initial bbox and the
|
||||
// rescue below false-fires on every periodic rehydrate, jumping
|
||||
// the child to a stale grid slot. Live grown dims arrive via
|
||||
// currentParentSizes from hydrate(); on first load (empty
|
||||
// store), the map is empty and we fall back to the initial min
|
||||
// — preserving the original rescue semantics for genuinely
|
||||
// detached legacy data.
|
||||
const liveParentSize = currentParentSizes.get(ws.parent_id!);
|
||||
const psize = liveParentSize
|
||||
? {
|
||||
width: Math.max(initialPsize.width, liveParentSize.width),
|
||||
height: Math.max(initialPsize.height, liveParentSize.height),
|
||||
}
|
||||
: initialPsize;
|
||||
const myW = subtreeSize.get(ws.id)?.width ?? CHILD_DEFAULT_WIDTH;
|
||||
const myH = subtreeSize.get(ws.id)?.height ?? CHILD_DEFAULT_HEIGHT;
|
||||
const overlapsX =
|
||||
|
||||
@ -138,6 +138,16 @@ interface CanvasState {
|
||||
updateNodeData: (id: string, data: Partial<WorkspaceNodeData>) => void;
|
||||
restartWorkspace: (id: string) => Promise<void>;
|
||||
removeNode: (id: string) => void;
|
||||
/** Remove a node AND every descendant in one atomic update. Mirrors
|
||||
* the server-side cascade — `DELETE /workspaces/:id?confirm=true`
|
||||
* drops the row plus every descendant in one transaction. The
|
||||
* caller (Canvas / DetailsTab delete handlers) used to call
|
||||
* `removeNode(rootId)` and rely on per-descendant WORKSPACE_REMOVED
|
||||
* WS events to clear the rest. When the WS is unhealthy those
|
||||
* events never arrive and the children orphan to the root until a
|
||||
* manual page refresh — `removeSubtree` makes the cascade
|
||||
* WS-independent. */
|
||||
removeSubtree: (rootId: string) => void;
|
||||
setDragOverNode: (id: string | null) => void;
|
||||
nestNode: (draggedId: string, targetId: string | null) => Promise<void>;
|
||||
isDescendant: (ancestorId: string, nodeId: string) => boolean;
|
||||
@ -177,6 +187,15 @@ interface CanvasState {
|
||||
setPendingDelete: (
|
||||
v: { id: string; name: string; hasChildren: boolean; children: { id: string; name: string }[] } | null
|
||||
) => void;
|
||||
/** Node IDs whose DELETE request is in flight. Populated the moment
|
||||
* the user confirms a cascade delete; drained as WORKSPACE_REMOVED
|
||||
* events strip the nodes (or all-at-once on request failure). Lets
|
||||
* the canvas render the "don't touch — something is happening"
|
||||
* treatment (dim + non-draggable) during the network round trip
|
||||
* and the server-side cascade, matching the deploy-lock UX. */
|
||||
deletingIds: Set<string>;
|
||||
beginDelete: (ids: Iterable<string>) => void;
|
||||
endDelete: (ids: Iterable<string>) => void;
|
||||
searchOpen: boolean;
|
||||
setSearchOpen: (open: boolean) => void;
|
||||
viewport: { x: number; y: number; zoom: number };
|
||||
@ -190,8 +209,8 @@ interface CanvasState {
|
||||
batchPause: () => Promise<void>;
|
||||
batchDelete: () => Promise<void>;
|
||||
/** Agent-pushed messages keyed by workspace ID. ChatTab consumes and clears these. */
|
||||
agentMessages: Record<string, Array<{ id: string; content: string; timestamp: string }>>;
|
||||
consumeAgentMessages: (workspaceId: string) => Array<{ id: string; content: string; timestamp: string }>;
|
||||
agentMessages: Record<string, Array<{ id: string; content: string; timestamp: string; attachments?: Array<{ name: string; uri: string; mimeType?: string; size?: number }> }>>;
|
||||
consumeAgentMessages: (workspaceId: string) => Array<{ id: string; content: string; timestamp: string; attachments?: Array<{ name: string; uri: string; mimeType?: string; size?: number }> }>;
|
||||
/** WebSocket connection status — drives the live indicator in the Toolbar. */
|
||||
wsStatus: "connected" | "connecting" | "disconnected";
|
||||
setWsStatus: (status: "connected" | "connecting" | "disconnected") => void;
|
||||
@ -309,6 +328,17 @@ export const useCanvasStore = create<CanvasState>((set, get) => ({
|
||||
closeContextMenu: () => set({ contextMenu: null }),
|
||||
pendingDelete: null,
|
||||
setPendingDelete: (v) => set({ pendingDelete: v }),
|
||||
deletingIds: new Set<string>(),
|
||||
beginDelete: (ids) => {
|
||||
const next = new Set(get().deletingIds);
|
||||
for (const id of ids) next.add(id);
|
||||
set({ deletingIds: next });
|
||||
},
|
||||
endDelete: (ids) => {
|
||||
const next = new Set(get().deletingIds);
|
||||
for (const id of ids) next.delete(id);
|
||||
set({ deletingIds: next });
|
||||
},
|
||||
searchOpen: false,
|
||||
setSearchOpen: (open) => set({ searchOpen: open }),
|
||||
agentMessages: {},
|
||||
@ -775,9 +805,69 @@ export const useCanvasStore = create<CanvasState>((set, get) => ({
|
||||
});
|
||||
},
|
||||
|
||||
removeSubtree: (rootId) => {
|
||||
const { nodes, edges, selectedNodeId } = get();
|
||||
// Build a parentId → childIds index once so the descent is O(N),
|
||||
// not O(N · depth). The store typically holds <500 nodes; even
|
||||
// doing a linear scan per parent would be fine, but the index
|
||||
// keeps the cost predictable as orgs grow.
|
||||
const childrenByParent = new Map<string, string[]>();
|
||||
for (const n of nodes) {
|
||||
const p = n.data.parentId ?? null;
|
||||
if (p === null) continue;
|
||||
const arr = childrenByParent.get(p);
|
||||
if (arr) arr.push(n.id);
|
||||
else childrenByParent.set(p, [n.id]);
|
||||
}
|
||||
const removed = new Set<string>([rootId]);
|
||||
const stack = [rootId];
|
||||
while (stack.length) {
|
||||
const cur = stack.pop()!;
|
||||
const kids = childrenByParent.get(cur);
|
||||
if (!kids) continue;
|
||||
for (const k of kids) {
|
||||
if (!removed.has(k)) {
|
||||
removed.add(k);
|
||||
stack.push(k);
|
||||
}
|
||||
}
|
||||
}
|
||||
set({
|
||||
nodes: nodes.filter((n) => !removed.has(n.id)),
|
||||
edges: edges.filter((e) => !removed.has(e.source) && !removed.has(e.target)),
|
||||
selectedNodeId:
|
||||
selectedNodeId !== null && removed.has(selectedNodeId)
|
||||
? null
|
||||
: selectedNodeId,
|
||||
});
|
||||
},
|
||||
|
||||
hydrate: (workspaces: WorkspaceData[]) => {
|
||||
const layoutOverrides = computeAutoLayout(workspaces);
|
||||
const { nodes, edges } = buildNodesAndEdges(workspaces, layoutOverrides);
|
||||
// Carry the live measured/grown parent sizes from the existing
|
||||
// store into the rebuild. buildNodesAndEdges runs an auto-rescue
|
||||
// pass on each child to detach orphans whose stored relative
|
||||
// position falls outside the parent bbox — without the live
|
||||
// size, the bbox is the initial grid-derived minimum, which
|
||||
// false-flags any child the user has dragged into the
|
||||
// user-grown area. Periodic rehydrate (socket.ts health check,
|
||||
// 30s) was reasserting the rescue against legitimate user
|
||||
// placements, causing the "child jumps to weird location, then
|
||||
// settles" symptom.
|
||||
const current = get().nodes;
|
||||
const currentParentSizes = new Map<string, { width: number; height: number }>();
|
||||
for (const n of current) {
|
||||
const w = (n.measured?.width ?? n.width) as number | undefined;
|
||||
const h = (n.measured?.height ?? n.height) as number | undefined;
|
||||
if (typeof w === "number" && typeof h === "number") {
|
||||
currentParentSizes.set(n.id, { width: w, height: h });
|
||||
}
|
||||
}
|
||||
const { nodes, edges } = buildNodesAndEdges(
|
||||
workspaces,
|
||||
layoutOverrides,
|
||||
currentParentSizes,
|
||||
);
|
||||
set({ nodes, edges });
|
||||
for (const [nodeId, { x, y }] of layoutOverrides) {
|
||||
api.patch(`/workspaces/${nodeId}`, { x, y }).catch(() => {});
|
||||
|
||||
53
canvas/src/store/classNames.ts
Normal file
53
canvas/src/store/classNames.ts
Normal file
@ -0,0 +1,53 @@
|
||||
/**
|
||||
* React Flow className helpers shared across the store and canvas
|
||||
* hooks. React Flow's Node.className / Edge.className is a single
|
||||
* space-separated string, so every call site was previously doing
|
||||
* the same `.split/.filter/.join` dance — centralise it here so
|
||||
* any future class manipulation follows one policy.
|
||||
*/
|
||||
|
||||
/** Add `cls` to the existing className, de-duplicating. Returns
|
||||
* the (possibly new) string; undefined/empty input → just `cls`. */
|
||||
export function appendClass(existing: string | undefined, cls: string): string {
|
||||
if (!existing) return cls;
|
||||
const parts = existing.split(/\s+/).filter(Boolean);
|
||||
if (parts.includes(cls)) return existing;
|
||||
parts.push(cls);
|
||||
return parts.join(" ");
|
||||
}
|
||||
|
||||
/** Remove `cls` if present. Returns the (possibly empty) string. */
|
||||
export function removeClass(existing: string | undefined, cls: string): string {
|
||||
if (!existing) return "";
|
||||
return existing
|
||||
.split(/\s+/)
|
||||
.filter((c) => c && c !== cls)
|
||||
.join(" ");
|
||||
}
|
||||
|
||||
/** Schedule `removeClass(nodeId, cls)` on the `nodes` slice after
|
||||
* `delayMs`. The callers used to inline this twice — once for
|
||||
* parent-pulse cleanup, once for spawn-class cleanup — and now
|
||||
* share the same impl so future one-shot animation classes land
|
||||
* consistently.
|
||||
*
|
||||
* No-ops when `window` is undefined (SSR). Accepts the store's
|
||||
* get/set pair directly rather than a store reference so it
|
||||
* composes with the existing handleCanvasEvent signature. */
|
||||
export function scheduleNodeClassRemoval(
|
||||
nodeId: string,
|
||||
cls: string,
|
||||
delayMs: number,
|
||||
get: () => { nodes: Array<{ id: string; className?: string }> },
|
||||
set: (partial: Record<string, unknown>) => void,
|
||||
): void {
|
||||
if (typeof window === "undefined") return;
|
||||
window.setTimeout(() => {
|
||||
const state = get();
|
||||
set({
|
||||
nodes: state.nodes.map((n) =>
|
||||
n.id === nodeId ? { ...n, className: removeClass(n.className, cls) } : n,
|
||||
),
|
||||
});
|
||||
}, delayMs);
|
||||
}
|
||||
@ -12,30 +12,129 @@ export interface WSMessage {
|
||||
payload: Record<string, unknown>;
|
||||
}
|
||||
|
||||
/** Window during which a freshly-completed rehydrate is reused
|
||||
* instead of firing a new GET. Picked to absorb the connect→health-
|
||||
* check sequence (rehydrate runs once on onopen, then the first
|
||||
* health-check tick fires immediately after — both should share the
|
||||
* same fetch) without holding back legitimately-spaced rehydrates
|
||||
* triggered by genuine WS silence later. */
|
||||
const REHYDRATE_DEDUP_WINDOW_MS = 1_500;
|
||||
|
||||
/** Pure dedup gate for rehydrate(). Tracks two states:
|
||||
*
|
||||
* - in-flight (between beginFetch and completeFetch): every
|
||||
* shouldSkip returns true.
|
||||
* - post-completion window (now < completedAt + windowMs):
|
||||
* shouldSkip returns true.
|
||||
*
|
||||
* Extracted from ReconnectingSocket so the gate is unit-testable
|
||||
* without mocking dynamic imports or fake timers. The class itself
|
||||
* is stateful but tiny — instances are not shared across sockets. */
|
||||
export class RehydrateDedup {
|
||||
private inFlight = false;
|
||||
// -Infinity so the very first shouldSkip(now) call always passes
|
||||
// (now - (-Infinity) > windowMs). Initializing to 0 would false-
|
||||
// trip on test runs where now is also 0 (vi.useFakeTimers default
|
||||
// clock) AND on real runs in the first 1.5s after epoch on
|
||||
// clock-skewed systems.
|
||||
private completedAt = Number.NEGATIVE_INFINITY;
|
||||
constructor(private readonly windowMs: number) {}
|
||||
|
||||
shouldSkip(now: number): boolean {
|
||||
if (this.inFlight) return true;
|
||||
if (now - this.completedAt < this.windowMs) return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
beginFetch(): void {
|
||||
this.inFlight = true;
|
||||
}
|
||||
|
||||
completeFetch(now: number = Date.now()): void {
|
||||
this.inFlight = false;
|
||||
this.completedAt = now;
|
||||
}
|
||||
}
|
||||
|
||||
/** Cadence for the HTTP fallback rehydrate that runs while the WS is
|
||||
* in connecting/disconnected limbo. 10s is short enough that the user
|
||||
* sees STARTING → ONLINE within one tick after the platform finishes
|
||||
* provisioning, but long enough to not pound /workspaces if the
|
||||
* network truly is down. The dedup gate inside rehydrate() collapses
|
||||
* this against the post-onopen rehydrate, so reconnect doesn't pay
|
||||
* for a duplicate fetch. */
|
||||
const FALLBACK_POLL_MS = 10_000;
|
||||
|
||||
class ReconnectingSocket {
|
||||
private ws: WebSocket | null = null;
|
||||
private attempt = 0;
|
||||
private url: string;
|
||||
private lastEventTime = 0;
|
||||
private healthCheckTimer: ReturnType<typeof setInterval> | null = null;
|
||||
private reconnectTimer: ReturnType<typeof setTimeout> | null = null;
|
||||
// Polls /workspaces while the WS is unhealthy so the canvas reflects
|
||||
// truth even when realtime events aren't arriving. Without this the
|
||||
// store can stay frozen for minutes — e.g. workspaces transition
|
||||
// STARTING → ONLINE on the platform but the canvas keeps showing
|
||||
// STARTING until the WS finally reconnects, triggering false
|
||||
// "Provisioning Timeout" banners on already-online workspaces.
|
||||
private fallbackPollTimer: ReturnType<typeof setInterval> | null = null;
|
||||
// disposed signals that disconnect() has been called. Any in-flight
|
||||
// reconnect / handshake must abort early rather than attach to a
|
||||
// socket the caller no longer owns — otherwise React StrictMode's
|
||||
// effect double-invoke (and any future intentional disconnect)
|
||||
// leaves a zombie WebSocket alive forever.
|
||||
private disposed = false;
|
||||
// In-flight singleton + dedup window for rehydrate. Two reasons to
|
||||
// collapse rapid calls:
|
||||
// 1. connect.onopen fires rehydrate immediately, and the very next
|
||||
// health-check tick may fire it again before the first GET
|
||||
// returns — wasted round trip + rebuild churn that resets the
|
||||
// mid-flight UI state (auto-rescue heuristics, grow passes).
|
||||
// 2. Future call sites (a manual "Refresh" button, post-import
|
||||
// hydrate, error-recovery rehydrate) might pile up.
|
||||
// Keeping rehydrate idempotent at the call-site level means each
|
||||
// caller can fire-and-forget without coordinating.
|
||||
private rehydrateInFlight: Promise<void> | null = null;
|
||||
private rehydrateDedup = new RehydrateDedup(REHYDRATE_DEDUP_WINDOW_MS);
|
||||
|
||||
constructor(url: string) {
|
||||
this.url = url;
|
||||
}
|
||||
|
||||
connect() {
|
||||
if (this.disposed) return;
|
||||
useCanvasStore.getState().setWsStatus("connecting");
|
||||
this.ws = new WebSocket(this.url);
|
||||
// Start the HTTP fallback poll up-front, not just on onclose. Two
|
||||
// scenarios this guards against:
|
||||
// 1. The very first connect attempt — onclose hasn't fired yet
|
||||
// because we never had a successful onopen.
|
||||
// 2. A failed handshake where the browser takes tens of seconds
|
||||
// to surface as onclose (Chrome can hold a SYN-SENT WebSocket
|
||||
// open for ~75s before giving up).
|
||||
// Idempotent — startFallbackPoll early-returns if a timer is
|
||||
// already running, so calling it from both places is cheap.
|
||||
this.startFallbackPoll();
|
||||
const ws = new WebSocket(this.url);
|
||||
this.ws = ws;
|
||||
|
||||
this.ws.onopen = () => {
|
||||
ws.onopen = () => {
|
||||
if (this.disposed || this.ws !== ws) {
|
||||
// Late-open on an abandoned socket. Close it cleanly; the
|
||||
// caller already moved on.
|
||||
try { ws.close(); } catch { /* noop */ }
|
||||
return;
|
||||
}
|
||||
this.attempt = 0;
|
||||
this.lastEventTime = Date.now();
|
||||
useCanvasStore.getState().setWsStatus("connected");
|
||||
this.stopFallbackPoll();
|
||||
this.rehydrate();
|
||||
this.startHealthCheck();
|
||||
};
|
||||
|
||||
this.ws.onmessage = (event) => {
|
||||
ws.onmessage = (event) => {
|
||||
if (this.disposed || this.ws !== ws) return;
|
||||
this.lastEventTime = Date.now();
|
||||
try {
|
||||
const msg: WSMessage = JSON.parse(event.data);
|
||||
@ -45,15 +144,21 @@ class ReconnectingSocket {
|
||||
}
|
||||
};
|
||||
|
||||
this.ws.onclose = () => {
|
||||
ws.onclose = () => {
|
||||
// Fired on intentional close (disposed) OR server/network drop.
|
||||
// Only schedule a reconnect when the socket is still live AND
|
||||
// corresponds to the WS we just tore down (prevents a stale
|
||||
// onclose from a zombie socket from re-arming the loop).
|
||||
if (this.disposed || this.ws !== ws) return;
|
||||
this.stopHealthCheck();
|
||||
useCanvasStore.getState().setWsStatus("connecting");
|
||||
this.startFallbackPoll();
|
||||
const delay = Math.min(1000 * 2 ** this.attempt, 30000);
|
||||
this.attempt++;
|
||||
setTimeout(() => this.connect(), delay);
|
||||
this.reconnectTimer = setTimeout(() => this.connect(), delay);
|
||||
};
|
||||
|
||||
this.ws.onerror = () => {
|
||||
ws.onerror = () => {
|
||||
// Suppressed — onclose handles reconnection. onerror fires before onclose
|
||||
// and the Event object doesn't contain useful info (serializes to {}).
|
||||
};
|
||||
@ -80,20 +185,78 @@ class ReconnectingSocket {
|
||||
}
|
||||
}
|
||||
|
||||
private async rehydrate() {
|
||||
try {
|
||||
const { api } = await import("@/lib/api");
|
||||
const workspaces = await api.get<WorkspaceData[]>("/workspaces");
|
||||
useCanvasStore.getState().hydrate(workspaces);
|
||||
} catch {
|
||||
// Rehydration failed — will retry on next health check cycle
|
||||
/** While the WS is in connecting/disconnected limbo, poll /workspaces
|
||||
* so the store stays fresh. The reconnect attempts continue in
|
||||
* parallel; whichever recovers first wins. rehydrate()'s own dedup
|
||||
* gate prevents this from racing with the open-time rehydrate. */
|
||||
private startFallbackPoll() {
|
||||
if (this.fallbackPollTimer) return;
|
||||
this.fallbackPollTimer = setInterval(() => {
|
||||
if (this.disposed) {
|
||||
this.stopFallbackPoll();
|
||||
return;
|
||||
}
|
||||
void this.rehydrate();
|
||||
}, FALLBACK_POLL_MS);
|
||||
}
|
||||
|
||||
private stopFallbackPoll() {
|
||||
if (this.fallbackPollTimer) {
|
||||
clearInterval(this.fallbackPollTimer);
|
||||
this.fallbackPollTimer = null;
|
||||
}
|
||||
}
|
||||
|
||||
private rehydrate(): Promise<void> {
|
||||
// Reuse an in-flight fetch — a second caller during the GET
|
||||
// shouldn't kick off a parallel one.
|
||||
if (this.rehydrateInFlight) return this.rehydrateInFlight;
|
||||
if (this.rehydrateDedup.shouldSkip(Date.now())) {
|
||||
return Promise.resolve();
|
||||
}
|
||||
|
||||
// beginFetch lives INSIDE the IIFE's try so any future code added
|
||||
// between gate-check and IIFE-construction can't throw and leave
|
||||
// the gate stuck at inFlight=true forever. Today there's nothing
|
||||
// that can throw here, but the cost of being defensive is one
|
||||
// extra microtask of "in flight" status — negligible.
|
||||
const promise = (async () => {
|
||||
this.rehydrateDedup.beginFetch();
|
||||
try {
|
||||
const { api } = await import("@/lib/api");
|
||||
const workspaces = await api.get<WorkspaceData[]>("/workspaces");
|
||||
if (this.disposed) return;
|
||||
useCanvasStore.getState().hydrate(workspaces);
|
||||
} catch {
|
||||
// Rehydration failed — will retry on next health check cycle.
|
||||
} finally {
|
||||
this.rehydrateDedup.completeFetch(Date.now());
|
||||
this.rehydrateInFlight = null;
|
||||
}
|
||||
})();
|
||||
this.rehydrateInFlight = promise;
|
||||
return promise;
|
||||
}
|
||||
|
||||
disconnect() {
|
||||
this.disposed = true;
|
||||
this.stopHealthCheck();
|
||||
this.stopFallbackPoll();
|
||||
if (this.reconnectTimer) {
|
||||
clearTimeout(this.reconnectTimer);
|
||||
this.reconnectTimer = null;
|
||||
}
|
||||
if (this.ws) {
|
||||
this.ws.close();
|
||||
// Detach listeners before close() so we don't route the close
|
||||
// event through our onclose → scheduleReconnect path. Belt +
|
||||
// braces on top of the `disposed` check, because StrictMode
|
||||
// cycles through so fast that an attached onclose can fire
|
||||
// after disposed=true is set but before this assignment runs.
|
||||
this.ws.onopen = null;
|
||||
this.ws.onmessage = null;
|
||||
this.ws.onclose = null;
|
||||
this.ws.onerror = null;
|
||||
try { this.ws.close(); } catch { /* noop */ }
|
||||
this.ws = null;
|
||||
}
|
||||
useCanvasStore.getState().setWsStatus("disconnected");
|
||||
|
||||
151
canvas/src/styles/org-deploy.css
Normal file
151
canvas/src/styles/org-deploy.css
Normal file
@ -0,0 +1,151 @@
|
||||
/**
|
||||
* Org-deploy animation module.
|
||||
*
|
||||
* Loaded globally (see app/globals.css). All values come from
|
||||
* theme-tokens.css so a theme swap needs zero edits here.
|
||||
*
|
||||
* Component contract — canvas/src/components/canvas code adds
|
||||
* these classes to the React Flow node / edge wrappers:
|
||||
*
|
||||
* .mol-deploy-spawn One-shot entry animation on a
|
||||
* node that just arrived. Applied
|
||||
* by canvas-events.ts for 600 ms
|
||||
* then removed.
|
||||
* .mol-deploy-shimmer Persistent border shimmer while
|
||||
* a node's status === "provisioning".
|
||||
* Removed when status flips to
|
||||
* "online" / "failed".
|
||||
* .mol-deploy-parent-pulse One-shot acknowledgement pulse
|
||||
* on the parent when a child lands.
|
||||
* Applied for parent-pulse duration
|
||||
* then removed.
|
||||
* .mol-deploy-locked Applied to every non-root node
|
||||
* inside a deploying org so it dims
|
||||
* and the cursor signals un-
|
||||
* draggable.
|
||||
* .mol-deploy-root-complete One-shot pop + glow on the root
|
||||
* when the last child comes online.
|
||||
*
|
||||
* Edges use React Flow edge data to pick styling — see the
|
||||
* selectors below the node keyframes.
|
||||
*
|
||||
* Reduced motion is handled at the bottom via the same guard
|
||||
* globals.css already installs for other animations.
|
||||
*/
|
||||
|
||||
/* ────────────────────────────────────────────────────────
|
||||
Keyframes — kept terse; values come from variables so
|
||||
duplication across themes is nil.
|
||||
──────────────────────────────────────────────────────── */
|
||||
|
||||
@keyframes mol-deploy-spawn {
|
||||
/* Gentle fade-in-place. The earlier "spring from parent" motion
|
||||
collided with the server-computed grid positions (parent and
|
||||
child used different coord origins once the parent was placed
|
||||
on the client's grid instead of the template's absolute
|
||||
coords), which landed children in wrong slots. Keeping the
|
||||
animation to a simple opacity+scale lets the server's layout
|
||||
win — and reads as "node arrived" without the over-engineered
|
||||
spring. */
|
||||
from { opacity: 0; transform: scale(0.85); }
|
||||
to { opacity: 1; transform: scale(1); }
|
||||
}
|
||||
|
||||
/* mol-deploy-parent-pulse keyframe removed with the effect — the
|
||||
box-shadow expanding ring made the parent card visibly "flash" on
|
||||
every child arrival when the grow pass also bumped width/height.
|
||||
Kept as a deliberate non-class so the theme-tokens vars can drop
|
||||
with it on the next theme pass. */
|
||||
|
||||
@keyframes mol-deploy-root-complete {
|
||||
0% { transform: scale(1); box-shadow: 0 0 0 0 transparent; }
|
||||
40% { transform: scale(var(--mol-deploy-root-scale-peak)); box-shadow: var(--mol-deploy-root-glow); }
|
||||
100% { transform: scale(1); box-shadow: 0 0 0 0 transparent; }
|
||||
}
|
||||
|
||||
/* (mol-deploy-edge-draw keyframe removed with the edge effects.) */
|
||||
|
||||
@keyframes mol-deploy-cancel-pulse {
|
||||
0%, 100% { box-shadow: 0 0 0 0 var(--mol-deploy-cancel-ring); }
|
||||
50% { box-shadow: 0 0 0 10px transparent; }
|
||||
}
|
||||
|
||||
/* ────────────────────────────────────────────────────────
|
||||
Node classes
|
||||
──────────────────────────────────────────────────────── */
|
||||
|
||||
/* Qualify with .react-flow__node so this rule beats the default
|
||||
`node-appear` animation defined later in globals.css. Without
|
||||
the qualifier, CSS source-order wins and the standard
|
||||
node-appear overrides our scale/opacity keyframe, visually
|
||||
dropping the "spawn from parent" motion. */
|
||||
.react-flow__node.mol-deploy-spawn {
|
||||
animation:
|
||||
mol-deploy-spawn var(--mol-duration-spawn) var(--mol-easing-bounce-out) both;
|
||||
}
|
||||
|
||||
/* Provisioning signal — the earlier rotating conic-gradient border
|
||||
read as distracting "spinner" clutter during a 15-child org
|
||||
import (dozens of them spinning simultaneously). A static dim
|
||||
(reduced opacity + saturation) communicates "this one is still
|
||||
coming online" without the motion noise. The locked-child style
|
||||
already uses the same pattern — we reuse the filter values so
|
||||
a provisioning ROOT node and a locked CHILD look consistent. */
|
||||
.mol-deploy-shimmer {
|
||||
filter: saturate(var(--mol-deploy-locked-saturation)) opacity(var(--mol-deploy-locked-opacity));
|
||||
transition: filter var(--mol-duration-base) var(--mol-easing-standard);
|
||||
}
|
||||
|
||||
.mol-deploy-locked {
|
||||
filter: saturate(var(--mol-deploy-locked-saturation)) opacity(var(--mol-deploy-locked-opacity));
|
||||
cursor: not-allowed !important;
|
||||
transition: filter var(--mol-duration-base) var(--mol-easing-standard);
|
||||
}
|
||||
|
||||
.react-flow__node.mol-deploy-root-complete {
|
||||
animation: mol-deploy-root-complete var(--mol-duration-root-complete) var(--mol-easing-emphasize) both;
|
||||
}
|
||||
|
||||
/* ────────────────────────────────────────────────────────
|
||||
Edge classes — intentionally inert.
|
||||
|
||||
Earlier revisions painted incoming edges with a dashed-blueprint
|
||||
→ animated-laser-trace effect as the child landed. User feedback
|
||||
on the first demo was "remove connection line effects" — the
|
||||
moving dashes read as noise during a multi-child deploy. Keeping
|
||||
the class hooks so canvas-events.ts event handlers can still
|
||||
apply/strip them without blowing up, but the styling is a no-op
|
||||
(edges fall through to the default styling in globals.css).
|
||||
If a future demo wants the effect back, wire the rules below.
|
||||
──────────────────────────────────────────────────────── */
|
||||
|
||||
/* ────────────────────────────────────────────────────────
|
||||
Cancel-deployment pill — rendered by OrgCancelButton.tsx
|
||||
attached to the root node during deploy. Class `.mol-deploy-cancel`
|
||||
is always applied; the pulse is additive.
|
||||
──────────────────────────────────────────────────────── */
|
||||
.mol-deploy-cancel {
|
||||
background: var(--mol-deploy-cancel-bg);
|
||||
color: var(--mol-deploy-cancel-text);
|
||||
transition: background var(--mol-duration-fast) var(--mol-easing-standard);
|
||||
}
|
||||
.mol-deploy-cancel:hover {
|
||||
background: var(--mol-deploy-cancel-bg-hover);
|
||||
}
|
||||
.mol-deploy-cancel-pulse {
|
||||
animation: mol-deploy-cancel-pulse var(--mol-duration-parent-pulse) var(--mol-easing-standard) infinite;
|
||||
}
|
||||
|
||||
/* ────────────────────────────────────────────────────────
|
||||
Reduced-motion guard — mirror globals.css's policy so this
|
||||
module stays WCAG 2.3.3 compliant without relying on the
|
||||
global file being loaded first.
|
||||
──────────────────────────────────────────────────────── */
|
||||
@media (prefers-reduced-motion: reduce) {
  /* Kill every animation this module owns; !important outranks the
     per-class animation shorthands declared above. */
  .react-flow__node.mol-deploy-spawn,
  .react-flow__node.mol-deploy-root-complete,
  .mol-deploy-cancel-pulse {
    animation: none !important;
  }
  /* Dim-light signal is already static; no override needed. */
}
|
||||
69
canvas/src/styles/theme-tokens.css
Normal file
69
canvas/src/styles/theme-tokens.css
Normal file
@ -0,0 +1,69 @@
|
||||
/**
|
||||
* Canvas theme tokens — single source of truth for colors, durations,
|
||||
* easings, and sizes used by every animated / stateful canvas
|
||||
* component. Importable from any stylesheet; individual feature
|
||||
* modules (org-deploy.css, settings-panel.css, ...) only reference
|
||||
* variables defined here so a future theme swap touches this one
|
||||
* file.
|
||||
*
|
||||
* Adding a theme:
|
||||
* Put a scoped override block like `[data-theme="light"] { ... }`
|
||||
* and set only the tokens whose values differ from the default
|
||||
* dark theme. Unset tokens inherit the default.
|
||||
*
|
||||
* Naming convention:
|
||||
* --mol-<feature>-<semantic-role> → values the user sees
|
||||
* --mol-duration-<name> → motion timings
|
||||
* --mol-easing-<name> → motion curves
|
||||
* Prefix `mol-` avoids collisions with Tailwind / React Flow vars.
|
||||
*/
|
||||
|
||||
:root {
  /* ────────────────────────────────────────────────────────
     Motion primitives — pick one of these; don't hardcode ms
     values in feature stylesheets. If a new feature genuinely
     needs a bespoke duration, add a token here and reference it.
     ──────────────────────────────────────────────────────── */
  --mol-duration-fast: 150ms;
  --mol-duration-base: 300ms;
  --mol-duration-spawn: 350ms;
  --mol-duration-root-complete: 700ms;
  --mol-duration-fit-view: 800ms;
  /* NOTE(review): org-deploy.css animates the cancel pill with
     var(--mol-duration-parent-pulse), which is NOT declared here.
     Confirm it is defined elsewhere (globals.css?) — an unresolved
     custom property makes that animation shorthand invalid at
     computed-value time and the pulse silently never runs. */

  --mol-easing-standard: cubic-bezier(0.2, 0, 0, 1);
  --mol-easing-bounce-out: cubic-bezier(0.2, 0.8, 0.2, 1.05);
  --mol-easing-emphasize: cubic-bezier(0.3, 0, 0, 1);

  /* ────────────────────────────────────────────────────────
     Org-deploy animation palette (dark theme defaults)
     ──────────────────────────────────────────────────────── */

  /* Root-complete moment — one-shot glow when the last child lands. */
  --mol-deploy-root-glow: 0 0 36px 6px rgba(59, 130, 246, 0.55);
  --mol-deploy-root-scale-peak: 1.05;

  /* Locked-child visual — non-root nodes during deploy cannot be
     dragged; this dims them so the user's attention stays on the
     active spawn. Saturation + opacity instead of a badge keeps
     the card recognisable while signalling "not available". */
  --mol-deploy-locked-saturation: 0.55;
  --mol-deploy-locked-opacity: 0.78;

  /* Cancel-deployment pill attached to the root node. Red, pulsing,
     one button that kills the whole tree. */
  --mol-deploy-cancel-bg: rgba(220, 38, 38, 0.92); /* red-600/92 */
  --mol-deploy-cancel-bg-hover: rgba(239, 68, 68, 1); /* red-500 */
  --mol-deploy-cancel-ring: rgba(239, 68, 68, 0.45);
  --mol-deploy-cancel-text: #fff;
}
|
||||
|
||||
/* Example template for a future light theme. Intentionally empty
|
||||
— product hasn't shipped a light theme yet but this shows the
|
||||
override surface any future theme must fill. Uncomment + tune
|
||||
when the light theme lands.
|
||||
[data-theme="light"] {
|
||||
--mol-deploy-shimmer-from: rgba(37, 99, 235, 0.08);
|
||||
--mol-deploy-shimmer-to: rgba(37, 99, 235, 0.9);
|
||||
...
|
||||
}
|
||||
*/
|
||||
@ -39,6 +39,7 @@
|
||||
{"name": "free-beats-all", "repo": "Molecule-AI/molecule-ai-org-template-free-beats-all", "ref": "main"},
|
||||
{"name": "medo-smoke", "repo": "Molecule-AI/molecule-ai-org-template-medo-smoke", "ref": "main"},
|
||||
{"name": "molecule-worker-gemini", "repo": "Molecule-AI/molecule-ai-org-template-molecule-worker-gemini", "ref": "main"},
|
||||
{"name": "reno-stars", "repo": "Molecule-AI/molecule-ai-org-template-reno-stars", "ref": "main"}
|
||||
{"name": "reno-stars", "repo": "Molecule-AI/molecule-ai-org-template-reno-stars", "ref": "main"},
|
||||
{"name": "ux-ab-lab", "repo": "Molecule-AI/molecule-ai-org-template-ux-ab-lab", "ref": "main"}
|
||||
]
|
||||
}
|
||||
|
||||
93
tests/e2e/test_chat_attachments_e2e.sh
Executable file
93
tests/e2e/test_chat_attachments_e2e.sh
Executable file
@ -0,0 +1,93 @@
|
||||
#!/usr/bin/env bash
# E2E test: chat file attachment round-trip
#
# Proves the full drag-drop → agent-reads → agent-returns-file → download
# path against a live workspace. Runs against the local workspace-server
# on :8080 with a hermes workspace already online. The test is provider-
# agnostic as long as the agent has a valid API key — it only asserts
# that attachments surface on both ends, not a specific reply shape.
#
# Usage: WSID=<workspace-id> tests/e2e/test_chat_attachments_e2e.sh
#        (pass WSID for an existing hermes workspace)
#
# Prereqs:
#   - workspace-server on http://localhost:8080
#   - the WSID workspace is online, runtime=hermes
#   - a working provider key (MINIMAX_API_KEY / ANTHROPIC_API_KEY / etc.)
#   - /workspace writable by the agent user (some templates ship it
#     root-owned; chmod 777 for the E2E or use a writable template)

set -euo pipefail

WSID="${WSID:?WSID=<workspace-id> required}"
BASE="${BASE:-http://localhost:8080}"

log() { printf "\n=== %s ===\n" "$*"; }

log "Preflight: workspace online?"
STATUS=$(curl -s "$BASE/workspaces/$WSID" | python3 -c 'import json,sys;print(json.load(sys.stdin)["status"])')
[ "$STATUS" = "online" ] || { echo "workspace not online ($STATUS)"; exit 1; }

log "Step 1 — Upload a text file via /chat/uploads"
TEST_FILE=$(mktemp -t hermes-e2e-XXXXXX.txt)
# Fix: clean the temp file up on every exit path (pass, fail, ^C) —
# previously it leaked one file per run.
trap 'rm -f "$TEST_FILE"' EXIT
echo "secret code: $(openssl rand -hex 4)-$(openssl rand -hex 4)" > "$TEST_FILE"
# Fix: read the file directly instead of the useless `cat | awk`.
EXPECTED=$(awk '{print $NF}' "$TEST_FILE")
UPLOAD=$(curl -s -X POST "$BASE/workspaces/$WSID/chat/uploads" -F "files=@$TEST_FILE")
URI=$(echo "$UPLOAD" | python3 -c 'import json,sys;print(json.load(sys.stdin)["files"][0]["uri"])')
[ -n "$URI" ] || { echo "upload failed: $UPLOAD"; exit 1; }
echo "uploaded: $URI"

log "Step 2 — A2A message with file part; expect agent to quote the code"
# Build the JSON via a python helper so the URI value doesn't have to be
# shell-interpolated through a heredoc (the { } tokens in a JSON body
# collide with bash brace-expansion when quoted wrong).
PAYLOAD=$(URI="$URI" python3 -c '
import json, os
uri = os.environ["URI"]
print(json.dumps({
    "jsonrpc":"2.0","id":"e2e-up","method":"message/send",
    "params":{"message":{"role":"user","messageId":"e2e-up","kind":"message","parts":[
        {"kind":"text","text":"Read the attached file and tell me the exact secret code."},
        {"kind":"file","file":{"name":"test.txt","mimeType":"text/plain","uri":uri}},
    ]},"configuration":{"acceptedOutputModes":["text/plain"],"blocking":True}}}))
')
REPLY=$(curl -s -X POST "$BASE/workspaces/$WSID/a2a" \
  -H 'Content-Type: application/json' \
  --max-time 120 \
  -d "$PAYLOAD")
REPLY_TEXT=$(echo "$REPLY" | python3 -c 'import json,sys;d=json.load(sys.stdin);[print(p.get("text","")) for p in d["result"]["parts"] if p.get("kind")=="text"]')
echo "agent reply: $REPLY_TEXT"
if echo "$REPLY_TEXT" | grep -qF "$EXPECTED"; then
  echo "PASS: agent saw the attached file"
else
  echo "FAIL: agent reply missing expected code '$EXPECTED'"
  exit 1
fi

log "Step 3 — Seed a file inside /workspace and ask agent to reference it"
# Relies on /workspace being writable by the platform (we copy as root via
# docker exec, mimicking the path a real agent would use through its tools).
# Fix: `|| true` — under `set -euo pipefail` a no-match grep used to abort
# the script before the friendly "container not found" message could fire.
CONTAINER=$(docker ps --format '{{.Names}}' | grep -E "^ws-${WSID:0:12}" | head -1 || true)
[ -n "$CONTAINER" ] || { echo "container not found"; exit 1; }
docker exec "$CONTAINER" sh -c 'echo "E2E report body $(date -u +%s)" > /workspace/e2e-report.txt'

REPLY=$(curl -s -X POST "$BASE/workspaces/$WSID/a2a" \
  -H 'Content-Type: application/json' \
  --max-time 120 \
  -d '{"jsonrpc":"2.0","id":"e2e-down","method":"message/send","params":{"message":{"role":"user","messageId":"e2e-down","kind":"message","parts":[{"kind":"text","text":"There is a file at /workspace/e2e-report.txt. Mention its exact path in your reply so I can download it."}]},"configuration":{"acceptedOutputModes":["text/plain"],"blocking":true}}}')
FILE_URI=$(echo "$REPLY" | python3 -c 'import json,sys,re;d=json.load(sys.stdin);[print(p["file"]["uri"]) for p in d["result"]["parts"] if p.get("kind")=="file"]' | head -1)
[ -n "$FILE_URI" ] || { echo "FAIL: agent reply had no file part"; echo "$REPLY"; exit 1; }
echo "agent attached: $FILE_URI"

log "Step 4 — Download via /chat/download"
DL_PATH=${FILE_URI#workspace:}
# Fix: -G + --data-urlencode so paths containing spaces or non-ASCII
# survive the query string instead of producing a malformed URL.
BODY=$(curl -s -G --data-urlencode "path=$DL_PATH" "$BASE/workspaces/$WSID/chat/download")
echo "downloaded: $BODY"
if echo "$BODY" | grep -q "E2E report body"; then
  echo "PASS: downloaded the agent-returned file"
else
  echo "FAIL: download did not return expected body"
  exit 1
fi

log "ALL E2E CHECKS PASSED"
|
||||
149
tests/e2e/test_chat_attachments_multiruntime_e2e.sh
Executable file
149
tests/e2e/test_chat_attachments_multiruntime_e2e.sh
Executable file
@ -0,0 +1,149 @@
|
||||
#!/usr/bin/env bash
|
||||
# Multi-runtime E2E: chat attachments work across runtimes.
|
||||
#
|
||||
# The platform-level attachment helpers live in
|
||||
# molecule_runtime.executor_helpers. Every runtime's executor is
|
||||
# expected to call them. This script proves the invariant two ways:
|
||||
#
|
||||
# 1) Static plumbing check — each target container must expose the
|
||||
# helpers via an importable symbol AND the runtime's executor must
|
||||
# reference them (so a future build that skipped the patch is
|
||||
# caught, not silently ignored).
|
||||
#
|
||||
# 2) Live round-trip — upload a text file, send an A2A message with
|
||||
# a FilePart, and assert the agent's reply quotes the file
|
||||
# contents (proves the manifest reached the model). Skipped with
|
||||
# a PASS-NOTE when the runtime lacks valid provider credentials,
|
||||
# because a missing ANTHROPIC_API_KEY / CLAUDE_CODE_OAUTH_TOKEN
|
||||
# is infra, not platform plumbing.
|
||||
#
|
||||
# Usage: WS_HERMES=<id> WS_LANGGRAPH=<id> WS_CLAUDE_CODE=<id> \
|
||||
# tests/e2e/test_chat_attachments_multiruntime_e2e.sh
|
||||
|
||||
# No -e: individual runtime failures are tallied in $fails and reported
# at the end instead of aborting the whole sweep on the first failure.
set -uo pipefail
BASE="${BASE:-http://localhost:8080}"
# Global failure counter; incremented by check_runtime, drives exit code.
fails=0
|
||||
|
||||
# has_patch_in_container <container> — succeed iff the platform attachment
# helpers import cleanly inside the container's python3. Output is echoed
# either way so the caller's log shows which probe ran.
has_patch_in_container() {
  local container="$1"
  # Signal that platform helpers are available AND wired into the
  # runtime's executor. Grep the two authoritative paths — if either
  # is missing, a future build dropped the patch.
  docker exec "$container" python3 -c '
import sys
try:
    from molecule_runtime.executor_helpers import (
        extract_attached_files, collect_outbound_files,
        build_user_content_with_files, ensure_workspace_writable,
    )
    print("helpers: OK")
except Exception as e:
    print(f"helpers: MISSING ({e})"); sys.exit(1)
' 2>&1
}
|
||||
|
||||
# has_executor_patched <container> <runtime> — per-runtime static check
# that the executor actually *calls* the platform helpers (the import
# check alone can't prove wiring). Returns 1 on a missing patch.
has_executor_patched() {
  # For hermes: /app/executor.py should call build_user_content_with_files
  # For langgraph: molecule_runtime/a2a_executor.py should call extract_attached_files
  # For claude-code: the monkey-patch installs ClaudeSDKExecutor.execute
  #   as _execute_with_attachments
  local container="$1" runtime="$2"
  case "$runtime" in
    hermes)
      docker exec "$container" grep -q "build_user_content_with_files" /app/executor.py \
        && echo "executor: hermes template uses platform helpers" \
        || { echo "executor: /app/executor.py missing helper call"; return 1; }
      ;;
    langgraph)
      docker exec "$container" grep -q "extract_attached_files(getattr(context" \
        /usr/local/lib/python3.11/site-packages/molecule_runtime/a2a_executor.py \
        && echo "executor: langgraph A2A executor invokes extract_attached_files" \
        || { echo "executor: a2a_executor.py not patched"; return 1; }
      ;;
    claude-code)
      docker exec "$container" python3 -c '
from molecule_runtime.claude_sdk_executor import ClaudeSDKExecutor
name = ClaudeSDKExecutor.execute.__qualname__
assert name.endswith("_execute_with_attachments"), f"unpatched: {name}"
print(f"executor: claude-code monkey-patch active ({name})")
' 2>&1 || return 1
      ;;
  esac
  # NOTE(review): an unknown runtime falls through the case and returns 0
  # (treated as patched) — confirm that pass-by-default is intentional.
}
|
||||
|
||||
# round_trip <label> <wsid> — live check: upload a probe file, send it as
# an A2A FilePart, and look for its random secret in the reply. Tolerates
# (SKIPs) credential failures; only the static plumbing checks hard-fail.
round_trip() {
  local label="$1" wsid="$2"
  local test_file expected upload uri payload reply reply_text
  test_file=$(mktemp -t e2e-mr-XXXX.txt)
  # Random token so a cached/echoed reply from a previous run can't pass.
  expected="secret-$(openssl rand -hex 6)"
  echo "$expected" > "$test_file"
  upload=$(curl -s -X POST "$BASE/workspaces/$wsid/chat/uploads" -F "files=@$test_file")
  uri=$(echo "$upload" | python3 -c 'import json,sys;print(json.load(sys.stdin)["files"][0]["uri"])' 2>/dev/null)
  [ -z "$uri" ] && { echo "FAIL $label: upload returned no URI: $upload"; rm -f "$test_file"; return 1; }
  # Build the JSON in python so the URI never passes through shell quoting.
  payload=$(URI="$uri" python3 -c '
import json, os
uri = os.environ["URI"]
print(json.dumps({
    "jsonrpc":"2.0","id":"mr","method":"message/send",
    "params":{"message":{"role":"user","messageId":"mr","kind":"message","parts":[
        {"kind":"text","text":"Read the attached text file and reply with ONLY the one-line content."},
        {"kind":"file","file":{"name":"probe.txt","mimeType":"text/plain","uri":uri}},
    ]},"configuration":{"acceptedOutputModes":["text/plain"],"blocking":True}}}))')

  # Hit the platform proxy, with generous timeout — some runtimes warm on first call
  reply=$(curl -s -X POST "$BASE/workspaces/$wsid/a2a" \
    -H 'Content-Type: application/json' --max-time 120 -d "$payload")
  # Scrub control chars some runtimes leak into the JSON before parsing.
  reply_text=$(echo "$reply" | python3 -c '
import json, sys, re
try:
    data = re.sub(r"[\x00-\x08\x0b-\x1f]", " ", sys.stdin.read())
    d = json.loads(data)
    parts = d.get("result",{}).get("parts",[])
    print(" ".join(p.get("text","") for p in parts if p.get("kind")=="text"))
except Exception as exc:
    print(f"(parse failed: {exc})")
' 2>&1)
  rm -f "$test_file"

  if echo "$reply_text" | grep -qF "$expected"; then
    echo "PASS $label round-trip: agent quoted $expected"
    return 0
  fi
  # Credential-missing signatures we choose to tolerate (infra, not platform)
  if echo "$reply_text" | grep -qEi "could not resolve authentication|missing api|not logged in|hermes setup|no llm provider|401|\"type\": \"server_error\""; then
    echo "SKIP $label round-trip: agent lacks credentials (reply=$(echo "$reply_text" | head -c 120)...)"
    return 0
  fi
  echo "INFO $label round-trip: agent reply did not contain expected text"
  echo "  reply: $(echo "$reply_text" | head -c 200)"
  return 0 # Don't hard-fail; the plumbing check already asserted the platform layer
}
|
||||
|
||||
# check_runtime <label> <runtime> <wsid> — run the full suite for one
# runtime. Skips silently when no workspace id was provided; any failure
# increments the global $fails counter (this runs in the main shell, not
# a subshell, so the increment is visible to the final summary).
check_runtime() {
  local label="$1" runtime="$2" wsid="$3"
  [ -z "$wsid" ] && { echo "SKIP $label (no workspace id)"; return; }
  printf "\n======================== %s (%s) ========================\n" "$label" "$wsid"

  local status
  status=$(curl -s "$BASE/workspaces/$wsid" | python3 -c 'import json,sys;print(json.load(sys.stdin)["status"])')
  if [ "$status" != "online" ]; then
    echo "FAIL $label: workspace status=$status"
    fails=$((fails + 1)); return
  fi
  # Container names are prefixed with the first 12 chars of the wsid.
  local container
  container=$(docker ps --format '{{.Names}}' | grep -E "^ws-${wsid:0:12}" | head -1)
  [ -z "$container" ] && { echo "FAIL $label: container not found"; fails=$((fails + 1)); return; }

  has_patch_in_container "$container" || { echo "FAIL $label: platform helpers missing"; fails=$((fails + 1)); return; }
  has_executor_patched "$container" "$runtime" || { echo "FAIL $label: executor not patched"; fails=$((fails + 1)); return; }
  round_trip "$label" "$wsid" || { fails=$((fails + 1)); return; }
}
|
||||
|
||||
# Sweep every runtime we ship; unset WS_* ids are skipped, not failed.
check_runtime "hermes" "hermes" "${WS_HERMES:-}"
check_runtime "langgraph" "langgraph" "${WS_LANGGRAPH:-}"
check_runtime "claude-code" "claude-code" "${WS_CLAUDE_CODE:-}"

# Summary + exit code for CI: 0 only when every attempted runtime passed.
printf "\n=================================================\n"
if [ $fails -eq 0 ]; then echo "ALL RUNTIME E2E CHECKS PASSED"; exit 0; fi
echo "FAIL: $fails runtime check(s) failed"
exit 1
|
||||
190
workspace-server/cmd/server/dotenv.go
Normal file
190
workspace-server/cmd/server/dotenv.go
Normal file
@ -0,0 +1,190 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"log"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// loadDotEnvIfPresent walks upward from CWD looking for a .env file and
|
||||
// merges its KEY=VALUE pairs into the process environment. Already-set
|
||||
// vars (e.g. from `docker run -e`, CI exports, or ad-hoc `KEY=val
|
||||
// ./binary`) win over file values so operators can override without
|
||||
// editing the file.
|
||||
//
|
||||
// Why walk upward: the binary may be launched from the monorepo root,
|
||||
// the workspace-server subdir, or anywhere else the operator finds
|
||||
// convenient. Walking upward from CWD finds the canonical .env
|
||||
// (gitignored, lives at the monorepo root) regardless of cwd, so a
|
||||
// fresh `go build -o /tmp/molecule-server ./cmd/server && /tmp/molecule-server`
|
||||
// from any subdir picks up the same MOLECULE_ENV / DATABASE_URL / etc.
|
||||
// the operator already has — without sourcing or `set -a`.
|
||||
//
|
||||
// Why no godotenv dep: the format we use is simple — KEY=VALUE with
|
||||
// optional `#` comments and no interpolation — so a tiny in-tree parser
|
||||
// is auditable, has no supply-chain surface, and avoids drift across
|
||||
// repos where some teams configure godotenv differently.
|
||||
//
|
||||
// Why it's safe in production: the Dockerfile does not COPY .env into
|
||||
// the image and `.env` is gitignored, so production containers have no
|
||||
// .env on disk to load. If an operator goes out of their way to put one
|
||||
// there, the explicit-env-wins rule above means container env still
|
||||
// dominates.
|
||||
func loadDotEnvIfPresent() {
|
||||
path, ok := findDotEnv()
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
f, err := os.Open(path)
|
||||
if err != nil {
|
||||
log.Printf(".env: open %s: %v (skipping)", path, err)
|
||||
return
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
loaded := 0
|
||||
skipped := 0
|
||||
scanner := bufio.NewScanner(f)
|
||||
for scanner.Scan() {
|
||||
k, v, ok := parseDotEnvLine(scanner.Text())
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
// Existing env wins. NOTE: an explicitly-set empty string
|
||||
// (`KEY=` exported from a parent shell) counts as "set" — we
|
||||
// keep the empty value rather than backfilling from the file.
|
||||
// Matches Node's `process.env[k] !== undefined` check in the
|
||||
// canvas's next.config.ts loader so both processes treat the
|
||||
// same input identically. Operators who want the file value
|
||||
// to win must `unset KEY` in the launching shell.
|
||||
if _, exists := os.LookupEnv(k); exists {
|
||||
skipped++
|
||||
continue
|
||||
}
|
||||
if err := os.Setenv(k, v); err != nil {
|
||||
log.Printf(".env: set %s: %v", k, err)
|
||||
continue
|
||||
}
|
||||
loaded++
|
||||
}
|
||||
if err := scanner.Err(); err != nil {
|
||||
log.Printf(".env: scan %s: %v", path, err)
|
||||
}
|
||||
log.Printf(".env: %s — loaded %d, %d already set in env", path, loaded, skipped)
|
||||
}
|
||||
|
||||
// findDotEnv returns the path of the nearest .env file walking upward
|
||||
// from CWD. Capped at 6 levels so a deeply-nested launch dir doesn't
|
||||
// scan the entire filesystem.
|
||||
//
|
||||
// Sentinel gate: only accept a .env that sits next to `workspace-server/`
|
||||
// (the monorepo marker). Without it, a developer running the binary from
|
||||
// `~/Documents/other-project/` would walk up to `~/.env` and load
|
||||
// arbitrary variables — a real foot-gun on shared dev machines and a
|
||||
// possible information-leak vector on bare-metal deploys. Skipping the
|
||||
// match falls through to "no .env found" which is identical to today's
|
||||
// pre-fix behavior (the operator must export env explicitly).
|
||||
func findDotEnv() (string, bool) {
|
||||
dir, err := os.Getwd()
|
||||
if err != nil {
|
||||
return "", false
|
||||
}
|
||||
for i := 0; i < 6; i++ {
|
||||
p := filepath.Join(dir, ".env")
|
||||
if st, err := os.Stat(p); err == nil && !st.IsDir() {
|
||||
if isMonorepoRoot(dir) {
|
||||
return p, true
|
||||
}
|
||||
// .env exists here but the directory isn't the monorepo
|
||||
// root — keep walking. Loading it could clobber
|
||||
// environment with values from an unrelated project.
|
||||
}
|
||||
parent := filepath.Dir(dir)
|
||||
if parent == dir {
|
||||
break
|
||||
}
|
||||
dir = parent
|
||||
}
|
||||
return "", false
|
||||
}
|
||||
|
||||
// isMonorepoRoot returns true if `dir` looks like the molecule-core
|
||||
// monorepo root — the directory that owns the .env we want to load.
|
||||
// The marker is `workspace-server/go.mod`, which is the canonical
|
||||
// in-tree go module and exists only in this monorepo. A simple
|
||||
// `workspace-server/` directory check would false-positive on a fork
|
||||
// that renamed the dir; the go.mod check is more precise.
|
||||
func isMonorepoRoot(dir string) bool {
|
||||
st, err := os.Stat(filepath.Join(dir, "workspace-server", "go.mod"))
|
||||
return err == nil && !st.IsDir()
|
||||
}
|
||||
|
||||
// parseDotEnvLine parses a single .env line. Returns (key, value, true)
|
||||
// for KEY=VALUE pairs. Returns (_, _, false) for blanks, comments, and
|
||||
// malformed lines. Handles:
|
||||
// - leading `export ` prefix (so shell-friendly .env files written
|
||||
// for `source .env` or direnv work without modification)
|
||||
// - leading UTF-8 BOM on the first line (Windows editors)
|
||||
// - inline `# comment` after a value when preceded by whitespace
|
||||
// - surrounding `"` or `'` quotes on the value (stripped one matched
|
||||
// pair); inside a quoted value, `#` is part of the value, not a
|
||||
// comment marker
|
||||
func parseDotEnvLine(line string) (string, string, bool) {
	// A UTF-8 BOM (Windows editors) would otherwise become part of the
	// first key; strip it before anything else, then outer whitespace.
	trimmed := strings.TrimSpace(strings.TrimPrefix(line, "\ufeff"))
	if trimmed == "" || trimmed[0] == '#' {
		return "", "", false // blank line or full-line comment
	}
	// Shell-friendly files write `export FOO=bar`; drop the prefix
	// (space-separated form only) so it isn't folded into the key.
	trimmed = strings.TrimLeft(strings.TrimPrefix(trimmed, "export "), " \t")

	eq := strings.IndexByte(trimmed, '=')
	if eq <= 0 {
		return "", "", false // no '=' or missing key
	}
	key := strings.TrimSpace(trimmed[:eq])
	// Trim leading whitespace so a quoted value's opening quote lands at
	// index 0, and so `KEY=   # comment` classifies as empty-value + comment.
	val := strings.TrimLeft(trimmed[eq+1:], " \t")

	// Quoted value: one matched pair of surrounding quotes is stripped
	// and the contents are taken verbatim — this must run BEFORE inline
	// comment detection so `KEY="value # not a comment"` keeps its '#'.
	if len(val) >= 2 && (val[0] == '"' || val[0] == '\'') {
		if closing := strings.IndexByte(val[1:], val[0]); closing >= 0 {
			return key, val[1 : closing+1], true
		}
		// Unterminated quote: treat the opening quote as a literal
		// character and fall through to bare-value handling.
	}

	// Bare value: '#' starts a comment only at the start of the value or
	// after whitespace; `token#fragment` keeps its '#'.
	for idx := 0; idx < len(val); idx++ {
		if val[idx] != '#' {
			continue
		}
		if idx == 0 || val[idx-1] == ' ' || val[idx-1] == '\t' {
			val = val[:idx]
			break
		}
	}
	return key, strings.TrimSpace(val), true
}
|
||||
211
workspace-server/cmd/server/dotenv_test.go
Normal file
211
workspace-server/cmd/server/dotenv_test.go
Normal file
@ -0,0 +1,211 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// TestParseDotEnvLine is a table-driven regression suite over every
// parser edge: comments, export prefixes, quoting, inline comments,
// CRLF, and BOM handling. `comment` doubles as the subtest name.
func TestParseDotEnvLine(t *testing.T) {
	cases := []struct {
		in      string
		k, v    string
		ok      bool
		comment string
	}{
		{in: "", ok: false, comment: "empty line"},
		{in: " ", ok: false, comment: "whitespace-only"},
		{in: "# top-level comment", ok: false, comment: "full-line comment"},
		{in: " # indented comment", ok: false, comment: "indented full-line comment"},
		{in: "FOO", ok: false, comment: "no equals"},
		{in: "=BAR", ok: false, comment: "missing key"},

		{in: "FOO=bar", k: "FOO", v: "bar", ok: true, comment: "plain"},
		{in: " FOO=bar", k: "FOO", v: "bar", ok: true, comment: "leading whitespace"},
		{in: "FOO=bar ", k: "FOO", v: "bar", ok: true, comment: "trailing whitespace stripped"},
		{in: "FOO =bar", k: "FOO", v: "bar", ok: true, comment: "whitespace before equals"},

		{in: "FOO=bar # comment", k: "FOO", v: "bar", ok: true, comment: "inline space-hash comment"},
		{in: "FOO=bar\t# comment", k: "FOO", v: "bar", ok: true, comment: "inline tab-hash comment"},
		{in: "FOO=bar # lots of spaces", k: "FOO", v: "bar", ok: true, comment: "multiple spaces before hash"},

		{in: "FOO=bar#nocomment", k: "FOO", v: "bar#nocomment", ok: true, comment: "bare hash inside value preserved"},
		{in: "URL=postgres://u:p@h:5432/db?sslmode=disable", k: "URL", v: "postgres://u:p@h:5432/db?sslmode=disable", ok: true, comment: "url with embedded equals"},
		{in: "TOKEN=eyJhbGciOiJIUzI1NiJ9.payload.sig=", k: "TOKEN", v: "eyJhbGciOiJIUzI1NiJ9.payload.sig=", ok: true, comment: "base64 padding preserved"},

		{in: "FOO=", k: "FOO", v: "", ok: true, comment: "empty value"},
		{in: "ADMIN_TOKEN=", k: "ADMIN_TOKEN", v: "", ok: true, comment: "empty value (production gate sentinel)"},

		// Regression: the repo's own .env contains lines like
		// `CONFIGS_DIR= # Path to ...` where the value
		// is empty + an inline comment. Pre-fix parser stripped leading
		// whitespace BEFORE detecting the comment, leaving `#` at v[0]
		// with nothing preceding it, so the inline-comment check missed
		// it and the comment text was returned as the value. Server
		// then tried to use the comment as a directory path and template
		// loading silently failed (GET /templates returned []).
		{in: "CONFIGS_DIR= # Path to /var/foo (auto-discovered if empty)", k: "CONFIGS_DIR", v: "", ok: true, comment: "empty value with leading whitespace + inline comment"},
		{in: "FOO= # comment", k: "FOO", v: "", ok: true, comment: "spaces-only value with inline comment"},
		{in: "FOO=\t# comment", k: "FOO", v: "", ok: true, comment: "tab-only value with inline comment"},

		// `export` prefix: shell-friendly .env files (direnv, .envrc-style)
		// — the prefix must be stripped, NOT folded into the key.
		{in: "export FOO=bar", k: "FOO", v: "bar", ok: true, comment: "export prefix stripped"},
		{in: " export FOO=bar", k: "FOO", v: "bar", ok: true, comment: "leading whitespace + export"},
		{in: "export DATABASE_URL=postgres://u:p@h/db", k: "DATABASE_URL", v: "postgres://u:p@h/db", ok: true, comment: "export with URL value"},

		// Quoted values: one matched pair of surrounding quotes is
		// stripped; embedded `#` survives because it isn't an inline
		// comment inside a quote.
		{in: `FOO="hello world"`, k: "FOO", v: "hello world", ok: true, comment: "double-quoted value"},
		{in: `FOO='hello world'`, k: "FOO", v: "hello world", ok: true, comment: "single-quoted value"},
		{in: `FOO="value # not a comment"`, k: "FOO", v: "value # not a comment", ok: true, comment: "hash inside quotes is part of value"},
		{in: `FOO= "padded"`, k: "FOO", v: "padded", ok: true, comment: "whitespace before opening quote"},
		{in: `FOO="unterminated`, k: "FOO", v: `"unterminated`, ok: true, comment: "unterminated quote stays as bare value"},

		// CRLF endings: bufio.Scanner strips \n; \r is left and stripped
		// by the value-side TrimSpace. Locking this in so a future
		// refactor doesn't accidentally feed \r into os.Setenv.
		{in: "FOO=bar\r", k: "FOO", v: "bar", ok: true, comment: "CRLF trailing carriage return stripped"},

		// UTF-8 BOM at file start: a Windows-edited .env begins with
		// \xEF\xBB\xBF; without explicit stripping the first key would
		// be "\ufeffFOO".
		{in: "\ufeffFOO=bar", k: "FOO", v: "bar", ok: true, comment: "UTF-8 BOM stripped"},
	}

	for _, tc := range cases {
		t.Run(tc.comment, func(t *testing.T) {
			k, v, ok := parseDotEnvLine(tc.in)
			if ok != tc.ok {
				t.Fatalf("ok = %v, want %v (input=%q)", ok, tc.ok, tc.in)
			}
			if !tc.ok {
				return
			}
			if k != tc.k || v != tc.v {
				t.Fatalf("got (%q, %q), want (%q, %q)", k, v, tc.k, tc.v)
			}
		})
	}
}
|
||||
|
||||
// makeFakeMonorepo creates a temp dir that satisfies isMonorepoRoot()
|
||||
// (i.e., contains workspace-server/go.mod) plus a .env file with the
|
||||
// given body. Returns the dir so the caller can chdir into it.
|
||||
func makeFakeMonorepo(t *testing.T, envBody string) string {
	t.Helper()
	root := t.TempDir()
	// The monorepo marker findDotEnv/isMonorepoRoot look for.
	marker := filepath.Join(root, "workspace-server", "go.mod")
	if err := os.MkdirAll(filepath.Dir(marker), 0o755); err != nil {
		t.Fatal(err)
	}
	if err := os.WriteFile(marker, []byte("module fake\n"), 0o644); err != nil {
		t.Fatal(err)
	}
	if err := os.WriteFile(filepath.Join(root, ".env"), []byte(envBody), 0o644); err != nil {
		t.Fatalf("write .env: %v", err)
	}
	return root
}
|
||||
|
||||
func TestLoadDotEnvIfPresent_PreservesExisting(t *testing.T) {
|
||||
dir := makeFakeMonorepo(t, "DOTENV_TEST_NEW=from_file\nDOTENV_TEST_EXISTING=from_file\n")
|
||||
|
||||
// Pre-set one of the keys — file value must NOT clobber it.
|
||||
t.Setenv("DOTENV_TEST_EXISTING", "from_real_env")
|
||||
// Ensure the other key starts unset.
|
||||
os.Unsetenv("DOTENV_TEST_NEW")
|
||||
t.Cleanup(func() { os.Unsetenv("DOTENV_TEST_NEW") })
|
||||
|
||||
// Run from the temp dir so findDotEnv picks our fixture.
|
||||
prev, err := os.Getwd()
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.Chdir(dir); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
t.Cleanup(func() { _ = os.Chdir(prev) })
|
||||
|
||||
loadDotEnvIfPresent()
|
||||
|
||||
if got := os.Getenv("DOTENV_TEST_NEW"); got != "from_file" {
|
||||
t.Errorf("DOTENV_TEST_NEW = %q, want %q", got, "from_file")
|
||||
}
|
||||
if got := os.Getenv("DOTENV_TEST_EXISTING"); got != "from_real_env" {
|
||||
t.Errorf("existing env clobbered: got %q, want %q", got, "from_real_env")
|
||||
}
|
||||
}
|
||||
|
||||
func TestLoadDotEnvIfPresent_NoFile_NoOp(t *testing.T) {
|
||||
dir := t.TempDir() // empty — no .env at this level
|
||||
prev, err := os.Getwd()
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.Chdir(dir); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
t.Cleanup(func() { _ = os.Chdir(prev) })
|
||||
|
||||
// Should not panic, log loud errors, or set anything. Best-effort
|
||||
// silent miss is the contract.
|
||||
loadDotEnvIfPresent()
|
||||
}
|
||||
|
||||
func TestFindDotEnv_WalksUpward(t *testing.T) {
|
||||
root := makeFakeMonorepo(t, "X=1\n")
|
||||
nested := filepath.Join(root, "a", "b", "c")
|
||||
if err := os.MkdirAll(nested, 0o755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
prev, err := os.Getwd()
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.Chdir(nested); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
t.Cleanup(func() { _ = os.Chdir(prev) })
|
||||
|
||||
got, ok := findDotEnv()
|
||||
if !ok {
|
||||
t.Fatal("expected to find .env walking upward")
|
||||
}
|
||||
want := filepath.Join(root, ".env")
|
||||
// macOS resolves /var → /private/var on TempDir, so compare via
|
||||
// EvalSymlinks for both sides to dodge that.
|
||||
gotR, _ := filepath.EvalSymlinks(got)
|
||||
wantR, _ := filepath.EvalSymlinks(want)
|
||||
if gotR != wantR {
|
||||
t.Errorf("findDotEnv() = %q, want %q", got, want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFindDotEnv_RejectsUnrelatedDotEnv(t *testing.T) {
|
||||
// Simulates a developer running the binary from inside an
|
||||
// unrelated project tree that happens to have its own .env (or
|
||||
// from $HOME with a personal ~/.env). Without the monorepo
|
||||
// sentinel, findDotEnv would happily load it and clobber env
|
||||
// with arbitrary values — a real foot-gun this regression test
|
||||
// guards against.
|
||||
dir := t.TempDir()
|
||||
if err := os.WriteFile(filepath.Join(dir, ".env"), []byte("LEAKY=value\n"), 0o644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
prev, err := os.Getwd()
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.Chdir(dir); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
t.Cleanup(func() { _ = os.Chdir(prev) })
|
||||
|
||||
if got, ok := findDotEnv(); ok {
|
||||
t.Errorf("findDotEnv() = %q, ok=true; want ok=false (no workspace-server sibling)", got)
|
||||
}
|
||||
}
|
||||
@ -33,6 +33,14 @@ import (
|
||||
)
|
||||
|
||||
func main() {
|
||||
// .env auto-load: in dev, the operator keeps MOLECULE_ENV /
|
||||
// DATABASE_URL / etc. in the monorepo's .env file. Loading it here
|
||||
// — before any code reads env — means a fresh `/tmp/molecule-server`
|
||||
// run picks up dev config without `set -a && source .env`. No-op
|
||||
// in production (Docker image doesn't ship a .env, and existing env
|
||||
// always wins over file values, so container env stays dominant).
|
||||
loadDotEnvIfPresent()
|
||||
|
||||
// CP self-refresh: pull any operator-rotated config (e.g. a new
|
||||
// MOLECULE_CP_SHARED_SECRET) before any other code reads env.
|
||||
// Best-effort — if the CP is unreachable we keep booting with the
|
||||
@ -221,6 +229,18 @@ func main() {
|
||||
})
|
||||
}
|
||||
|
||||
// Orphan-container reconcile sweep — finds running containers
|
||||
// whose workspace row is already status='removed' and stops
|
||||
// them. Defence in depth on top of the inline cleanup in
|
||||
// handlers/workspace_crud.go: any Docker hiccup that left a
|
||||
// container alive after the user clicked delete heals on the
|
||||
// next sweep instead of leaking forever.
|
||||
if prov != nil {
|
||||
go supervised.RunWithRecover(ctx, "orphan-sweeper", func(c context.Context) {
|
||||
registry.StartOrphanSweeper(c, prov)
|
||||
})
|
||||
}
|
||||
|
||||
// Provision-timeout sweep — flips workspaces that have been stuck in
|
||||
// status='provisioning' past the timeout window to 'failed' and emits
|
||||
// WORKSPACE_PROVISION_TIMEOUT. Without this the UI banner is cosmetic
|
||||
|
||||
@ -20,6 +20,7 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/events"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/provisioner"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/registry"
|
||||
"github.com/gin-gonic/gin"
|
||||
@ -120,18 +121,26 @@ func isUpstreamBusyError(err error) bool {
|
||||
if err == nil {
|
||||
return false
|
||||
}
|
||||
// Typed sentinels propagate cleanly through *url.Error.Unwrap
|
||||
// since Go 1.13, so errors.Is is the primary check for both
|
||||
// DeadlineExceeded and Canceled. The substring fallbacks below
|
||||
// stay only for shapes net/http does NOT type — bare "EOF" /
|
||||
// "connection reset" can arrive as plain *net.OpError with no
|
||||
// errors.Is hook to the stdlib sentinels.
|
||||
if errors.Is(err, context.DeadlineExceeded) {
|
||||
return true
|
||||
}
|
||||
// applyIdleTimeout uses context.WithCancel; surfaces here as
|
||||
// Canceled, distinct from DeadlineExceeded but the same "upstream
|
||||
// busy" class — caller produces a 503 + Retry-After.
|
||||
if errors.Is(err, context.Canceled) {
|
||||
return true
|
||||
}
|
||||
if errors.Is(err, io.EOF) || errors.Is(err, io.ErrUnexpectedEOF) {
|
||||
return true
|
||||
}
|
||||
// url.Error wraps "read tcp … EOF" and "Post …: context deadline
|
||||
// exceeded" strings from the stdlib HTTP client without typing the
|
||||
// inner cause. Fall back to substring match for those.
|
||||
msg := err.Error()
|
||||
return strings.Contains(msg, "context deadline exceeded") ||
|
||||
strings.Contains(msg, "EOF") ||
|
||||
return strings.Contains(msg, "EOF") ||
|
||||
strings.Contains(msg, "connection reset")
|
||||
}
|
||||
|
||||
@ -286,7 +295,7 @@ func (h *WorkspaceHandler) proxyA2ARequest(ctx context.Context, workspaceID stri
|
||||
body = normalizedBody
|
||||
|
||||
startTime := time.Now()
|
||||
resp, cancelFwd, err := h.dispatchA2A(ctx, agentURL, body, callerID)
|
||||
resp, cancelFwd, err := h.dispatchA2A(ctx, workspaceID, agentURL, body, callerID)
|
||||
if cancelFwd != nil {
|
||||
defer cancelFwd()
|
||||
}
|
||||
@ -478,11 +487,34 @@ func normalizeA2APayload(body []byte) ([]byte, string, *proxyA2AError) {
|
||||
return marshaledBody, a2aMethod, nil
|
||||
}
|
||||
|
||||
// idleTimeoutDuration is the per-dispatch silence window: if the
// platform's broadcaster emits no events for this workspace for the
// full duration, the dispatch ctx is cancelled. Resets on every
// ACTIVITY_LOGGED / TASK_UPDATED / A2A_RESPONSE event for the
// workspace, so a chat that's actively reporting tool calls or
// streaming status updates never trips it. Picked to be longer than
// any reasonable single-tool-use cadence (Claude Code's slowest
// observed silence between tools is ~30s) but short enough that a
// truly wedged runtime fails in 1 minute, not 5.
//
// NOTE(review): consumed by applyIdleTimeout via dispatchA2A — retune
// here, not at the call site, so canvas and agent paths stay in sync.
const idleTimeoutDuration = 60 * time.Second
|
||||
|
||||
// dispatchA2A POSTs `body` to `agentURL`. Uses WithoutCancel so delegation
|
||||
// chains survive client disconnect (browser tab close). Default timeouts:
|
||||
// canvas (callerID == "") = 5 min, agent-to-agent = 30 min. Callers can
|
||||
// override via the X-Timeout header (applied to ctx upstream in ProxyA2A).
|
||||
func (h *WorkspaceHandler) dispatchA2A(ctx context.Context, agentURL string, body []byte, callerID string) (*http.Response, context.CancelFunc, error) {
|
||||
// chains survive client disconnect (browser tab close). Two layers of
|
||||
// timeout per dispatch:
|
||||
//
|
||||
// - Idle timeout (always applied): cancels the dispatch when no
|
||||
// broadcaster events for the workspace fire for
|
||||
// idleTimeoutDuration. Any progress event resets the clock — so
|
||||
// a long but actively-streaming reply runs forever, while a
|
||||
// wedged runtime fails fast.
|
||||
// - Absolute ceiling (agent-to-agent only): 30 min cap as a
|
||||
// defence against runaway delegation loops. Canvas dispatches
|
||||
// have no absolute ceiling — the user can wait as long as they
|
||||
// want, the idle timer is the only hangup signal.
|
||||
//
|
||||
// Either layer is overridable by the X-Timeout header upstream in
|
||||
// ProxyA2A; X-Timeout: 0 explicitly disables the absolute ceiling.
|
||||
func (h *WorkspaceHandler) dispatchA2A(ctx context.Context, workspaceID, agentURL string, body []byte, callerID string) (*http.Response, context.CancelFunc, error) {
|
||||
// #1483 SSRF defense-in-depth: the primary call path through
|
||||
// proxyA2ARequest → resolveAgentURL already validates via isSafeURL
|
||||
// (a2a_proxy.go:424), but adding the check here closes the gap for
|
||||
@ -494,19 +526,41 @@ func (h *WorkspaceHandler) dispatchA2A(ctx context.Context, agentURL string, bod
|
||||
return nil, nil, &proxyDispatchBuildError{err: err}
|
||||
}
|
||||
forwardCtx := context.WithoutCancel(ctx)
|
||||
var cancel context.CancelFunc
|
||||
var ceilingCancel context.CancelFunc
|
||||
if _, hasDeadline := ctx.Deadline(); !hasDeadline {
|
||||
if callerID == "" {
|
||||
forwardCtx, cancel = context.WithTimeout(forwardCtx, 5*time.Minute)
|
||||
} else {
|
||||
forwardCtx, cancel = context.WithTimeout(forwardCtx, 30*time.Minute)
|
||||
if callerID != "" {
|
||||
forwardCtx, ceilingCancel = context.WithTimeout(forwardCtx, 30*time.Minute)
|
||||
}
|
||||
// callerID == "" (canvas): no absolute ceiling. The idle
|
||||
// timeout below is the only deadline.
|
||||
}
|
||||
// Idle timeout — cancels the dispatch ctx after
|
||||
// idleTimeoutDuration of broadcaster silence for this workspace.
|
||||
// Always applied (canvas + agent-to-agent both benefit; the
|
||||
// ceiling above is a separate runaway-loop cap that only fires
|
||||
// for agent traffic). Combines with the ceiling cancel into a
|
||||
// single returned cancel func that the caller defers.
|
||||
// applyIdleTimeout needs SubscribeSSE which only lives on the
|
||||
// concrete *Broadcaster, not on the EventEmitter interface the
|
||||
// handler now stores. Type-assert + fall through to a no-op idle
|
||||
// timer if the broadcaster doesn't support subscriptions (the
|
||||
// EventEmitter mock used by some tests, e.g.). Production wires
|
||||
// the concrete *Broadcaster, so the assertion always succeeds in
|
||||
// real deploys.
|
||||
var b *events.Broadcaster
|
||||
if concrete, ok := h.broadcaster.(*events.Broadcaster); ok {
|
||||
b = concrete
|
||||
}
|
||||
forwardCtx, idleCancel := applyIdleTimeout(forwardCtx, b, workspaceID, idleTimeoutDuration)
|
||||
cancel := func() {
|
||||
idleCancel()
|
||||
if ceilingCancel != nil {
|
||||
ceilingCancel()
|
||||
}
|
||||
}
|
||||
req, err := http.NewRequestWithContext(forwardCtx, "POST", agentURL, bytes.NewReader(body))
|
||||
if err != nil {
|
||||
if cancel != nil {
|
||||
cancel()
|
||||
}
|
||||
cancel()
|
||||
// Wrap the construction failure so the caller can distinguish it
|
||||
// from an upstream Do() error and produce the correct 500 response.
|
||||
return nil, nil, &proxyDispatchBuildError{err: err}
|
||||
@ -515,3 +569,52 @@ func (h *WorkspaceHandler) dispatchA2A(ctx context.Context, agentURL string, bod
|
||||
resp, doErr := a2aClient.Do(req)
|
||||
return resp, cancel, doErr
|
||||
}
|
||||
|
||||
// applyIdleTimeout returns a child ctx that gets cancelled when no
|
||||
// broadcaster events for `workspaceID` arrive for `idle` duration.
|
||||
// Any incoming event resets the clock. The returned cancel func
|
||||
// MUST be called to clean up the goroutine + subscription.
|
||||
//
|
||||
// nil broadcaster or non-positive idle returns the parent ctx
|
||||
// unchanged (and a no-op cancel) so test paths that don't wire a
|
||||
// broadcaster keep working.
|
||||
func applyIdleTimeout(parent context.Context, b *events.Broadcaster, workspaceID string, idle time.Duration) (context.Context, context.CancelFunc) {
|
||||
if b == nil || idle <= 0 || workspaceID == "" {
|
||||
return parent, func() {}
|
||||
}
|
||||
ctx, cancel := context.WithCancel(parent)
|
||||
sub, unsub := b.SubscribeSSE(workspaceID)
|
||||
go func() {
|
||||
defer unsub()
|
||||
timer := time.NewTimer(idle)
|
||||
defer timer.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case _, ok := <-sub:
|
||||
if !ok {
|
||||
// Subscription channel closed — fall back to
|
||||
// pure-timer mode. Don't cancel: another caller
|
||||
// may have closed our sub but the request itself
|
||||
// is still in flight. Let the timer or the
|
||||
// caller's defer drive cleanup.
|
||||
continue
|
||||
}
|
||||
// Stop+drain pattern so a fired-but-unread timer
|
||||
// doesn't double-cancel after the Reset.
|
||||
if !timer.Stop() {
|
||||
select {
|
||||
case <-timer.C:
|
||||
default:
|
||||
}
|
||||
}
|
||||
timer.Reset(idle)
|
||||
case <-timer.C:
|
||||
cancel()
|
||||
return
|
||||
}
|
||||
}
|
||||
}()
|
||||
return ctx, cancel
|
||||
}
|
||||
|
||||
@ -5,6 +5,7 @@ import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
@ -600,9 +601,21 @@ func TestIsUpstreamBusyError(t *testing.T) {
|
||||
}{
|
||||
{"nil", nil, false},
|
||||
{"context.DeadlineExceeded", context.DeadlineExceeded, true},
|
||||
// applyIdleTimeout cancels its child ctx via context.WithCancel
|
||||
// when the broadcaster silence window elapses — surfaces here
|
||||
// as context.Canceled. Same "upstream busy" classification.
|
||||
{"context.Canceled", context.Canceled, true},
|
||||
{"wrapped context.Canceled", fmt.Errorf("dispatch wrapped: %w", context.Canceled), true},
|
||||
{"io.EOF", io.EOF, true},
|
||||
{"io.ErrUnexpectedEOF", io.ErrUnexpectedEOF, true},
|
||||
{"wrapped context deadline string", fmt.Errorf(`Post "http://ws-foo:8000": context deadline exceeded`), true},
|
||||
// Real net/http wraps context.DeadlineExceeded via *url.Error.Unwrap,
|
||||
// so errors.Is(err, context.DeadlineExceeded) catches it. The
|
||||
// pre-892de784 substring "context deadline exceeded" fallback
|
||||
// also accepted a string-only error like
|
||||
// `fmt.Errorf("Post: context deadline exceeded")`; that fallback
|
||||
// was dropped because errors.Is handles the real shape and the
|
||||
// substring was indistinguishable from a user-content match.
|
||||
{"wrapped context deadline (errors.Is path)", fmt.Errorf("Post: %w", context.DeadlineExceeded), true},
|
||||
{"wrapped EOF string", fmt.Errorf(`Post "http://ws-foo:8000": EOF`), true},
|
||||
{"connection reset", fmt.Errorf("read tcp 127.0.0.1:8080->127.0.0.1:12345: connection reset by peer"), true},
|
||||
{"generic dns error", fmt.Errorf("no such host"), false},
|
||||
@ -1074,7 +1087,7 @@ func TestDispatchA2A_BuildRequestError(t *testing.T) {
|
||||
handler := NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir())
|
||||
|
||||
// Malformed URL causes http.NewRequestWithContext to fail.
|
||||
_, cancel, err := handler.dispatchA2A(context.Background(), "http://%%badhost", []byte("{}"), "")
|
||||
_, cancel, err := handler.dispatchA2A(context.Background(), "ws-target", "http://%%badhost", []byte("{}"), "")
|
||||
if cancel != nil {
|
||||
cancel()
|
||||
}
|
||||
@ -1097,13 +1110,13 @@ func TestDispatchA2A_CanvasTimeout(t *testing.T) {
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
resp, cancel, err := handler.dispatchA2A(context.Background(), srv.URL, []byte(`{}`), "")
|
||||
resp, cancel, err := handler.dispatchA2A(context.Background(), "ws-target", srv.URL, []byte(`{}`), "")
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if cancel == nil {
|
||||
t.Fatal("canvas caller (empty callerID) must set a timeout + return cancel")
|
||||
t.Fatal("canvas caller must return a cancel func (idle-timeout cleanup)")
|
||||
}
|
||||
cancel() // restore
|
||||
}
|
||||
@ -1118,20 +1131,23 @@ func TestDispatchA2A_AgentTimeout(t *testing.T) {
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
resp, cancel, err := handler.dispatchA2A(context.Background(), srv.URL, []byte(`{}`), "ws-caller")
|
||||
resp, cancel, err := handler.dispatchA2A(context.Background(), "ws-target", srv.URL, []byte(`{}`), "ws-caller")
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if cancel == nil {
|
||||
t.Fatal("agent-to-agent caller must set a timeout + return cancel")
|
||||
t.Fatal("agent-to-agent caller must return a cancel func (idle + ceiling cleanup)")
|
||||
}
|
||||
cancel()
|
||||
}
|
||||
|
||||
func TestDispatchA2A_ContextDeadline_NoCancelAdded(t *testing.T) {
|
||||
// When ctx already has a deadline, dispatchA2A must NOT layer its own
|
||||
// timeout (cancel should be nil).
|
||||
func TestDispatchA2A_ContextDeadline_NoExtraCeiling(t *testing.T) {
|
||||
// When ctx already has a deadline, dispatchA2A must not layer
|
||||
// its own absolute ceiling on top — the caller's deadline wins.
|
||||
// The idle-timer cleanup still produces a non-nil cancel func
|
||||
// (introduced by the always-on idle timeout) but the cancel func
|
||||
// is safe to call repeatedly and from a deferred path.
|
||||
setupTestDB(t)
|
||||
setupTestRedis(t)
|
||||
handler := NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir())
|
||||
@ -1144,17 +1160,95 @@ func TestDispatchA2A_ContextDeadline_NoCancelAdded(t *testing.T) {
|
||||
ctx, ctxCancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
defer ctxCancel()
|
||||
|
||||
resp, cancel, err := handler.dispatchA2A(ctx, srv.URL, []byte(`{}`), "")
|
||||
resp, cancel, err := handler.dispatchA2A(ctx, "ws-target", srv.URL, []byte(`{}`), "")
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if cancel != nil {
|
||||
t.Error("cancel should be nil when ctx already has a deadline")
|
||||
cancel()
|
||||
if cancel == nil {
|
||||
t.Error("cancel must be non-nil (idle-timer cleanup)")
|
||||
}
|
||||
}
|
||||
|
||||
// --- applyIdleTimeout ---
|
||||
|
||||
// TestApplyIdleTimeout_FiresOnSilence verifies the helper cancels its
|
||||
// child ctx when no broadcaster events arrive for `idle` duration.
|
||||
// Uses a short idle window (60ms) so the test runs fast.
|
||||
func TestApplyIdleTimeout_FiresOnSilence(t *testing.T) {
|
||||
setupTestDB(t)
|
||||
setupTestRedis(t)
|
||||
b := newTestBroadcaster()
|
||||
|
||||
parent, parentCancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
defer parentCancel()
|
||||
|
||||
idleCtx, idleCancel := applyIdleTimeout(parent, b, "ws-silent", 60*time.Millisecond)
|
||||
defer idleCancel()
|
||||
|
||||
select {
|
||||
case <-idleCtx.Done():
|
||||
// expected — no events ever arrived for ws-silent
|
||||
case <-time.After(2 * time.Second):
|
||||
t.Fatal("idleCtx never cancelled despite no events")
|
||||
}
|
||||
if !errors.Is(idleCtx.Err(), context.Canceled) {
|
||||
t.Errorf("idleCtx err = %v, want context.Canceled", idleCtx.Err())
|
||||
}
|
||||
}
|
||||
|
||||
// TestApplyIdleTimeout_ResetsOnEvent verifies that a broadcaster event
|
||||
// for the workspace resets the timer. Sends one event mid-window and
|
||||
// confirms ctx is still alive after the original deadline would have
|
||||
// fired, but cancelled after a second silence window elapses.
|
||||
func TestApplyIdleTimeout_ResetsOnEvent(t *testing.T) {
|
||||
setupTestDB(t)
|
||||
setupTestRedis(t)
|
||||
b := newTestBroadcaster()
|
||||
|
||||
parent, parentCancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
defer parentCancel()
|
||||
|
||||
idle := 80 * time.Millisecond
|
||||
idleCtx, idleCancel := applyIdleTimeout(parent, b, "ws-active", idle)
|
||||
defer idleCancel()
|
||||
|
||||
// Send a progress event halfway through the window — should
|
||||
// extend the deadline by another `idle`.
|
||||
time.Sleep(idle / 2)
|
||||
b.BroadcastOnly("ws-active", "ACTIVITY_LOGGED", map[string]interface{}{"activity_type": "agent_log"})
|
||||
|
||||
// At t = idle (original deadline), ctx must still be alive
|
||||
// because the event reset the clock.
|
||||
select {
|
||||
case <-idleCtx.Done():
|
||||
t.Fatal("idleCtx cancelled despite mid-window event resetting the timer")
|
||||
case <-time.After(idle - (idle / 2) + 10*time.Millisecond):
|
||||
// ok — past the original deadline, still alive
|
||||
}
|
||||
|
||||
// Now wait for the second silence window to actually fire.
|
||||
select {
|
||||
case <-idleCtx.Done():
|
||||
// expected
|
||||
case <-time.After(idle + 200*time.Millisecond):
|
||||
t.Fatal("idleCtx never cancelled after the second silence window")
|
||||
}
|
||||
}
|
||||
|
||||
// TestApplyIdleTimeout_NilBroadcasterDegradesGracefully — nil
|
||||
// broadcaster (some test paths) returns the parent ctx unchanged.
|
||||
func TestApplyIdleTimeout_NilBroadcasterDegradesGracefully(t *testing.T) {
|
||||
parent := context.Background()
|
||||
idleCtx, cancel := applyIdleTimeout(parent, nil, "ws-x", 50*time.Millisecond)
|
||||
defer cancel()
|
||||
if idleCtx != parent {
|
||||
t.Error("nil broadcaster must return the parent ctx unchanged")
|
||||
}
|
||||
// And calling cancel must be safe.
|
||||
cancel()
|
||||
}
|
||||
|
||||
// TestDispatchA2A_RejectsUnsafeURL is the #1483 defense-in-depth
|
||||
// regression. setupTestDB disables SSRF for normal tests so existing
|
||||
// dispatchA2A unit tests can hit httptest.NewServer (loopback) — we
|
||||
@ -1162,6 +1256,10 @@ func TestDispatchA2A_ContextDeadline_NoCancelAdded(t *testing.T) {
|
||||
// Production callers go through resolveAgentURL which already
|
||||
// validates; this test pins that dispatchA2A is now safe even when
|
||||
// called directly by a future caller that skips resolveAgentURL.
|
||||
//
|
||||
// Note: dispatchA2A's signature includes workspaceID (added by the
|
||||
// idle-timeout work) so this test passes a stub value — the SSRF check
|
||||
// fires before workspaceID is referenced.
|
||||
func TestDispatchA2A_RejectsUnsafeURL(t *testing.T) {
|
||||
setupTestDB(t)
|
||||
setupTestRedis(t)
|
||||
@ -1172,6 +1270,7 @@ func TestDispatchA2A_RejectsUnsafeURL(t *testing.T) {
|
||||
// Cloud metadata IP — must be rejected before any HTTP call goes out.
|
||||
_, cancel, err := handler.dispatchA2A(
|
||||
context.Background(),
|
||||
"ws-target",
|
||||
"http://169.254.169.254/latest/meta-data/",
|
||||
[]byte(`{}`),
|
||||
"",
|
||||
@ -1188,6 +1287,7 @@ func TestDispatchA2A_RejectsUnsafeURL(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// --- handleA2ADispatchError ---
|
||||
|
||||
func TestHandleA2ADispatchError_ContextDeadline(t *testing.T) {
|
||||
|
||||
415
workspace-server/internal/handlers/chat_files.go
Normal file
415
workspace-server/internal/handlers/chat_files.go
Normal file
@ -0,0 +1,415 @@
|
||||
package handlers
|
||||
|
||||
// chat_files.go — file upload/download for workspace chat.
|
||||
//
|
||||
// Split from templates.go because these endpoints have a different
|
||||
// security model (no /configs write, no template fallback) and a
|
||||
// different wire format (multipart in, binary-stream out). Template
|
||||
// files are agent workspace configuration; chat files are user-agent
|
||||
// conversation payloads.
|
||||
|
||||
import (
|
||||
"archive/tar"
|
||||
"bytes"
|
||||
"context"
|
||||
"crypto/rand"
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
"io"
|
||||
"log"
|
||||
"mime"
|
||||
"mime/multipart"
|
||||
"net/http"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"strings"
|
||||
|
||||
"github.com/docker/docker/api/types/container"
|
||||
"github.com/gin-gonic/gin"
|
||||
)
|
||||
|
||||
// ChatFilesHandler serves file upload + download for chat. It
// composes the existing TemplatesHandler's Docker plumbing
// (findContainer, execInContainer, copyFilesToContainer) rather than
// duplicating them, so a bug fix in the Docker layer propagates to
// both endpoints.
type ChatFilesHandler struct {
	// templates supplies the shared Docker-layer helpers.
	// NOTE(review): the constructor does not guard against nil —
	// presumably production wiring always passes a real handler;
	// confirm at the call site.
	templates *TemplatesHandler
}

// NewChatFilesHandler wires a ChatFilesHandler over the given
// TemplatesHandler, whose Docker helpers it reuses.
func NewChatFilesHandler(t *TemplatesHandler) *ChatFilesHandler {
	return &ChatFilesHandler{templates: t}
}
|
||||
|
||||
// chatUploadMaxBytes caps the full multipart request body so a
// malicious / runaway client can't OOM the server. 50 MB covers most
// documents + a handful of images per message; larger artefacts
// should go through git/S3 rather than chat. Enforced in Upload via
// http.MaxBytesReader before the form is parsed.
const chatUploadMaxBytes = 50 * 1024 * 1024

// chatUploadMaxFileBytes caps individual files in a multi-file upload.
// Keeping the per-file cap below the total lets a user send, say, a
// 5 MB PDF + 10 screenshots without tripping the batch limit on any
// single attachment. Checked twice in Upload: against the declared
// header size and against the actually-read byte count.
const chatUploadMaxFileBytes = 25 * 1024 * 1024

// chatUploadDir is the in-container path where user-uploaded chat
// attachments land. Under /workspace so the file persists with the
// workspace volume and is readable by the agent without any extra
// plumbing — the agent just reads from the URI path we return.
const chatUploadDir = "/workspace/.molecule/chat-uploads"

// unsafeFilenameChars matches anything outside the conservative
// {alnum, dot, underscore, dash} set. Filenames get rewritten
// character-class at a time, so embedded paths, control chars,
// newlines, quotes, and shell metachars never reach the filesystem.
// Consumed by sanitizeFilename.
var unsafeFilenameChars = regexp.MustCompile(`[^a-zA-Z0-9._\-]`)
|
||||
|
||||
// contentDispositionAttachment produces a safe `attachment; filename=...`
|
||||
// header. Quotes, CR, and LF in the filename are escaped per RFC 6266 /
|
||||
// RFC 5987: control chars dropped, backslash and double-quote
|
||||
// backslash-escaped inside the quoted-string. Also emits the
|
||||
// percent-encoded filename* parameter so non-ASCII names survive.
|
||||
// This matters because agents can write arbitrary filenames into
|
||||
// /workspace, and anything they produce reaches this header via
|
||||
// `filepath.Base(path)` — not all agents sanitize on their side.
|
||||
func contentDispositionAttachment(name string) string {
|
||||
safeQ := make([]rune, 0, len(name))
|
||||
for _, r := range name {
|
||||
switch {
|
||||
case r == '\r' || r == '\n':
|
||||
// Drop — any CR/LF would terminate the header early.
|
||||
continue
|
||||
case r == '"' || r == '\\':
|
||||
// Escape per RFC 6266 §4.1 quoted-string.
|
||||
safeQ = append(safeQ, '\\', r)
|
||||
case r < 0x20 || r == 0x7f:
|
||||
// Drop other control chars.
|
||||
continue
|
||||
default:
|
||||
safeQ = append(safeQ, r)
|
||||
}
|
||||
}
|
||||
asciiSafe := string(safeQ)
|
||||
// filename= — double-quoted, escaped. Gives legacy clients a value.
|
||||
// filename*= — RFC 5987 percent-encoded UTF-8, preferred when present.
|
||||
return fmt.Sprintf(`attachment; filename="%s"; filename*=UTF-8''%s`,
|
||||
asciiSafe, urlPathEscape(name))
|
||||
}
|
||||
|
||||
// urlPathEscape percent-encodes every byte outside the RFC 3986
// unreserved set — stricter than net/url.PathEscape (which leaves
// "/" unescaped because it's legal in URL paths). Filenames must
// never contain "/" anyway, so escaping it is defence-in-depth
// against an agent that writes a path-like name.
func urlPathEscape(s string) string {
	// RFC 3986 §2.3 unreserved: ALPHA / DIGIT / "-" / "." / "_" / "~".
	isUnreserved := func(c byte) bool {
		switch {
		case c >= 'A' && c <= 'Z', c >= 'a' && c <= 'z', c >= '0' && c <= '9':
			return true
		case c == '-' || c == '.' || c == '_' || c == '~':
			return true
		}
		return false
	}
	var out strings.Builder
	for i := 0; i < len(s); i++ {
		if c := s[i]; isUnreserved(c) {
			out.WriteByte(c)
		} else {
			fmt.Fprintf(&out, "%%%02X", c)
		}
	}
	return out.String()
}
|
||||
|
||||
func sanitizeFilename(in string) string {
|
||||
base := filepath.Base(in)
|
||||
base = strings.ReplaceAll(base, " ", "_")
|
||||
base = unsafeFilenameChars.ReplaceAllString(base, "_")
|
||||
if len(base) > 100 {
|
||||
ext := filepath.Ext(base)
|
||||
if len(ext) > 16 {
|
||||
ext = ""
|
||||
}
|
||||
base = base[:100-len(ext)] + ext
|
||||
}
|
||||
if base == "" || base == "." || base == ".." {
|
||||
return "file"
|
||||
}
|
||||
return base
|
||||
}
|
||||
|
||||
// ChatUploadedFile is the per-file response returned from POST
// /workspaces/:id/chat/uploads. Clients include this payload (or a
// trimmed subset) in their outgoing A2A `message/send` parts.
type ChatUploadedFile struct {
	// URI uses a custom "workspace:" scheme so clients can resolve it
	// against the streaming Download endpoint regardless of where the
	// canvas itself is hosted. The path component is always absolute
	// within the workspace container.
	URI string `json:"uri"`
	// Name is the display filename. NOTE(review): presumably the
	// sanitizeFilename output rather than the raw client name —
	// confirm in Upload where this struct is populated.
	Name string `json:"name"`
	// MimeType is optional; empty values are omitted from the JSON.
	MimeType string `json:"mimeType,omitempty"`
	// Size is the stored byte count of the uploaded file.
	Size int64 `json:"size"`
}
|
||||
|
||||
// Upload handles POST /workspaces/:id/chat/uploads.
// Accepts multipart/form-data with one or more `files` fields, stages
// each under /workspace/.molecule/chat-uploads with a UUID prefix,
// and returns the list of URIs for the caller to attach to an A2A
// message.
//
// Responses: 400 (bad id / bad form / no files / unreadable part),
// 413 (per-file cap exceeded), 503 (container not running),
// 500 (ID allocation or copy failure), 200 with {"files": [...]}.
func (h *ChatFilesHandler) Upload(c *gin.Context) {
	workspaceID := c.Param("id")
	if err := validateWorkspaceID(workspaceID); err != nil {
		c.JSON(http.StatusBadRequest, gin.H{"error": "invalid workspace ID"})
		return
	}

	// Hard cap the request body BEFORE ParseMultipartForm — otherwise
	// a client could chunk-upload past the cap before Go notices.
	c.Request.Body = http.MaxBytesReader(c.Writer, c.Request.Body, chatUploadMaxBytes)
	if err := c.Request.ParseMultipartForm(chatUploadMaxBytes); err != nil {
		c.JSON(http.StatusBadRequest, gin.H{"error": "failed to parse multipart form"})
		return
	}

	// ParseMultipartForm populated this; both nil-checks guard a
	// request that carried no multipart body / no file parts at all.
	form := c.Request.MultipartForm
	var headers []*multipart.FileHeader
	if form != nil && form.File != nil {
		headers = form.File["files"]
	}
	if len(headers) == 0 {
		c.JSON(http.StatusBadRequest, gin.H{"error": "expected at least one 'files' field"})
		return
	}

	ctx := c.Request.Context()
	containerName := h.templates.findContainer(ctx, workspaceID)
	if containerName == "" {
		c.JSON(http.StatusServiceUnavailable, gin.H{"error": "workspace container not running"})
		return
	}

	// Build the archive in memory. Files are byte-preserving through
	// Go's string<->[]byte (the tar helper takes map[string]string but
	// the conversion is a literal copy, not a UTF-8 reinterpretation).
	archive := map[string]string{}
	uploaded := make([]ChatUploadedFile, 0, len(headers))
	for _, fh := range headers {
		// Fast reject on the declared size before reading anything.
		if fh.Size > chatUploadMaxFileBytes {
			c.JSON(http.StatusRequestEntityTooLarge, gin.H{
				"error": fmt.Sprintf("%s exceeds per-file limit (%d MB)", fh.Filename, chatUploadMaxFileBytes/(1024*1024)),
			})
			return
		}
		f, err := fh.Open()
		if err != nil {
			c.JSON(http.StatusBadRequest, gin.H{"error": "failed to read upload"})
			return
		}
		// LimitReader guards against a truthful-but-lying Size header:
		// if the multipart stream carries more bytes than declared, we
		// stop at the cap instead of growing the buffer.
		data, err := io.ReadAll(io.LimitReader(f, chatUploadMaxFileBytes+1))
		f.Close()
		if err != nil {
			c.JSON(http.StatusBadRequest, gin.H{"error": "failed to read upload"})
			return
		}
		// The +1 sentinel above: reading more than the cap means the
		// declared Size lied — reject instead of silently truncating.
		if int64(len(data)) > chatUploadMaxFileBytes {
			c.JSON(http.StatusRequestEntityTooLarge, gin.H{
				"error": fmt.Sprintf("%s exceeds per-file limit (%d MB)", fh.Filename, chatUploadMaxFileBytes/(1024*1024)),
			})
			return
		}

		// Client-supplied name is sanitized before it becomes part of
		// a container path (tests pin traversal stripping and shell
		// metacharacter replacement).
		name := sanitizeFilename(fh.Filename)
		// 16-byte (UUID-equivalent) random prefix. Within a single
		// batch we also check for collisions — birthday on 128 bits
		// is astronomical, but a bad PRNG or single re-used draw
		// would silently overwrite a sibling upload with its own
		// content and return two URIs pointing at one file.
		var stored string
		for attempt := 0; attempt < 4; attempt++ {
			idBytes := make([]byte, 16)
			if _, err := rand.Read(idBytes); err != nil {
				c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to allocate upload ID"})
				return
			}
			candidate := hex.EncodeToString(idBytes) + "-" + name
			if _, taken := archive[candidate]; !taken {
				stored = candidate
				break
			}
		}
		// All four draws collided — only reachable with a broken RNG.
		if stored == "" {
			c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to allocate unique upload ID"})
			return
		}
		archive[stored] = string(data)

		// Prefer the client-declared content type; fall back to
		// extension-based inference on the sanitized name.
		mt := fh.Header.Get("Content-Type")
		if mt == "" {
			mt = mime.TypeByExtension(filepath.Ext(name))
		}
		uploaded = append(uploaded, ChatUploadedFile{
			URI:      "workspace:" + chatUploadDir + "/" + stored,
			Name:     name,
			MimeType: mt,
			Size:     int64(len(data)),
		})
	}

	// mkdir -p is idempotent; we fire it every upload instead of
	// caching state here so container restarts don't surprise us.
	_, _ = h.templates.execInContainer(ctx, containerName, []string{"mkdir", "-p", chatUploadDir})

	// Defence in depth: pre-remove each target path before extracting
	// the tar. An agent with write access to /workspace could in
	// theory race-create a symlink at <chatUploadDir>/<stored-name>
	// pointing at a sensitive in-container path (its own /etc/*,
	// mounted secrets). Docker's tar extraction on some drivers
	// follows pre-existing symlinks at the destination. `rm -f` the
	// exact stored-name closes that window — the UUID prefix on the
	// name makes a successful race effectively impossible, but this
	// guard costs nothing and documents the intent.
	rmArgs := []string{"rm", "-f", "--"}
	for stored := range archive {
		rmArgs = append(rmArgs, chatUploadDir+"/"+stored)
	}
	_, _ = h.templates.execInContainer(ctx, containerName, rmArgs)

	if err := h.copyFlatToContainer(ctx, containerName, chatUploadDir, archive); err != nil {
		log.Printf("Chat upload copy failed for %s: %v", workspaceID, err)
		c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to stage files in workspace"})
		return
	}

	c.JSON(http.StatusOK, gin.H{"files": uploaded})
}
|
||||
|
||||
// copyFlatToContainer extracts one tar of flat files into destPath
// inside the container. Unlike the shared copyFilesToContainer helper
// (which prepends destPath into tar entry names — correct for its
// callers whose files relative-live inside a nested tree), this
// helper writes tar entries with ONLY the flat filename so Docker's
// extraction at destPath lands them directly in destPath, not at
// destPath/destPath/... as the shared helper would.
// Filenames are validated to contain no path separator so nothing
// can escape destPath via an embedded "../" or a leading "/".
//
// The archive is built entirely in memory; callers are expected to
// bound file sizes (Upload enforces its caps before calling here).
func (h *ChatFilesHandler) copyFlatToContainer(ctx context.Context, containerName, destPath string, files map[string]string) error {
	if h.templates.docker == nil {
		return fmt.Errorf("docker not available")
	}
	var buf bytes.Buffer
	tw := tar.NewWriter(&buf)
	for name, content := range files {
		// Reject separators (either flavour), dot names, and empty —
		// everything that could resolve outside destPath.
		if strings.ContainsAny(name, "/\\") || name == ".." || name == "." || name == "" {
			return fmt.Errorf("unsafe flat filename: %q", name)
		}
		data := []byte(content)
		if err := tw.WriteHeader(&tar.Header{
			Name:     name, // relative — Docker resolves against destPath
			Mode:     0644,
			Size:     int64(len(data)),
			Typeflag: tar.TypeReg,
		}); err != nil {
			return fmt.Errorf("tar header %q: %w", name, err)
		}
		if _, err := tw.Write(data); err != nil {
			return fmt.Errorf("tar write %q: %w", name, err)
		}
	}
	// Close flushes the tar footer; skipping it would truncate the
	// archive Docker receives.
	if err := tw.Close(); err != nil {
		return fmt.Errorf("tar close: %w", err)
	}
	return h.templates.docker.CopyToContainer(ctx, containerName, destPath, &buf, container.CopyToContainerOptions{})
}
|
||||
|
||||
// Download handles GET /workspaces/:id/chat/download?path=<abs path>.
// Streams the file bytes from the container with a correct
// Content-Type and attachment Content-Disposition. Binary-safe —
// unlike the existing JSON ReadFile endpoint which carries content
// as a string (lossy for non-UTF-8 bytes).
//
// Responses: 400 (bad id / bad path / not a regular file), 404
// (docker cp failed), 503 (docker unavailable / container down),
// 500 (unreadable archive), 200 streaming the file body.
func (h *ChatFilesHandler) Download(c *gin.Context) {
	workspaceID := c.Param("id")
	if err := validateWorkspaceID(workspaceID); err != nil {
		c.JSON(http.StatusBadRequest, gin.H{"error": "invalid workspace ID"})
		return
	}

	path := c.Query("path")
	if path == "" {
		c.JSON(http.StatusBadRequest, gin.H{"error": "path query required"})
		return
	}
	if !filepath.IsAbs(path) {
		c.JSON(http.StatusBadRequest, gin.H{"error": "path must be absolute"})
		return
	}
	// Path must land under one of the allowed roots — mirrors the
	// ReadFile security model and prevents arbitrary reads of /etc
	// or other system paths via this endpoint.
	rooted := false
	for root := range allowedRoots {
		// Exact root or a child of it; the "/" suffix stops
		// /workspaceX from matching root /workspace.
		if path == root || strings.HasPrefix(path, root+"/") {
			rooted = true
			break
		}
	}
	if !rooted {
		c.JSON(http.StatusBadRequest, gin.H{"error": "path must be under /configs, /workspace, /home, or /plugins"})
		return
	}
	// Reject anything that canonicalises differently or contains a
	// traversal segment. Defence-in-depth on top of the prefix check.
	// (Note: this also rejects benign names containing ".." such as
	// "a..b" — deliberate strictness.)
	if filepath.Clean(path) != path || strings.Contains(path, "..") {
		c.JSON(http.StatusBadRequest, gin.H{"error": "invalid path"})
		return
	}

	ctx := c.Request.Context()
	if h.templates.docker == nil {
		c.JSON(http.StatusServiceUnavailable, gin.H{"error": "docker unavailable"})
		return
	}
	containerName := h.templates.findContainer(ctx, workspaceID)
	if containerName == "" {
		c.JSON(http.StatusServiceUnavailable, gin.H{"error": "workspace container not running"})
		return
	}

	// docker cp returns a tar stream containing the requested path.
	// For a regular file that's a single tar entry; we extract and
	// stream the body through.
	reader, _, err := h.templates.docker.CopyFromContainer(ctx, containerName, path)
	if err != nil {
		c.JSON(http.StatusNotFound, gin.H{"error": "file not found"})
		return
	}
	defer reader.Close()

	tr := tar.NewReader(reader)
	// First (and only expected) entry. Any error here — including an
	// immediate EOF on an empty archive — is surfaced as a 500.
	hdr, err := tr.Next()
	if err != nil {
		c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to read archive"})
		return
	}
	if hdr.Typeflag != tar.TypeReg {
		c.JSON(http.StatusBadRequest, gin.H{"error": "path is not a regular file"})
		return
	}

	name := filepath.Base(path)
	mt := mime.TypeByExtension(filepath.Ext(name))
	if mt == "" {
		mt = "application/octet-stream"
	}
	c.Header("Content-Type", mt)
	c.Header("Content-Length", fmt.Sprintf("%d", hdr.Size))
	c.Header("Content-Disposition", contentDispositionAttachment(name))
	c.Status(http.StatusOK)

	// Stream exactly hdr.Size bytes. CopyN was chosen over LimitReader
	// because it returns an error when the source is short — that
	// surfaces a bug in the tar extraction path immediately instead
	// of silently truncating. Agents can legitimately produce files
	// larger than the 50 MB upload cap (that's a per-request inbound
	// cap, not a per-artifact one), so we cannot clamp here.
	// Headers are already sent, so a failure here can only be logged.
	if _, err := io.CopyN(c.Writer, tr, hdr.Size); err != nil {
		log.Printf("Chat download stream error for %s (%s): %v", workspaceID, path, err)
	}
}
|
||||
194
workspace-server/internal/handlers/chat_files_test.go
Normal file
194
workspace-server/internal/handlers/chat_files_test.go
Normal file
@ -0,0 +1,194 @@
|
||||
package handlers
|
||||
|
||||
// Unit tests for chat_files.go. The Docker-touching paths (Upload
|
||||
// actually copying into a container, Download actually streaming tar)
|
||||
// are exercised via integration tests — docker-in-docker is out of
|
||||
// scope for the unit suite. These tests cover the validation + error
|
||||
// surfaces that a caller can reach without a running container.
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"mime/multipart"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/gin-gonic/gin"
|
||||
)
|
||||
|
||||
func TestSanitizeFilename(t *testing.T) {
|
||||
cases := []struct {
|
||||
in, want string
|
||||
}{
|
||||
{"report.pdf", "report.pdf"},
|
||||
{"my file.pdf", "my_file.pdf"},
|
||||
{"../../etc/passwd", "passwd"},
|
||||
{"weird;$name`.txt", "weird__name_.txt"},
|
||||
{"", "file"},
|
||||
{".", "file"},
|
||||
{"..", "file"},
|
||||
}
|
||||
for _, tc := range cases {
|
||||
got := sanitizeFilename(tc.in)
|
||||
if got != tc.want {
|
||||
t.Errorf("sanitizeFilename(%q) = %q, want %q", tc.in, got, tc.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestSanitizeFilename_LongNamePreservesExtension(t *testing.T) {
|
||||
// 120-char base + .pdf — the helper should truncate the base but
|
||||
// keep the extension intact so content-type inference still works.
|
||||
longBase := strings.Repeat("a", 120)
|
||||
got := sanitizeFilename(longBase + ".pdf")
|
||||
if len(got) > 100 {
|
||||
t.Errorf("filename not truncated: len=%d", len(got))
|
||||
}
|
||||
if !strings.HasSuffix(got, ".pdf") {
|
||||
t.Errorf("extension stripped: %q", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestChatUpload_InvalidWorkspaceID(t *testing.T) {
|
||||
setupTestDB(t)
|
||||
setupTestRedis(t)
|
||||
|
||||
tmplh := NewTemplatesHandler(t.TempDir(), nil)
|
||||
h := NewChatFilesHandler(tmplh)
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
c.Params = gin.Params{{Key: "id", Value: "not-a-uuid"}}
|
||||
c.Request = httptest.NewRequest("POST", "/workspaces/not-a-uuid/chat/uploads", nil)
|
||||
|
||||
h.Upload(c)
|
||||
|
||||
if w.Code != http.StatusBadRequest {
|
||||
t.Errorf("expected 400 on invalid workspace id, got %d: %s", w.Code, w.Body.String())
|
||||
}
|
||||
}
|
||||
|
||||
func TestChatUpload_MissingFiles(t *testing.T) {
|
||||
setupTestDB(t)
|
||||
setupTestRedis(t)
|
||||
|
||||
tmplh := NewTemplatesHandler(t.TempDir(), nil)
|
||||
h := NewChatFilesHandler(tmplh)
|
||||
|
||||
// Multipart body with no `files` field — only a text field.
|
||||
var buf bytes.Buffer
|
||||
mw := multipart.NewWriter(&buf)
|
||||
_ = mw.WriteField("other", "value")
|
||||
mw.Close()
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
c.Params = gin.Params{{Key: "id", Value: "00000000-0000-0000-0000-000000000001"}}
|
||||
req := httptest.NewRequest("POST", "/workspaces/00000000-0000-0000-0000-000000000001/chat/uploads", &buf)
|
||||
req.Header.Set("Content-Type", mw.FormDataContentType())
|
||||
c.Request = req
|
||||
|
||||
h.Upload(c)
|
||||
|
||||
if w.Code != http.StatusBadRequest {
|
||||
t.Errorf("expected 400 when files field missing, got %d: %s", w.Code, w.Body.String())
|
||||
}
|
||||
if !strings.Contains(w.Body.String(), "files") {
|
||||
t.Errorf("expected error to mention files field: %s", w.Body.String())
|
||||
}
|
||||
}
|
||||
|
||||
func TestChatDownload_InvalidPath(t *testing.T) {
|
||||
setupTestDB(t)
|
||||
setupTestRedis(t)
|
||||
|
||||
tmplh := NewTemplatesHandler(t.TempDir(), nil)
|
||||
h := NewChatFilesHandler(tmplh)
|
||||
|
||||
cases := []struct {
|
||||
name, path, wantSubstr string
|
||||
}{
|
||||
{"empty", "", "path query required"},
|
||||
{"relative", "workspace/foo.txt", "must be absolute"},
|
||||
{"wrong root", "/etc/passwd", "must be under"},
|
||||
{"traversal", "/workspace/../etc/passwd", "invalid path"},
|
||||
}
|
||||
for _, tc := range cases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
c.Params = gin.Params{{Key: "id", Value: "00000000-0000-0000-0000-000000000001"}}
|
||||
req := httptest.NewRequest("GET", "/workspaces/xxx/chat/download?path="+tc.path, nil)
|
||||
c.Request = req
|
||||
|
||||
h.Download(c)
|
||||
|
||||
if w.Code != http.StatusBadRequest {
|
||||
t.Errorf("expected 400 for %s, got %d: %s", tc.name, w.Code, w.Body.String())
|
||||
}
|
||||
if !strings.Contains(w.Body.String(), tc.wantSubstr) {
|
||||
t.Errorf("expected error to contain %q, got: %s", tc.wantSubstr, w.Body.String())
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestContentDispositionAttachment_Escapes(t *testing.T) {
|
||||
cases := []struct {
|
||||
name, input, wantSubstr string
|
||||
}{
|
||||
{
|
||||
name: "plain ASCII passes through",
|
||||
input: "report.pdf",
|
||||
wantSubstr: `filename="report.pdf"`,
|
||||
},
|
||||
{
|
||||
name: "double-quote is backslash-escaped",
|
||||
input: `weird".pdf`,
|
||||
wantSubstr: `filename="weird\".pdf"`,
|
||||
},
|
||||
{
|
||||
name: "CR and LF dropped to prevent header injection",
|
||||
input: "bad\r\nX-Leak: 1\r\n.txt",
|
||||
wantSubstr: `filename="badX-Leak: 1.txt"`,
|
||||
},
|
||||
{
|
||||
name: "non-ASCII emits filename* percent-encoded",
|
||||
input: "résumé.pdf",
|
||||
wantSubstr: "filename*=UTF-8''r%C3%A9sum%C3%A9.pdf",
|
||||
},
|
||||
}
|
||||
for _, tc := range cases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
got := contentDispositionAttachment(tc.input)
|
||||
if !strings.Contains(got, tc.wantSubstr) {
|
||||
t.Errorf("contentDispositionAttachment(%q) = %q, missing substring %q", tc.input, got, tc.wantSubstr)
|
||||
}
|
||||
// Must never contain a bare CR or LF — either would end the header.
|
||||
if strings.ContainsAny(got, "\r\n") {
|
||||
t.Errorf("header contains CR/LF: %q", got)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestChatDownload_DockerUnavailable(t *testing.T) {
|
||||
setupTestDB(t)
|
||||
setupTestRedis(t)
|
||||
|
||||
tmplh := NewTemplatesHandler(t.TempDir(), nil) // docker=nil
|
||||
h := NewChatFilesHandler(tmplh)
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
c.Params = gin.Params{{Key: "id", Value: "00000000-0000-0000-0000-000000000001"}}
|
||||
req := httptest.NewRequest("GET", "/workspaces/xxx/chat/download?path=/workspace/report.pdf", nil)
|
||||
c.Request = req
|
||||
|
||||
h.Download(c)
|
||||
|
||||
if w.Code != http.StatusServiceUnavailable {
|
||||
t.Errorf("expected 503 when docker is nil, got %d: %s", w.Code, w.Body.String())
|
||||
}
|
||||
}
|
||||
@ -5,6 +5,7 @@ package handlers
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"log"
|
||||
"net/http"
|
||||
@ -180,6 +181,108 @@ func NewOrgHandler(wh *WorkspaceHandler, b *events.Broadcaster, p *provisioner.P
|
||||
}
|
||||
}
|
||||
|
||||
// EnvRequirement is either a single env var name (strict: that exact
// var must be configured) or an any-of group (any one of the listed
// names satisfies the requirement).
//
// YAML shapes accepted:
//
//	required_env:
//	  - GITHUB_TOKEN                                          # single
//	  - any_of: [ANTHROPIC_API_KEY, CLAUDE_CODE_OAUTH_TOKEN]  # OR group
//
// The any-of form exists because some runtimes accept either of two
// credential shapes — Claude Code takes ANTHROPIC_API_KEY or an OAuth
// token interchangeably, and forcing an org template to pick one
// would falsely block the other. For JSON (GET /org/templates),
// the same shapes round-trip: strings stay strings, groups stay
// {any_of: [...]}.
//
// Invariant: exactly one of Name / AnyOf is populated after a
// successful unmarshal; a zero value (both empty) has no members and
// IsSatisfied never accepts it.
type EnvRequirement struct {
	// Name is non-empty for a single required env var.
	Name string
	// AnyOf is non-empty for an OR group; any one member satisfies.
	AnyOf []string
}
|
||||
|
||||
// Members returns every env name this requirement considers —
|
||||
// [Name] for single, AnyOf for groups. Used by preflight, collect,
|
||||
// and the name-validation regex gate.
|
||||
func (e EnvRequirement) Members() []string {
|
||||
if e.Name != "" {
|
||||
return []string{e.Name}
|
||||
}
|
||||
return e.AnyOf
|
||||
}
|
||||
|
||||
// IsSatisfied reports whether any member of the requirement is
|
||||
// present in `configured`. Single: exact-match. AnyOf: at least
|
||||
// one hit.
|
||||
func (e EnvRequirement) IsSatisfied(configured map[string]struct{}) bool {
|
||||
for _, m := range e.Members() {
|
||||
if _, ok := configured[m]; ok {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// UnmarshalYAML accepts either a scalar (string → single) or a map
|
||||
// with an `any_of` list (→ group).
|
||||
func (e *EnvRequirement) UnmarshalYAML(value *yaml.Node) error {
|
||||
if value.Kind == yaml.ScalarNode {
|
||||
var s string
|
||||
if err := value.Decode(&s); err != nil {
|
||||
return err
|
||||
}
|
||||
e.Name = s
|
||||
return nil
|
||||
}
|
||||
var alt struct {
|
||||
AnyOf []string `yaml:"any_of"`
|
||||
}
|
||||
if err := value.Decode(&alt); err != nil {
|
||||
return fmt.Errorf("env requirement must be a string or {any_of: [...]}: %w", err)
|
||||
}
|
||||
if len(alt.AnyOf) == 0 {
|
||||
return fmt.Errorf("env requirement any_of must contain at least one env var")
|
||||
}
|
||||
e.AnyOf = alt.AnyOf
|
||||
return nil
|
||||
}
|
||||
|
||||
// MarshalJSON emits the dual shape so GET /org/templates callers get
|
||||
// {"required_env": ["GITHUB_TOKEN", {"any_of": [...]}]}, matching
|
||||
// the YAML syntax.
|
||||
func (e EnvRequirement) MarshalJSON() ([]byte, error) {
|
||||
if e.Name != "" {
|
||||
return json.Marshal(e.Name)
|
||||
}
|
||||
return json.Marshal(struct {
|
||||
AnyOf []string `json:"any_of"`
|
||||
}{AnyOf: e.AnyOf})
|
||||
}
|
||||
|
||||
// UnmarshalJSON is the inverse — accepts the same dual shape so
|
||||
// POST /org/import with an inline `template` body works too.
|
||||
func (e *EnvRequirement) UnmarshalJSON(data []byte) error {
|
||||
var s string
|
||||
if err := json.Unmarshal(data, &s); err == nil {
|
||||
e.Name = s
|
||||
return nil
|
||||
}
|
||||
var alt struct {
|
||||
AnyOf []string `json:"any_of"`
|
||||
}
|
||||
if err := json.Unmarshal(data, &alt); err != nil {
|
||||
return fmt.Errorf("env requirement must be a string or {any_of: [...]}: %w", err)
|
||||
}
|
||||
if len(alt.AnyOf) == 0 {
|
||||
return fmt.Errorf("env requirement any_of must contain at least one env var")
|
||||
}
|
||||
e.AnyOf = alt.AnyOf
|
||||
return nil
|
||||
}
|
||||
|
||||
// OrgTemplate is the YAML structure for an org hierarchy.
|
||||
type OrgTemplate struct {
|
||||
Name string `yaml:"name" json:"name"`
|
||||
@ -189,6 +292,18 @@ type OrgTemplate struct {
|
||||
// GlobalMemories is a list of org-wide memories seeded as GLOBAL scope
|
||||
// on the first root workspace (PM) during org import. Issue #1050.
|
||||
GlobalMemories []models.MemorySeed `yaml:"global_memories" json:"global_memories"`
|
||||
// RequiredEnv lists env vars that MUST be configured globally (or
|
||||
// on every workspace in the subtree that needs them) before import
|
||||
// succeeds. Each entry is either a plain string (strict) or an
|
||||
// {any_of: [...]} group (at least one member must be set). Declared
|
||||
// at the org level for shared creds; also extensible per-workspace
|
||||
// via OrgWorkspace.RequiredEnv for team-scoped credentials.
|
||||
RequiredEnv []EnvRequirement `yaml:"required_env" json:"required_env"`
|
||||
// RecommendedEnv is the "nice-to-have" tier — import still succeeds
|
||||
// without them, but features degrade. Same single|any_of shape as
|
||||
// RequiredEnv so a recommended OR group reads "set any one of these
|
||||
// to unlock the feature; all missing = warning".
|
||||
RecommendedEnv []EnvRequirement `yaml:"recommended_env" json:"recommended_env"`
|
||||
}
|
||||
|
||||
type OrgDefaults struct {
|
||||
@ -295,7 +410,17 @@ type OrgWorkspace struct {
|
||||
X float64 `yaml:"x" json:"x"`
|
||||
Y float64 `yaml:"y" json:"y"`
|
||||
} `yaml:"canvas" json:"canvas"`
|
||||
Children []OrgWorkspace `yaml:"children" json:"children"`
|
||||
// RequiredEnv / RecommendedEnv declared at the workspace level
|
||||
// narrow down what a specific team needs beyond the org-wide union.
|
||||
// When GET /org/templates walks the tree, these flow up into
|
||||
// OrgTemplate.RequiredEnv / RecommendedEnv. A workspace's subtree
|
||||
// inherits: a parent declaring ANTHROPIC_API_KEY as required
|
||||
// means every descendant considers it required too (no override
|
||||
// needed at each leaf). Same single|any_of shape as the org-level
|
||||
// lists.
|
||||
RequiredEnv []EnvRequirement `yaml:"required_env" json:"required_env"`
|
||||
RecommendedEnv []EnvRequirement `yaml:"recommended_env" json:"recommended_env"`
|
||||
Children []OrgWorkspace `yaml:"children" json:"children"`
|
||||
}
|
||||
|
||||
// ListTemplates handles GET /org/templates — lists available org templates.
|
||||
@ -354,11 +479,18 @@ func (h *OrgHandler) ListTemplates(c *gin.Context) {
|
||||
continue
|
||||
}
|
||||
count := countWorkspaces(tmpl.Workspaces)
|
||||
// Walk the tree to collect required + recommended env union.
|
||||
// Canvas uses these to render a preflight modal BEFORE firing
|
||||
// the import — saves the user from a 15-workspace import that
|
||||
// dies one container at a time on missing creds.
|
||||
required, recommended := collectOrgEnv(&tmpl)
|
||||
templates = append(templates, map[string]interface{}{
|
||||
"dir": e.Name(),
|
||||
"name": tmpl.Name,
|
||||
"description": tmpl.Description,
|
||||
"workspaces": count,
|
||||
"dir": e.Name(),
|
||||
"name": tmpl.Name,
|
||||
"description": tmpl.Description,
|
||||
"workspaces": count,
|
||||
"required_env": required,
|
||||
"recommended_env": recommended,
|
||||
})
|
||||
}
|
||||
|
||||
@ -370,6 +502,13 @@ func (h *OrgHandler) Import(c *gin.Context) {
|
||||
var body struct {
|
||||
Dir string `json:"dir"` // org template directory name
|
||||
Template OrgTemplate `json:"template"` // or inline template
|
||||
// Force skips the required-env preflight. Used by tooling
|
||||
// that already computed the preflight client-side and wants
|
||||
// to proceed despite missing creds (usually because the
|
||||
// user explicitly acknowledged the tradeoff). Default behavior
|
||||
// refuses the import with a 412 and the missing-key list so
|
||||
// the canvas can surface them in its preflight modal.
|
||||
Force bool `json:"force"`
|
||||
}
|
||||
if err := c.ShouldBindJSON(&body); err != nil {
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": "invalid request body"})
|
||||
@ -415,6 +554,59 @@ func (h *OrgHandler) Import(c *gin.Context) {
|
||||
return
|
||||
}
|
||||
|
||||
// Required-env preflight — refuses import when any required_env is
|
||||
// missing from global_secrets (unless `force: true` overrides). The
|
||||
// canvas runs the same check client-side against GET /org/templates
|
||||
// output and shows a modal so users set keys before clicking Import;
|
||||
// this server-side check is the authoritative guard in case a caller
|
||||
// bypasses the UI (CLI, API clients, etc.). 412 Precondition Failed
|
||||
// carries the missing-key list so tooling can render the same
|
||||
// add-key flow.
|
||||
required, _ := collectOrgEnv(&tmpl)
|
||||
if body.Force {
|
||||
// Log the bypass so a post-incident search can find who
|
||||
// imported an org with missing creds. The common audit flow
|
||||
// treats log.Printf at INFO as the low-cost trail for
|
||||
// explicit-override actions — keeps force as a supported
|
||||
// knob but makes it investigable.
|
||||
log.Printf("Org import: force=true bypass — template=%q, required_env=%v", tmpl.Name, required)
|
||||
} else if len(required) > 0 {
|
||||
ctx := c.Request.Context()
|
||||
configured, err := loadConfiguredGlobalSecretKeys(ctx)
|
||||
if err != nil {
|
||||
// Fail closed. Previously this fell through and imported
|
||||
// anyway, defeating the preflight for exactly the case
|
||||
// it's meant to cover. A DB hiccup should look like a
|
||||
// retryable 500, not a silent green light for an import
|
||||
// that will fail at container-start time on every node.
|
||||
log.Printf("Org import preflight: global secrets lookup failed: %v", err)
|
||||
c.JSON(http.StatusInternalServerError, gin.H{
|
||||
"error": "could not verify required environment variables; try again or pass force=true to override",
|
||||
})
|
||||
return
|
||||
}
|
||||
var missing []EnvRequirement
|
||||
for _, req := range required {
|
||||
// For a single requirement this is exact-match; for an
|
||||
// any-of group, any one member satisfies. Groups whose
|
||||
// alternative is already configured drop out here — the
|
||||
// user doesn't need to re-configure them.
|
||||
if !req.IsSatisfied(configured) {
|
||||
missing = append(missing, req)
|
||||
}
|
||||
}
|
||||
if len(missing) > 0 {
|
||||
c.JSON(http.StatusPreconditionFailed, gin.H{
|
||||
"error": "missing required environment variables",
|
||||
"missing_env": missing,
|
||||
"required_env": required,
|
||||
"template": tmpl.Name,
|
||||
"suggestion": "set these as global secrets (POST /settings/secrets) or pass force=true to override",
|
||||
})
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
results := []map[string]interface{}{}
|
||||
var createErr error
|
||||
|
||||
@ -426,7 +618,8 @@ func (h *OrgHandler) Import(c *gin.Context) {
|
||||
// using subtree-aware grid slots (children that are themselves
|
||||
// parents get a bigger slot so they don't overflow into siblings).
|
||||
for _, ws := range tmpl.Workspaces {
|
||||
if err := h.createWorkspaceTree(ws, nil, ws.Canvas.X, ws.Canvas.Y, tmpl.Defaults, orgBaseDir, &results, provisionSem); err != nil {
|
||||
// Root: relX/relY == absX/absY (no parent to be relative to).
|
||||
if err := h.createWorkspaceTree(ws, nil, ws.Canvas.X, ws.Canvas.Y, ws.Canvas.X, ws.Canvas.Y, tmpl.Defaults, orgBaseDir, &results, provisionSem); err != nil {
|
||||
createErr = err
|
||||
break
|
||||
}
|
||||
|
||||
@ -10,6 +10,8 @@ import (
|
||||
"log"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"sort"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
@ -28,7 +30,13 @@ import (
|
||||
// parent.abs + childSlotInGrid(index, siblingSizes) computed by the
|
||||
// caller. Storing already-absolute coords means a child that is itself
|
||||
// a parent can simply compound the grid without any per-call math.
|
||||
func (h *OrgHandler) createWorkspaceTree(ws OrgWorkspace, parentID *string, absX, absY float64, defaults OrgDefaults, orgBaseDir string, results *[]map[string]interface{}, provisionSem chan struct{}) error {
|
||||
// relX / relY are THIS workspace's position RELATIVE to its parent's
|
||||
// absolute origin (i.e. childSlotInGrid output for children; 0,0 for
|
||||
// roots since a root's absolute IS its relative). The broadcast
|
||||
// payload ships relative coords so the canvas can drop the node
|
||||
// straight into the parent's child-coordinate space without doing a
|
||||
// canvas-wide absolute-position walk.
|
||||
func (h *OrgHandler) createWorkspaceTree(ws OrgWorkspace, parentID *string, absX, absY, relX, relY float64, defaults OrgDefaults, orgBaseDir string, results *[]map[string]interface{}, provisionSem chan struct{}) error {
|
||||
// Apply defaults
|
||||
runtime := ws.Runtime
|
||||
if runtime == "" {
|
||||
@ -128,10 +136,23 @@ func (h *OrgHandler) createWorkspaceTree(ws OrgWorkspace, parentID *string, absX
|
||||
}
|
||||
|
||||
// Broadcast — include runtime so the canvas pill renders the right
|
||||
// badge immediately instead of "unknown".
|
||||
h.broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_PROVISIONING", id, map[string]interface{}{
|
||||
// badge immediately instead of "unknown". parent_id + x/y let the
|
||||
// canvas's org-deploy animation spawn the child from the parent's
|
||||
// current coords and tween into its reserved slot, instead of
|
||||
// landing in a default grid position first and snapping on the
|
||||
// next hydrate.
|
||||
payload := map[string]interface{}{
|
||||
"name": ws.Name, "tier": tier, "runtime": runtime,
|
||||
})
|
||||
// Parent-relative coords — the canvas's React Flow node uses
|
||||
// these as the node's position when parent_id is set (React
|
||||
// Flow treats node.position as parent-relative when the node
|
||||
// has a parentId). For roots, relX/relY == absX/absY.
|
||||
"x": relX, "y": relY,
|
||||
}
|
||||
if parentID != nil {
|
||||
payload["parent_id"] = *parentID
|
||||
}
|
||||
h.broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_PROVISIONING", id, payload)
|
||||
|
||||
// Seed initial memories from workspace config or defaults (issue #1050).
|
||||
// Per-workspace initial_memories override defaults; if workspace has none,
|
||||
@ -509,7 +530,9 @@ func (h *OrgHandler) createWorkspaceTree(ws OrgWorkspace, parentID *string, absX
|
||||
slotX, slotY := childSlotInGrid(i, siblingSizes)
|
||||
childAbsX := absX + slotX
|
||||
childAbsY := absY + slotY
|
||||
if err := h.createWorkspaceTree(child, &id, childAbsX, childAbsY, defaults, orgBaseDir, results, provisionSem); err != nil {
|
||||
// slotX/slotY are already parent-relative — that's
|
||||
// exactly what childSlotInGrid returns.
|
||||
if err := h.createWorkspaceTree(child, &id, childAbsX, childAbsY, slotX, slotY, defaults, orgBaseDir, results, provisionSem); err != nil {
|
||||
return err
|
||||
}
|
||||
time.Sleep(workspaceCreatePacingMs * time.Millisecond)
|
||||
@ -519,6 +542,213 @@ func (h *OrgHandler) createWorkspaceTree(ws OrgWorkspace, parentID *string, absX
|
||||
return nil
|
||||
}
|
||||
|
||||
// envVarNamePattern guards template-supplied env var names against
// pathological inputs. A malicious template could ship
// required_env: ["'; DROP …"] or whitespace-only entries that would
// flow through collectOrgEnv → into the 412 response body and,
// worse, into the modal's PUT /settings/secrets input. Schema
// already has `key TEXT NOT NULL UNIQUE` and our queries are
// parameterised so SQL injection isn't the threat — the real risks
// are UI rendering weirdness (newlines, NUL bytes, zero-width chars)
// and downstream env-var semantics (POSIX requires uppercase +
// underscore + digit). A strict regex filters both classes of
// problem at a single choke point.
//
// Shape: uppercase letter first, then up to 127 more of [A-Z0-9_],
// i.e. a 128-character total cap on the name.
var envVarNamePattern = regexp.MustCompile(`^[A-Z][A-Z0-9_]{0,127}$`)
|
||||
|
||||
// sanitizeEnvMembers filters a requirement's member list through the
|
||||
// name-validation regex, logging rejections. Returns the filtered
|
||||
// list and a boolean indicating whether any valid members remain.
|
||||
// Used so a group containing one valid + one bogus name is kept
|
||||
// (valid member carries the group) rather than silently dropped.
|
||||
func sanitizeEnvMembers(members []string, where string) ([]string, bool) {
|
||||
out := make([]string, 0, len(members))
|
||||
for _, k := range members {
|
||||
if !envVarNamePattern.MatchString(k) {
|
||||
if k != "" {
|
||||
log.Printf("collectOrgEnv: rejecting invalid env var name %q from %s (must match %s)", k, where, envVarNamePattern)
|
||||
}
|
||||
continue
|
||||
}
|
||||
out = append(out, k)
|
||||
}
|
||||
return out, len(out) > 0
|
||||
}
|
||||
|
||||
// envRequirementKey canonicalises a requirement for dedup — sorted
|
||||
// member list joined with NUL so `any_of: [A, B]` and `any_of: [B, A]`
|
||||
// collapse to the same key. Single requirements are length-1 groups.
|
||||
func envRequirementKey(members []string) string {
|
||||
cp := append([]string(nil), members...)
|
||||
sort.Strings(cp)
|
||||
return strings.Join(cp, "\x00")
|
||||
}
|
||||
|
||||
// collectOrgEnv walks the whole template tree and returns the union of
|
||||
// required_env and recommended_env declared anywhere — at the org
|
||||
// level, on root workspaces, or on any nested child. Deduplicates by
|
||||
// group membership (same set of members = same requirement) and
|
||||
// sorts deterministically so the canvas sees a stable order.
|
||||
//
|
||||
// "Required wins" rules:
|
||||
//
|
||||
// - A requirement that appears in BOTH required and recommended
|
||||
// (same members) surfaces only as required.
|
||||
// - A single-name requirement (e.g. "API_KEY") and a group that
|
||||
// contains that same name (e.g. {any_of: [API_KEY, OTHER]}) are
|
||||
// NOT deduplicated — they're semantically different (strict vs
|
||||
// satisfiable-by-alternative) and the stricter "single" one wins,
|
||||
// so the any-of group is dropped when its members overlap with a
|
||||
// strict requirement declared elsewhere.
|
||||
//
|
||||
// Invalid names fail envVarNamePattern; the filter is applied per
|
||||
// group so a group with one bogus entry keeps the rest. A group
|
||||
// whose ALL members are invalid is dropped entirely with a log.
|
||||
func collectOrgEnv(tmpl *OrgTemplate) (required, recommended []EnvRequirement) {
|
||||
reqByKey := map[string]EnvRequirement{}
|
||||
recByKey := map[string]EnvRequirement{}
|
||||
// Names covered by strict (single) required entries. A group in
|
||||
// EITHER tier whose any-of contains ONE of these names is
|
||||
// dominated by the strict requirement and gets dropped on the
|
||||
// second pass.
|
||||
strictRequiredNames := map[string]struct{}{}
|
||||
|
||||
accept := func(into map[string]EnvRequirement, src []EnvRequirement, where string, markStrict bool) {
|
||||
for _, req := range src {
|
||||
members, ok := sanitizeEnvMembers(req.Members(), where)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
key := envRequirementKey(members)
|
||||
if _, exists := into[key]; exists {
|
||||
continue
|
||||
}
|
||||
if req.Name != "" && len(members) == 1 {
|
||||
into[key] = EnvRequirement{Name: members[0]}
|
||||
if markStrict {
|
||||
strictRequiredNames[members[0]] = struct{}{}
|
||||
}
|
||||
} else {
|
||||
into[key] = EnvRequirement{AnyOf: members}
|
||||
}
|
||||
}
|
||||
}
|
||||
accept(reqByKey, tmpl.RequiredEnv, "template root", true)
|
||||
accept(recByKey, tmpl.RecommendedEnv, "template root", false)
|
||||
var walk func([]OrgWorkspace)
|
||||
walk = func(ws []OrgWorkspace) {
|
||||
for _, w := range ws {
|
||||
accept(reqByKey, w.RequiredEnv, "workspace "+w.Name, true)
|
||||
accept(recByKey, w.RecommendedEnv, "workspace "+w.Name, false)
|
||||
walk(w.Children)
|
||||
}
|
||||
}
|
||||
walk(tmpl.Workspaces)
|
||||
|
||||
// Required wins across tiers: any requirement whose members
|
||||
// overlap with a strict required name gets dropped from
|
||||
// recommended. Keeps the canvas modal from showing the same
|
||||
// key in both sections.
|
||||
prune := func(from map[string]EnvRequirement) {
|
||||
for k, r := range from {
|
||||
for _, m := range r.Members() {
|
||||
if _, strict := strictRequiredNames[m]; strict {
|
||||
delete(from, k)
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
prune(recByKey)
|
||||
|
||||
// Same-tier: a strict required X dominates any-of groups in
|
||||
// required that CONTAIN X (a group saying "any of X, Y" is
|
||||
// automatically satisfied when X is required anyway, so it's
|
||||
// redundant). Same logic applied to recommended.
|
||||
pruneSameTier := func(tier map[string]EnvRequirement) {
|
||||
strictInTier := map[string]struct{}{}
|
||||
for _, r := range tier {
|
||||
if r.Name != "" {
|
||||
strictInTier[r.Name] = struct{}{}
|
||||
}
|
||||
}
|
||||
for k, r := range tier {
|
||||
if len(r.AnyOf) == 0 {
|
||||
continue
|
||||
}
|
||||
for _, m := range r.AnyOf {
|
||||
if _, strict := strictInTier[m]; strict {
|
||||
delete(tier, k)
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
pruneSameTier(reqByKey)
|
||||
pruneSameTier(recByKey)
|
||||
|
||||
required = flattenAndSortRequirements(reqByKey)
|
||||
recommended = flattenAndSortRequirements(recByKey)
|
||||
return required, recommended
|
||||
}
|
||||
|
||||
func flattenAndSortRequirements(by map[string]EnvRequirement) []EnvRequirement {
|
||||
out := make([]EnvRequirement, 0, len(by))
|
||||
for _, r := range by {
|
||||
out = append(out, r)
|
||||
}
|
||||
sort.Slice(out, func(i, j int) bool {
|
||||
// Sort singles first by name; groups after, ordered by
|
||||
// joined-member string. Gives the canvas a deterministic
|
||||
// render order so the same template always produces the
|
||||
// same modal layout.
|
||||
iSingle := out[i].Name != ""
|
||||
jSingle := out[j].Name != ""
|
||||
if iSingle != jSingle {
|
||||
return iSingle
|
||||
}
|
||||
if iSingle {
|
||||
return out[i].Name < out[j].Name
|
||||
}
|
||||
return envRequirementKey(out[i].AnyOf) < envRequirementKey(out[j].AnyOf)
|
||||
})
|
||||
return out
|
||||
}
|
||||
|
||||
// loadConfiguredGlobalSecretKeys returns the set of key names present
|
||||
// in global_secrets WHERE the encrypted_value is non-empty. Filtering
|
||||
// on the payload size catches the failure mode where a row was
|
||||
// upserted with an empty value (historical rows predating the
|
||||
// binding:"required" guard on SetGlobal, or a future direct SQL
|
||||
// path that skips it) — the preflight would otherwise report the
|
||||
// key as "configured" and the per-container preflight would still
|
||||
// fail at start time, defeating the whole feature.
|
||||
// The LIMIT is a sanity cap: at realistic tenant sizes (< 1k
|
||||
// secrets) it's a no-op; at pathological sizes it stops one slow
|
||||
// query from wedging org imports. A hit gets logged so operators
|
||||
// can investigate.
|
||||
const globalSecretsPreflightLimit = 10000
|
||||
|
||||
func loadConfiguredGlobalSecretKeys(ctx context.Context) (map[string]struct{}, error) {
|
||||
rows, err := db.DB.QueryContext(ctx,
|
||||
`SELECT key FROM global_secrets WHERE octet_length(encrypted_value) > 0 LIMIT $1`,
|
||||
globalSecretsPreflightLimit)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
out := map[string]struct{}{}
|
||||
for rows.Next() {
|
||||
var k string
|
||||
if scanErr := rows.Scan(&k); scanErr == nil && k != "" {
|
||||
out[k] = struct{}{}
|
||||
}
|
||||
}
|
||||
if len(out) == globalSecretsPreflightLimit {
|
||||
log.Printf("loadConfiguredGlobalSecretKeys: hit LIMIT %d — org-import preflight may be incomplete", globalSecretsPreflightLimit)
|
||||
}
|
||||
return out, rows.Err()
|
||||
}
|
||||
|
||||
func countWorkspaces(workspaces []OrgWorkspace) int {
|
||||
count := len(workspaces)
|
||||
for _, ws := range workspaces {
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
package handlers
|
||||
|
||||
import (
|
||||
"sort"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
@ -650,3 +651,428 @@ func TestOrgImport_ScheduleComputeError(t *testing.T) {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Org env-preflight aggregation (collectOrgEnv)
|
||||
// ============================================================================
|
||||
|
||||
// strictReq builds a slice of single-name EnvRequirements for test
|
||||
// fixtures. Equivalent to the old []string literal but wrapped in
|
||||
// the new union shape.
|
||||
func strictReq(names ...string) []EnvRequirement {
|
||||
out := make([]EnvRequirement, 0, len(names))
|
||||
for _, n := range names {
|
||||
out = append(out, EnvRequirement{Name: n})
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// anyOfReq builds a single any-of EnvRequirement for test fixtures.
|
||||
func anyOfReq(names ...string) EnvRequirement {
|
||||
return EnvRequirement{AnyOf: append([]string(nil), names...)}
|
||||
}
|
||||
|
||||
// reqNames flattens a slice of EnvRequirements into a single comparable
|
||||
// slice: single-name reqs contribute their Name, any-of reqs contribute
|
||||
// "anyOf(A|B|C)" with members sorted for deterministic output. Lets
|
||||
// tests assert against a string form regardless of which kind each
|
||||
// entry takes.
|
||||
func reqNames(reqs []EnvRequirement) []string {
|
||||
out := make([]string, 0, len(reqs))
|
||||
for _, r := range reqs {
|
||||
if r.Name != "" {
|
||||
out = append(out, r.Name)
|
||||
continue
|
||||
}
|
||||
members := append([]string(nil), r.AnyOf...)
|
||||
sort.Strings(members)
|
||||
out = append(out, "anyOf("+strings.Join(members, "|")+")")
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func TestCollectOrgEnv_UnionAcrossLevels(t *testing.T) {
|
||||
tmpl := &OrgTemplate{
|
||||
RequiredEnv: strictReq("ANTHROPIC_API_KEY"),
|
||||
RecommendedEnv: strictReq("SLACK_WEBHOOK_URL"),
|
||||
Workspaces: []OrgWorkspace{
|
||||
{
|
||||
Name: "Root",
|
||||
RequiredEnv: strictReq("GITHUB_TOKEN"),
|
||||
Children: []OrgWorkspace{
|
||||
{
|
||||
Name: "Leaf",
|
||||
RequiredEnv: strictReq("OPENROUTER_API_KEY"),
|
||||
RecommendedEnv: strictReq("DISCORD_WEBHOOK_URL"),
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
req, rec := collectOrgEnv(tmpl)
|
||||
// Required is the union of top-level + root + leaf.
|
||||
wantReq := []string{"ANTHROPIC_API_KEY", "GITHUB_TOKEN", "OPENROUTER_API_KEY"}
|
||||
if !stringSlicesEqual(reqNames(req), wantReq) {
|
||||
t.Errorf("required mismatch: got %v, want %v", reqNames(req), wantReq)
|
||||
}
|
||||
wantRec := []string{"DISCORD_WEBHOOK_URL", "SLACK_WEBHOOK_URL"}
|
||||
if !stringSlicesEqual(reqNames(rec), wantRec) {
|
||||
t.Errorf("recommended mismatch: got %v, want %v", reqNames(rec), wantRec)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCollectOrgEnv_RequiredWinsOverRecommended(t *testing.T) {
|
||||
// Same key declared at one layer as recommended and another as
|
||||
// required MUST surface only on the required side — a required
|
||||
// declaration is strictly stricter than a recommended one, and
|
||||
// listing it in both tiers would confuse the preflight modal.
|
||||
tmpl := &OrgTemplate{
|
||||
RecommendedEnv: strictReq("API_KEY"),
|
||||
Workspaces: []OrgWorkspace{
|
||||
{Name: "X", RequiredEnv: strictReq("API_KEY")},
|
||||
},
|
||||
}
|
||||
req, rec := collectOrgEnv(tmpl)
|
||||
if len(req) != 1 || req[0].Name != "API_KEY" {
|
||||
t.Errorf("required should contain API_KEY, got %v", reqNames(req))
|
||||
}
|
||||
for _, r := range rec {
|
||||
if r.Name == "API_KEY" {
|
||||
t.Errorf("API_KEY must not appear in recommended once required elsewhere")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestCollectOrgEnv_Dedup(t *testing.T) {
|
||||
// Same key declared twice at different levels should appear once.
|
||||
tmpl := &OrgTemplate{
|
||||
RequiredEnv: strictReq("K", "K"),
|
||||
Workspaces: []OrgWorkspace{
|
||||
{Name: "A", RequiredEnv: strictReq("K")},
|
||||
{Name: "B", RequiredEnv: strictReq("K"), Children: []OrgWorkspace{
|
||||
{Name: "C", RequiredEnv: strictReq("K")},
|
||||
}},
|
||||
},
|
||||
}
|
||||
req, _ := collectOrgEnv(tmpl)
|
||||
if len(req) != 1 || req[0].Name != "K" {
|
||||
t.Errorf("dedup failed: got %v, want [K]", reqNames(req))
|
||||
}
|
||||
}
|
||||
|
||||
func TestCollectOrgEnv_Empty(t *testing.T) {
|
||||
tmpl := &OrgTemplate{}
|
||||
req, rec := collectOrgEnv(tmpl)
|
||||
if len(req) != 0 || len(rec) != 0 {
|
||||
t.Errorf("empty template should produce empty slices, got req=%v rec=%v", reqNames(req), reqNames(rec))
|
||||
}
|
||||
}
|
||||
|
||||
// stringSlicesEqual checks ordered equality — collectOrgEnv sorts its
|
||||
// output so callers can do deterministic comparisons.
|
||||
func stringSlicesEqual(a, b []string) bool {
|
||||
if len(a) != len(b) {
|
||||
return false
|
||||
}
|
||||
for i := range a {
|
||||
if a[i] != b[i] {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
func TestCollectOrgEnv_RequiredWinsOnSameStruct(t *testing.T) {
|
||||
// The same key declared required AND recommended on the SAME
|
||||
// workspace node (rare but legal to parse) must still dedup
|
||||
// correctly and end up required-only.
|
||||
tmpl := &OrgTemplate{
|
||||
Workspaces: []OrgWorkspace{
|
||||
{
|
||||
Name: "X",
|
||||
RequiredEnv: strictReq("API_KEY"),
|
||||
RecommendedEnv: strictReq("API_KEY"),
|
||||
},
|
||||
},
|
||||
}
|
||||
req, rec := collectOrgEnv(tmpl)
|
||||
if len(req) != 1 || req[0].Name != "API_KEY" {
|
||||
t.Errorf("required should contain API_KEY once, got %v", reqNames(req))
|
||||
}
|
||||
for _, r := range rec {
|
||||
if r.Name == "API_KEY" {
|
||||
t.Errorf("API_KEY must not appear in recommended when also required on same struct")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestCollectOrgEnv_RejectsInvalidNames(t *testing.T) {
|
||||
// Names failing envVarNamePattern (lowercase, traversal, whitespace,
|
||||
// shell metachars) must be dropped silently — the log line is not
|
||||
// asserted here; the output slice assertion is enough to prove the
|
||||
// filter fires.
|
||||
tmpl := &OrgTemplate{
|
||||
RequiredEnv: strictReq(
|
||||
"VALID_ONE",
|
||||
"lowercase_bad",
|
||||
"../../etc/passwd",
|
||||
"name with spaces",
|
||||
"WITH-DASH",
|
||||
"'; DROP TABLE users;--",
|
||||
"",
|
||||
"A", // single char — still valid per regex
|
||||
),
|
||||
}
|
||||
req, _ := collectOrgEnv(tmpl)
|
||||
if !stringSlicesEqual(reqNames(req), []string{"A", "VALID_ONE"}) {
|
||||
t.Errorf("expected only valid names, got %v", reqNames(req))
|
||||
}
|
||||
}
|
||||
|
||||
// TestOrgTemplate_ClaudeAnyOfAuthPreflight exercises the shape the
|
||||
// ux-ab-lab template ships with: a single any-of group at the org
|
||||
// level covering ANTHROPIC_API_KEY vs. CLAUDE_CODE_OAUTH_TOKEN, plus
|
||||
// two strict recommended entries (SERPER_API_KEY, VERCEL_TOKEN).
|
||||
// Proves the end-to-end YAML → OrgTemplate → collectOrgEnv → IsSatisfied
|
||||
// pipeline works for the canonical "Claude sub OR API key" pattern
|
||||
// without depending on the on-disk template file (org-templates/ is
|
||||
// populated by the clone-manifest, not tracked in this monorepo).
|
||||
func TestOrgTemplate_ClaudeAnyOfAuthPreflight(t *testing.T) {
|
||||
src := `
|
||||
name: UX A/B Lab
|
||||
required_env:
|
||||
- any_of:
|
||||
- ANTHROPIC_API_KEY
|
||||
- CLAUDE_CODE_OAUTH_TOKEN
|
||||
recommended_env:
|
||||
- SERPER_API_KEY
|
||||
- VERCEL_TOKEN
|
||||
workspaces:
|
||||
- name: Design Director
|
||||
children:
|
||||
- name: UX Researcher
|
||||
- name: Visual Designer
|
||||
- name: React Engineer
|
||||
- name: Deploy Engineer
|
||||
- name: A11y + SEO Auditor
|
||||
- name: Perf Auditor
|
||||
`
|
||||
var tmpl OrgTemplate
|
||||
if err := yaml.Unmarshal([]byte(src), &tmpl); err != nil {
|
||||
t.Fatalf("unmarshal: %v", err)
|
||||
}
|
||||
if len(tmpl.Workspaces) != 1 || len(tmpl.Workspaces[0].Children) != 6 {
|
||||
t.Fatalf("expected 1 root with 6 children, got shape %+v", tmpl.Workspaces)
|
||||
}
|
||||
|
||||
required, recommended := collectOrgEnv(&tmpl)
|
||||
if len(required) != 1 {
|
||||
t.Fatalf("expected 1 required requirement (the any-of group), got %d: %v", len(required), reqNames(required))
|
||||
}
|
||||
if required[0].Name != "" {
|
||||
t.Errorf("expected any-of group, got strict name %q", required[0].Name)
|
||||
}
|
||||
wantMembers := []string{"ANTHROPIC_API_KEY", "CLAUDE_CODE_OAUTH_TOKEN"}
|
||||
got := append([]string(nil), required[0].AnyOf...)
|
||||
sort.Strings(got)
|
||||
if !stringSlicesEqual(got, wantMembers) {
|
||||
t.Errorf("any-of members mismatch: got %v, want %v", got, wantMembers)
|
||||
}
|
||||
|
||||
// Either member should independently satisfy the group.
|
||||
if !required[0].IsSatisfied(map[string]struct{}{"ANTHROPIC_API_KEY": {}}) {
|
||||
t.Errorf("ANTHROPIC_API_KEY alone should satisfy the group")
|
||||
}
|
||||
if !required[0].IsSatisfied(map[string]struct{}{"CLAUDE_CODE_OAUTH_TOKEN": {}}) {
|
||||
t.Errorf("CLAUDE_CODE_OAUTH_TOKEN alone should satisfy the group")
|
||||
}
|
||||
if required[0].IsSatisfied(map[string]struct{}{"OPENAI_API_KEY": {}}) {
|
||||
t.Errorf("unrelated key should NOT satisfy the group")
|
||||
}
|
||||
|
||||
wantRec := []string{"SERPER_API_KEY", "VERCEL_TOKEN"}
|
||||
if !stringSlicesEqual(reqNames(recommended), wantRec) {
|
||||
t.Errorf("recommended mismatch: got %v, want %v", reqNames(recommended), wantRec)
|
||||
}
|
||||
}
|
||||
|
||||
// TestEnvRequirement_UnmarshalYAML proves the on-disk YAML shape
|
||||
// (scalar OR `{any_of: [...]}` block) round-trips into EnvRequirement
|
||||
// correctly. The preflight pipeline reads user-authored org.yaml
|
||||
// files; a regression here would silently drop requirements.
|
||||
func TestEnvRequirement_UnmarshalYAML(t *testing.T) {
|
||||
src := `
|
||||
required_env:
|
||||
- GITHUB_TOKEN
|
||||
- any_of:
|
||||
- ANTHROPIC_API_KEY
|
||||
- CLAUDE_CODE_OAUTH_TOKEN
|
||||
`
|
||||
var parsed struct {
|
||||
RequiredEnv []EnvRequirement `yaml:"required_env"`
|
||||
}
|
||||
if err := yaml.Unmarshal([]byte(src), &parsed); err != nil {
|
||||
t.Fatalf("unmarshal failed: %v", err)
|
||||
}
|
||||
if len(parsed.RequiredEnv) != 2 {
|
||||
t.Fatalf("want 2 requirements, got %d", len(parsed.RequiredEnv))
|
||||
}
|
||||
if parsed.RequiredEnv[0].Name != "GITHUB_TOKEN" {
|
||||
t.Errorf("first should be strict GITHUB_TOKEN, got %+v", parsed.RequiredEnv[0])
|
||||
}
|
||||
if parsed.RequiredEnv[1].Name != "" || len(parsed.RequiredEnv[1].AnyOf) != 2 {
|
||||
t.Errorf("second should be any-of group, got %+v", parsed.RequiredEnv[1])
|
||||
}
|
||||
}
|
||||
|
||||
// TestEnvRequirement_UnmarshalYAML_RejectsEmptyAnyOf guards against a
|
||||
// template that ships `any_of: []` — ambiguous semantics (impossible
|
||||
// to satisfy), so the parser must fail loudly rather than silently
|
||||
// pass a never-satisfiable requirement through the preflight.
|
||||
func TestEnvRequirement_UnmarshalYAML_RejectsEmptyAnyOf(t *testing.T) {
|
||||
src := `
|
||||
required_env:
|
||||
- any_of: []
|
||||
`
|
||||
var parsed struct {
|
||||
RequiredEnv []EnvRequirement `yaml:"required_env"`
|
||||
}
|
||||
err := yaml.Unmarshal([]byte(src), &parsed)
|
||||
if err == nil {
|
||||
t.Errorf("expected error for empty any_of, got nil: %+v", parsed)
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------
|
||||
// any_of group tests — the new EnvRequirement union shape allows a
|
||||
// single requirement to be satisfied by any of a list of members (e.g.
|
||||
// ANTHROPIC_API_KEY OR CLAUDE_CODE_OAUTH_TOKEN). collectOrgEnv +
|
||||
// IsSatisfied together must handle this correctly.
|
||||
// ---------------------------------------------------------------------
|
||||
|
||||
func TestEnvRequirement_IsSatisfied(t *testing.T) {
|
||||
configured := map[string]struct{}{
|
||||
"ANTHROPIC_API_KEY": {},
|
||||
"GITHUB_TOKEN": {},
|
||||
}
|
||||
tests := []struct {
|
||||
name string
|
||||
req EnvRequirement
|
||||
want bool
|
||||
}{
|
||||
{"strict present", EnvRequirement{Name: "ANTHROPIC_API_KEY"}, true},
|
||||
{"strict absent", EnvRequirement{Name: "MISSING_KEY"}, false},
|
||||
{"any-of first member present", anyOfReq("ANTHROPIC_API_KEY", "CLAUDE_CODE_OAUTH_TOKEN"), true},
|
||||
{"any-of second member present", anyOfReq("CLAUDE_CODE_OAUTH_TOKEN", "ANTHROPIC_API_KEY"), true},
|
||||
{"any-of none present", anyOfReq("OPENAI_API_KEY", "CLAUDE_CODE_OAUTH_TOKEN"), false},
|
||||
{"any-of single member present", anyOfReq("GITHUB_TOKEN"), true},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
if got := tt.req.IsSatisfied(configured); got != tt.want {
|
||||
t.Errorf("%s: got %v, want %v", tt.name, got, tt.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestCollectOrgEnv_AnyOfGroupPreserved(t *testing.T) {
|
||||
// A group with two alternatives should come through as a single
|
||||
// EnvRequirement carrying both members.
|
||||
tmpl := &OrgTemplate{
|
||||
RequiredEnv: []EnvRequirement{
|
||||
anyOfReq("ANTHROPIC_API_KEY", "CLAUDE_CODE_OAUTH_TOKEN"),
|
||||
},
|
||||
}
|
||||
req, _ := collectOrgEnv(tmpl)
|
||||
if len(req) != 1 {
|
||||
t.Fatalf("expected 1 requirement, got %d: %v", len(req), reqNames(req))
|
||||
}
|
||||
if req[0].Name != "" {
|
||||
t.Errorf("expected any-of group, got strict name %q", req[0].Name)
|
||||
}
|
||||
wantMembers := []string{"ANTHROPIC_API_KEY", "CLAUDE_CODE_OAUTH_TOKEN"}
|
||||
got := append([]string(nil), req[0].AnyOf...)
|
||||
sort.Strings(got)
|
||||
if !stringSlicesEqual(got, wantMembers) {
|
||||
t.Errorf("any-of members mismatch: got %v, want %v", got, wantMembers)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCollectOrgEnv_AnyOfGroupDedup(t *testing.T) {
|
||||
// Two identical groups (members in different order) declared at
|
||||
// different levels must collapse to one.
|
||||
tmpl := &OrgTemplate{
|
||||
RequiredEnv: []EnvRequirement{
|
||||
anyOfReq("ANTHROPIC_API_KEY", "CLAUDE_CODE_OAUTH_TOKEN"),
|
||||
},
|
||||
Workspaces: []OrgWorkspace{
|
||||
{
|
||||
Name: "Root",
|
||||
RequiredEnv: []EnvRequirement{
|
||||
anyOfReq("CLAUDE_CODE_OAUTH_TOKEN", "ANTHROPIC_API_KEY"),
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
req, _ := collectOrgEnv(tmpl)
|
||||
if len(req) != 1 {
|
||||
t.Errorf("expected 1 requirement after dedup, got %d: %v", len(req), reqNames(req))
|
||||
}
|
||||
}
|
||||
|
||||
func TestCollectOrgEnv_StrictDominatesGroup(t *testing.T) {
|
||||
// If a strict requirement X is declared anywhere, any-of groups
|
||||
// that CONTAIN X are redundant — the strict requirement will force
|
||||
// X to be configured, which satisfies any group mentioning it too.
|
||||
// Same-tier pruning drops the group.
|
||||
tmpl := &OrgTemplate{
|
||||
RequiredEnv: []EnvRequirement{
|
||||
{Name: "ANTHROPIC_API_KEY"},
|
||||
anyOfReq("ANTHROPIC_API_KEY", "CLAUDE_CODE_OAUTH_TOKEN"),
|
||||
},
|
||||
}
|
||||
req, _ := collectOrgEnv(tmpl)
|
||||
if len(req) != 1 || req[0].Name != "ANTHROPIC_API_KEY" {
|
||||
t.Errorf("strict should dominate group, got %v", reqNames(req))
|
||||
}
|
||||
}
|
||||
|
||||
func TestCollectOrgEnv_StrictRequiredDominatesRecommendedGroup(t *testing.T) {
|
||||
// Cross-tier: a strict required X drops any-of groups in the
|
||||
// recommended tier that mention X.
|
||||
tmpl := &OrgTemplate{
|
||||
RequiredEnv: strictReq("ANTHROPIC_API_KEY"),
|
||||
RecommendedEnv: []EnvRequirement{
|
||||
anyOfReq("ANTHROPIC_API_KEY", "CLAUDE_CODE_OAUTH_TOKEN"),
|
||||
{Name: "SLACK_WEBHOOK_URL"},
|
||||
},
|
||||
}
|
||||
req, rec := collectOrgEnv(tmpl)
|
||||
if len(req) != 1 || req[0].Name != "ANTHROPIC_API_KEY" {
|
||||
t.Errorf("required mismatch: got %v", reqNames(req))
|
||||
}
|
||||
// The any-of group should have been pruned; only SLACK remains.
|
||||
if len(rec) != 1 || rec[0].Name != "SLACK_WEBHOOK_URL" {
|
||||
t.Errorf("recommended mismatch: got %v, want [SLACK_WEBHOOK_URL]", reqNames(rec))
|
||||
}
|
||||
}
|
||||
|
||||
func TestCollectOrgEnv_AnyOfWithInvalidMemberKeepsValidOnes(t *testing.T) {
|
||||
// A group with one valid + one invalid member should keep the
|
||||
// valid one (group carried by any remaining legitimate name). A
|
||||
// group where ALL members are invalid is dropped entirely.
|
||||
tmpl := &OrgTemplate{
|
||||
RequiredEnv: []EnvRequirement{
|
||||
anyOfReq("VALID_ONE", "lowercase_bad"),
|
||||
anyOfReq("'; DROP TABLE;--", ""),
|
||||
},
|
||||
}
|
||||
req, _ := collectOrgEnv(tmpl)
|
||||
if len(req) != 1 {
|
||||
t.Fatalf("expected 1 requirement, got %d: %v", len(req), reqNames(req))
|
||||
}
|
||||
// The remaining group has only one valid member, so it gets
|
||||
// promoted to a single-name requirement (len(members)==1 path).
|
||||
if req[0].Name != "VALID_ONE" && !stringSlicesEqual(req[0].AnyOf, []string{"VALID_ONE"}) {
|
||||
t.Errorf("expected VALID_ONE to survive, got %v", reqNames(req))
|
||||
}
|
||||
}
|
||||
|
||||
@ -454,6 +454,29 @@ func (h *RegistryHandler) evaluateStatus(c *gin.Context, payload models.Heartbea
|
||||
return
|
||||
}
|
||||
|
||||
// Self-reported runtime wedge: takes precedence over the error_rate
|
||||
// path. The heartbeat task lives in its own asyncio task and keeps
|
||||
// firing 200s even after claude_agent_sdk locks up on
|
||||
// `Control request timeout: initialize` — so error_rate stays at 0
|
||||
// (no calls have been recorded as errors yet) while every actual
|
||||
// /a2a POST hangs. The workspace tells us about that case via
|
||||
// runtime_state="wedged"; we honor it directly. Sample_error from
|
||||
// the heartbeat carries the human-readable reason ("SDK init
|
||||
// timeout — restart workspace"), which the canvas surfaces in the
|
||||
// degraded card without the operator scraping container logs.
|
||||
if payload.RuntimeState == "wedged" && currentStatus == "online" {
|
||||
_, err := db.DB.ExecContext(ctx,
|
||||
`UPDATE workspaces SET status = 'degraded', updated_at = now() WHERE id = $1 AND status = 'online'`,
|
||||
payload.WorkspaceID)
|
||||
if err != nil {
|
||||
log.Printf("Heartbeat: failed to mark %s degraded (wedged): %v", payload.WorkspaceID, err)
|
||||
}
|
||||
h.broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_DEGRADED", payload.WorkspaceID, map[string]interface{}{
|
||||
"runtime_state": "wedged",
|
||||
"sample_error": payload.SampleError,
|
||||
})
|
||||
}
|
||||
|
||||
if currentStatus == "online" && payload.ErrorRate >= 0.5 {
|
||||
if _, err := db.DB.ExecContext(ctx, `UPDATE workspaces SET status = 'degraded', updated_at = now() WHERE id = $1`, payload.WorkspaceID); err != nil {
|
||||
log.Printf("Heartbeat: failed to mark %s degraded: %v", payload.WorkspaceID, err)
|
||||
@ -464,7 +487,13 @@ func (h *RegistryHandler) evaluateStatus(c *gin.Context, payload models.Heartbea
|
||||
})
|
||||
}
|
||||
|
||||
if currentStatus == "degraded" && payload.ErrorRate < 0.1 {
|
||||
// Recovery from degraded → online when BOTH the error rate has
|
||||
// fallen back AND the workspace is no longer reporting a wedge.
|
||||
// The wedge condition is sticky for the process lifetime
|
||||
// (claude_sdk_executor only clears it on restart), so when the
|
||||
// container restarts and starts heartbeating fresh — RuntimeState
|
||||
// is empty, error_rate is 0 — this branch flips us back to online.
|
||||
if currentStatus == "degraded" && payload.ErrorRate < 0.1 && payload.RuntimeState == "" {
|
||||
if _, err := db.DB.ExecContext(ctx, `UPDATE workspaces SET status = 'online', updated_at = now() WHERE id = $1`, payload.WorkspaceID); err != nil {
|
||||
log.Printf("Heartbeat: failed to recover %s to online: %v", payload.WorkspaceID, err)
|
||||
}
|
||||
|
||||
@ -298,6 +298,163 @@ func TestHeartbeatHandler_OnlineStaysOnline(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// ==================== Heartbeat — runtime wedge (claude_agent_sdk init timeout) ====================
|
||||
|
||||
// TestHeartbeatHandler_RuntimeWedged_FlipsOnlineToDegraded verifies the
|
||||
// runtime_state="wedged" path. Heartbeat task in the workspace lives in
|
||||
// its own asyncio task and keeps reporting online while the Claude SDK
|
||||
// is wedged on Control request timeout; the workspace tells us about
|
||||
// the wedge via this field, and we honor it by flipping status →
|
||||
// degraded with the wedge reason in last_sample_error.
|
||||
func TestHeartbeatHandler_RuntimeWedged_FlipsOnlineToDegraded(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
setupTestRedis(t)
|
||||
broadcaster := newTestBroadcaster()
|
||||
handler := NewRegistryHandler(broadcaster)
|
||||
|
||||
wedgeMsg := "claude_agent_sdk wedge: Control request timeout: initialize — restart workspace to recover"
|
||||
|
||||
mock.ExpectQuery("SELECT COALESCE\\(current_task").
|
||||
WithArgs("ws-wedged").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"current_task"}).AddRow(""))
|
||||
|
||||
// Heartbeat UPDATE — sample_error carries the wedge reason from the
|
||||
// workspace's _runtime_state_payload() helper.
|
||||
mock.ExpectExec("UPDATE workspaces SET").
|
||||
WithArgs("ws-wedged", 0.0, wedgeMsg, 0, 600, "").
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
// evaluateStatus: currentStatus = online
|
||||
mock.ExpectQuery("SELECT status FROM workspaces WHERE id =").
|
||||
WithArgs("ws-wedged").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status"}).AddRow("online"))
|
||||
|
||||
// The wedge-handling branch fires the degraded UPDATE with the
|
||||
// `AND status = 'online'` guard (race-safe against concurrent
|
||||
// removal). Match the SQL with the guard included.
|
||||
mock.ExpectExec("UPDATE workspaces SET status = 'degraded'.*status = 'online'").
|
||||
WithArgs("ws-wedged").
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
// RecordAndBroadcast for WORKSPACE_DEGRADED
|
||||
mock.ExpectExec("INSERT INTO structure_events").
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
|
||||
body := `{"workspace_id":"ws-wedged","error_rate":0.0,"sample_error":"` + wedgeMsg + `","active_tasks":0,"uptime_seconds":600,"runtime_state":"wedged"}`
|
||||
c.Request = httptest.NewRequest("POST", "/registry/heartbeat", bytes.NewBufferString(body))
|
||||
c.Request.Header.Set("Content-Type", "application/json")
|
||||
|
||||
handler.Heartbeat(c)
|
||||
|
||||
if w.Code != http.StatusOK {
|
||||
t.Errorf("expected status 200, got %d: %s", w.Code, w.Body.String())
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Errorf("unmet sqlmock expectations: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestHeartbeatHandler_DegradedRecoversOnlyAfterWedgeClears verifies that
|
||||
// the degraded → online recovery path requires BOTH error_rate < 0.1
|
||||
// AND runtime_state cleared. A workspace still reporting wedged stays
|
||||
// degraded even when error_rate happens to be 0 (no calls have been
|
||||
// recorded as errors yet — the wedge is captured as a runtime state,
|
||||
// not an error count).
|
||||
func TestHeartbeatHandler_DegradedRecoversOnlyAfterWedgeClears(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
setupTestRedis(t)
|
||||
broadcaster := newTestBroadcaster()
|
||||
handler := NewRegistryHandler(broadcaster)
|
||||
|
||||
mock.ExpectQuery("SELECT COALESCE\\(current_task").
|
||||
WithArgs("ws-still-wedged").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"current_task"}).AddRow(""))
|
||||
|
||||
mock.ExpectExec("UPDATE workspaces SET").
|
||||
WithArgs("ws-still-wedged", 0.0, "still broken", 0, 800, "").
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
// currentStatus = degraded
|
||||
mock.ExpectQuery("SELECT status FROM workspaces WHERE id =").
|
||||
WithArgs("ws-still-wedged").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status"}).AddRow("degraded"))
|
||||
|
||||
// No additional UPDATE expected — the recovery branch's
|
||||
// `runtime_state == ""` guard blocks the flip back to online.
|
||||
// (sqlmock fails the test if any unmocked Exec runs.)
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
|
||||
body := `{"workspace_id":"ws-still-wedged","error_rate":0.0,"sample_error":"still broken","active_tasks":0,"uptime_seconds":800,"runtime_state":"wedged"}`
|
||||
c.Request = httptest.NewRequest("POST", "/registry/heartbeat", bytes.NewBufferString(body))
|
||||
c.Request.Header.Set("Content-Type", "application/json")
|
||||
|
||||
handler.Heartbeat(c)
|
||||
|
||||
if w.Code != http.StatusOK {
|
||||
t.Errorf("expected status 200, got %d: %s", w.Code, w.Body.String())
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Errorf("unmet sqlmock expectations: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestHeartbeatHandler_DegradedToOnline_AfterWedgeClears verifies the
|
||||
// happy-path recovery: a workspace previously marked degraded is
|
||||
// post-restart, error_rate is back to 0, and runtime_state is empty
|
||||
// (the new process re-imported claude_sdk_executor with the flag
|
||||
// fresh). Status flips back to online and a WORKSPACE_ONLINE event
|
||||
// fires.
|
||||
func TestHeartbeatHandler_DegradedToOnline_AfterWedgeClears(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
setupTestRedis(t)
|
||||
broadcaster := newTestBroadcaster()
|
||||
handler := NewRegistryHandler(broadcaster)
|
||||
|
||||
mock.ExpectQuery("SELECT COALESCE\\(current_task").
|
||||
WithArgs("ws-recovered").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"current_task"}).AddRow(""))
|
||||
|
||||
mock.ExpectExec("UPDATE workspaces SET").
|
||||
WithArgs("ws-recovered", 0.0, "", 0, 30, "").
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
mock.ExpectQuery("SELECT status FROM workspaces WHERE id =").
|
||||
WithArgs("ws-recovered").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status"}).AddRow("degraded"))
|
||||
|
||||
// Recovery UPDATE fires (degraded → online).
|
||||
mock.ExpectExec("UPDATE workspaces SET status = 'online'").
|
||||
WithArgs("ws-recovered").
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
mock.ExpectExec("INSERT INTO structure_events").
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
|
||||
// runtime_state intentionally absent (== ""); error_rate = 0; this
|
||||
// is exactly what a freshly-restarted workspace's first heartbeat
|
||||
// looks like.
|
||||
body := `{"workspace_id":"ws-recovered","error_rate":0.0,"sample_error":"","active_tasks":0,"uptime_seconds":30}`
|
||||
c.Request = httptest.NewRequest("POST", "/registry/heartbeat", bytes.NewBufferString(body))
|
||||
c.Request.Header.Set("Content-Type", "application/json")
|
||||
|
||||
handler.Heartbeat(c)
|
||||
|
||||
if w.Code != http.StatusOK {
|
||||
t.Errorf("expected status 200, got %d: %s", w.Code, w.Body.String())
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Errorf("unmet sqlmock expectations: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// ==================== UpdateCard ====================
|
||||
|
||||
func TestUpdateCard_Success(t *testing.T) {
|
||||
|
||||
@ -466,3 +466,70 @@ func (h *SecretsHandler) GetModel(c *gin.Context) {
|
||||
|
||||
c.JSON(http.StatusOK, gin.H{"model": string(decrypted), "source": "workspace_secrets"})
|
||||
}
|
||||
|
||||
// SetModel handles PUT /workspaces/:id/model — writes the model slug
|
||||
// into workspace_secrets as MODEL_PROVIDER (the key GetModel reads).
|
||||
// For hermes, the value is a hermes-native slug like "minimax/MiniMax-M2.7";
|
||||
// for langgraph it's the legacy "provider:model" form. Either way it's just
|
||||
// an opaque string the runtime interprets on its next start.
|
||||
//
|
||||
// Empty string clears the override. Triggers auto-restart so the new
|
||||
// env (HERMES_DEFAULT_MODEL etc.) takes effect immediately — without
|
||||
// this the user clicks Save+Restart, the canvas PUT lands, but the
|
||||
// already-restarting container misses the window and boots with the
|
||||
// old value.
|
||||
func (h *SecretsHandler) SetModel(c *gin.Context) {
|
||||
workspaceID := c.Param("id")
|
||||
if !uuidRegex.MatchString(workspaceID) {
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": "invalid workspace ID"})
|
||||
return
|
||||
}
|
||||
ctx := c.Request.Context()
|
||||
|
||||
var body struct {
|
||||
Model string `json:"model"`
|
||||
}
|
||||
if err := c.ShouldBindJSON(&body); err != nil {
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": "invalid request body"})
|
||||
return
|
||||
}
|
||||
|
||||
if body.Model == "" {
|
||||
if _, err := db.DB.ExecContext(ctx,
|
||||
`DELETE FROM workspace_secrets WHERE workspace_id = $1 AND key = 'MODEL_PROVIDER'`,
|
||||
workspaceID); err != nil {
|
||||
log.Printf("SetModel delete error: %v", err)
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to clear model"})
|
||||
return
|
||||
}
|
||||
if h.restartFunc != nil {
|
||||
go h.restartFunc(workspaceID)
|
||||
}
|
||||
c.JSON(http.StatusOK, gin.H{"status": "cleared"})
|
||||
return
|
||||
}
|
||||
|
||||
encrypted, err := crypto.Encrypt([]byte(body.Model))
|
||||
if err != nil {
|
||||
log.Printf("SetModel encrypt error: %v", err)
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to encrypt model"})
|
||||
return
|
||||
}
|
||||
version := crypto.CurrentEncryptionVersion()
|
||||
_, err = db.DB.ExecContext(ctx, `
|
||||
INSERT INTO workspace_secrets (workspace_id, key, encrypted_value, encryption_version)
|
||||
VALUES ($1, 'MODEL_PROVIDER', $2, $3)
|
||||
ON CONFLICT (workspace_id, key) DO UPDATE
|
||||
SET encrypted_value = $2, encryption_version = $3, updated_at = now()
|
||||
`, workspaceID, encrypted, version)
|
||||
if err != nil {
|
||||
log.Printf("SetModel upsert error: %v", err)
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to save model"})
|
||||
return
|
||||
}
|
||||
|
||||
if h.restartFunc != nil {
|
||||
go h.restartFunc(workspaceID)
|
||||
}
|
||||
c.JSON(http.StatusOK, gin.H{"status": "saved", "model": body.Model})
|
||||
}
|
||||
|
||||
@ -6,6 +6,7 @@ import (
|
||||
"encoding/json"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
@ -535,6 +536,88 @@ func TestSecretsGetModel_DBError(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// ==================== SetModel ====================
|
||||
|
||||
func TestSecretsSetModel_Upsert(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
setupTestRedis(t)
|
||||
restartCalled := make(chan string, 1)
|
||||
handler := NewSecretsHandler(func(id string) { restartCalled <- id })
|
||||
|
||||
mock.ExpectExec(`INSERT INTO workspace_secrets`).
|
||||
WithArgs("00000000-0000-0000-0000-000000000001", sqlmock.AnyArg(), sqlmock.AnyArg()).
|
||||
WillReturnResult(sqlmock.NewResult(1, 1))
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
c.Params = gin.Params{{Key: "id", Value: "00000000-0000-0000-0000-000000000001"}}
|
||||
c.Request = httptest.NewRequest("PUT", "/workspaces/00000000-0000-0000-0000-000000000001/model",
|
||||
strings.NewReader(`{"model":"minimax/MiniMax-M2.7"}`))
|
||||
c.Request.Header.Set("Content-Type", "application/json")
|
||||
|
||||
handler.SetModel(c)
|
||||
|
||||
if w.Code != http.StatusOK {
|
||||
t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String())
|
||||
}
|
||||
select {
|
||||
case id := <-restartCalled:
|
||||
if id != "00000000-0000-0000-0000-000000000001" {
|
||||
t.Errorf("restart called with wrong id: %s", id)
|
||||
}
|
||||
case <-time.After(500 * time.Millisecond):
|
||||
t.Error("restart was not triggered")
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Errorf("unmet sqlmock expectations: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSecretsSetModel_EmptyClears(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
setupTestRedis(t)
|
||||
handler := NewSecretsHandler(func(string) {})
|
||||
|
||||
mock.ExpectExec(`DELETE FROM workspace_secrets`).
|
||||
WithArgs("00000000-0000-0000-0000-000000000002").
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
c.Params = gin.Params{{Key: "id", Value: "00000000-0000-0000-0000-000000000002"}}
|
||||
c.Request = httptest.NewRequest("PUT", "/workspaces/00000000-0000-0000-0000-000000000002/model",
|
||||
strings.NewReader(`{"model":""}`))
|
||||
c.Request.Header.Set("Content-Type", "application/json")
|
||||
|
||||
handler.SetModel(c)
|
||||
|
||||
if w.Code != http.StatusOK {
|
||||
t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String())
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Errorf("unmet sqlmock expectations: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSecretsSetModel_InvalidID(t *testing.T) {
|
||||
setupTestDB(t)
|
||||
setupTestRedis(t)
|
||||
handler := NewSecretsHandler(nil)
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
c.Params = gin.Params{{Key: "id", Value: "not-a-uuid"}}
|
||||
c.Request = httptest.NewRequest("PUT", "/workspaces/not-a-uuid/model",
|
||||
strings.NewReader(`{"model":"x"}`))
|
||||
c.Request.Header.Set("Content-Type", "application/json")
|
||||
|
||||
handler.SetModel(c)
|
||||
|
||||
if w.Code != http.StatusBadRequest {
|
||||
t.Errorf("expected 400 for bad UUID, got %d", w.Code)
|
||||
}
|
||||
}
|
||||
|
||||
// ==================== Values — Phase 30.2 decrypted pull ====================
|
||||
|
||||
// These tests target the secrets.Values handler (GET /workspaces/:id/secrets/values)
|
||||
|
||||
@ -5,6 +5,7 @@ package handlers
|
||||
// Delete (cascade + purge), and input validation helpers.
|
||||
|
||||
import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"errors"
|
||||
"fmt"
|
||||
@ -12,6 +13,7 @@ import (
|
||||
"net/http"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/wsauth"
|
||||
@ -390,44 +392,69 @@ func (h *WorkspaceHandler) Delete(c *gin.Context) {
|
||||
// Any concurrent heartbeat / registration / liveness-triggered restart
|
||||
// will see status='removed' and bail out early.
|
||||
//
|
||||
// #1843: Stop() errors used to be silently swallowed. On the CP/EC2
|
||||
// backend, Stop() calls the control plane's DELETE workspaces endpoint
|
||||
// to terminate the EC2; if that errors (CP transient 5xx, network),
|
||||
// the EC2 stays running with no DB row to track it — the
|
||||
// "14 orphan workspace EC2s on a 0-customer account" scenario.
|
||||
// Aggregate Stop failures and surface them as 500 so the client can
|
||||
// retry. The retry replays Stop with the same instance_id (still
|
||||
// readable from the row even after status='removed') — idempotent on
|
||||
// the CP side. RemoveVolume errors stay log-and-continue: those are
|
||||
// local cleanup of /var/data, not infra-leak class.
|
||||
// Combines two concerns:
|
||||
//
|
||||
// 1. Detach cleanup from the request ctx via WithoutCancel + a 30s
|
||||
// timeout, so when the canvas's `api.del` resolves on our 200
|
||||
// (and gin cancels c.Request.Context()), in-flight Docker
|
||||
// stop/remove calls don't get cancelled mid-operation. The
|
||||
// previous shape leaked containers every time the canvas hung
|
||||
// up promptly: Stop returned "context canceled", the container
|
||||
// stayed up, and the next RemoveVolume failed with
|
||||
// "volume in use". 30s is generous for Docker daemon round-
|
||||
// trips (typical: <2s) and bounds a stuck daemon.
|
||||
//
|
||||
// 2. #1843: aggregate Stop() failures into stopErrs so the
|
||||
// post-deletion block surfaces them as 500. On the CP/EC2
|
||||
// backend, Stop() calls control plane's DELETE endpoint to
|
||||
// terminate the EC2; if that errors (transient 5xx, network),
|
||||
// the EC2 stays running with no DB row to track it (the
|
||||
// "orphan EC2 on a 0-customer account" scenario). Loud-fail
|
||||
// instead of silent-leak — clients retry, Stop's instance_id
|
||||
// lookup is idempotent against status='removed'. RemoveVolume
|
||||
// errors stay log-and-continue (local cleanup, not infra-leak).
|
||||
cleanupCtx, cleanupCancel := context.WithTimeout(
|
||||
context.WithoutCancel(ctx), 30*time.Second)
|
||||
defer cleanupCancel()
|
||||
|
||||
var stopErrs []error
|
||||
stopAndRemove := func(wsID string) {
|
||||
if h.provisioner == nil {
|
||||
return
|
||||
}
|
||||
// Check Stop's error before attempting RemoveVolume — the
|
||||
// previous code discarded it and immediately tried the
|
||||
// volume remove, which always fails with "volume in use"
|
||||
// when Stop didn't actually kill the container. The orphan
|
||||
// sweeper (registry/orphan_sweeper.go) catches what we
|
||||
// skip here on the next reconcile pass.
|
||||
if err := h.provisioner.Stop(cleanupCtx, wsID); err != nil {
|
||||
log.Printf("Delete %s container stop failed: %v — leaving volume for orphan sweeper", wsID, err)
|
||||
stopErrs = append(stopErrs, fmt.Errorf("stop %s: %w", wsID, err))
|
||||
return
|
||||
}
|
||||
if err := h.provisioner.RemoveVolume(cleanupCtx, wsID); err != nil {
|
||||
log.Printf("Delete %s volume removal warning: %v", wsID, err)
|
||||
}
|
||||
}
|
||||
|
||||
for _, descID := range descendantIDs {
|
||||
if h.provisioner != nil {
|
||||
if err := h.provisioner.Stop(ctx, descID); err != nil {
|
||||
log.Printf("Delete descendant %s stop error: %v", descID, err)
|
||||
stopErrs = append(stopErrs, fmt.Errorf("stop descendant %s: %w", descID, err))
|
||||
}
|
||||
if err := h.provisioner.RemoveVolume(ctx, descID); err != nil {
|
||||
log.Printf("Delete descendant %s volume removal warning: %v", descID, err)
|
||||
}
|
||||
}
|
||||
db.ClearWorkspaceKeys(ctx, descID)
|
||||
h.broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_REMOVED", descID, map[string]interface{}{})
|
||||
stopAndRemove(descID)
|
||||
db.ClearWorkspaceKeys(cleanupCtx, descID)
|
||||
// Detach broadcaster ctx for the same reason as the cleanup
|
||||
// above — RecordAndBroadcast does an INSERT INTO
|
||||
// structure_events + Redis Publish. If the canvas hangs up,
|
||||
// a request-ctx-bound INSERT can be cancelled mid-write,
|
||||
// leaving other WS clients ignorant of the cascade. The DB
|
||||
// row is already 'removed' so it's recoverable, but the
|
||||
// inconsistency is avoidable.
|
||||
h.broadcaster.RecordAndBroadcast(cleanupCtx, "WORKSPACE_REMOVED", descID, map[string]interface{}{})
|
||||
}
|
||||
|
||||
// Stop + remove volume for the workspace itself
|
||||
if h.provisioner != nil {
|
||||
if err := h.provisioner.Stop(ctx, id); err != nil {
|
||||
log.Printf("Delete %s stop error: %v", id, err)
|
||||
stopErrs = append(stopErrs, fmt.Errorf("stop %s: %w", id, err))
|
||||
}
|
||||
if err := h.provisioner.RemoveVolume(ctx, id); err != nil {
|
||||
log.Printf("Delete %s volume removal warning: %v", id, err)
|
||||
}
|
||||
}
|
||||
db.ClearWorkspaceKeys(ctx, id)
|
||||
stopAndRemove(id)
|
||||
db.ClearWorkspaceKeys(cleanupCtx, id)
|
||||
|
||||
h.broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_REMOVED", id, map[string]interface{}{
|
||||
h.broadcaster.RecordAndBroadcast(cleanupCtx, "WORKSPACE_REMOVED", id, map[string]interface{}{
|
||||
"cascade_deleted": len(descendantIDs),
|
||||
})
|
||||
|
||||
|
||||
@ -176,20 +176,33 @@ func (h *WorkspaceHandler) provisionWorkspaceOpts(workspaceID, templatePath stri
|
||||
// Try to recover by applying the runtime-default template. payload.Runtime
|
||||
// is populated by the caller (Restart handler / Create handler) from the
|
||||
// DB row — same source of truth the apply_template=true path uses.
|
||||
// Try `<runtime>-default` first (historical naming), then plain
|
||||
// `<runtime>` (current naming in workspace-configs-templates/).
|
||||
// Only claude-code has the `-default` suffix; every other
|
||||
// runtime directory uses the bare name. Without the bare-name
|
||||
// fallback, recovery only worked for claude-code and blank
|
||||
// workspaces on every other runtime bricked on first start.
|
||||
recovered := false
|
||||
if payload.Runtime != "" {
|
||||
runtimeTemplate := filepath.Join(h.configsDir, payload.Runtime+"-default")
|
||||
if _, statErr := os.Stat(runtimeTemplate); statErr == nil {
|
||||
log.Printf("Provisioner: auto-recover for %s — config volume empty, applying %s-default template (#1858)",
|
||||
workspaceID, payload.Runtime)
|
||||
templatePath = runtimeTemplate
|
||||
// Rebuild cfg with the recovered template path so Start() sees it.
|
||||
cfg = h.buildProvisionerConfig(workspaceID, templatePath, configFiles, payload, envVars, pluginsPath, awarenessNamespace)
|
||||
cfg.ResetClaudeSession = resetClaudeSession
|
||||
recovered = true
|
||||
} else {
|
||||
log.Printf("Provisioner: auto-recover for %s — runtime template %s not found: %v",
|
||||
workspaceID, runtimeTemplate, statErr)
|
||||
candidates := []string{
|
||||
filepath.Join(h.configsDir, payload.Runtime+"-default"),
|
||||
filepath.Join(h.configsDir, payload.Runtime),
|
||||
}
|
||||
for _, runtimeTemplate := range candidates {
|
||||
if _, statErr := os.Stat(runtimeTemplate); statErr == nil {
|
||||
log.Printf("Provisioner: auto-recover for %s — config volume empty, applying %s template (#1858)",
|
||||
workspaceID, filepath.Base(runtimeTemplate))
|
||||
templatePath = runtimeTemplate
|
||||
// Rebuild cfg with the recovered template path so Start() sees it.
|
||||
cfg = h.buildProvisionerConfig(workspaceID, templatePath, configFiles, payload, envVars, pluginsPath, awarenessNamespace)
|
||||
cfg.ResetClaudeSession = resetClaudeSession
|
||||
recovered = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !recovered {
|
||||
log.Printf("Provisioner: auto-recover for %s — no template found under %s for runtime=%s",
|
||||
workspaceID, h.configsDir, payload.Runtime)
|
||||
}
|
||||
}
|
||||
|
||||
@ -616,6 +629,17 @@ func (h *WorkspaceHandler) ensureDefaultConfig(workspaceID string, payload model
|
||||
// payload.Model at boot), this is a no-op — no harm in the switch
|
||||
// being empty for those cases.
|
||||
func applyRuntimeModelEnv(envVars map[string]string, runtime, model string) {
|
||||
// Fall back to the MODEL_PROVIDER workspace secret when the caller
|
||||
// didn't pass one explicitly. This is the path that "Save+Restart"
|
||||
// hits — Restart builds its payload from the workspaces row (no model
|
||||
// column there) so payload.Model is always empty, but the user's
|
||||
// canvas selection was stored as MODEL_PROVIDER via PUT /model and
|
||||
// is already loaded into envVars here. Without this fallback hermes
|
||||
// silently boots with the template default and errors "No LLM
|
||||
// provider configured" even though the user picked a valid model.
|
||||
if model == "" {
|
||||
model = envVars["MODEL_PROVIDER"]
|
||||
}
|
||||
if model == "" {
|
||||
return
|
||||
}
|
||||
|
||||
@ -14,6 +14,30 @@ import (
|
||||
"github.com/gin-gonic/gin"
|
||||
)
|
||||
|
||||
// abortAuthLookupError is the single response shape for "the auth
|
||||
// middleware tried to validate a token but the underlying datastore
|
||||
// lookup failed." Returns 503 (not 500) because the right semantic
|
||||
// is "platform infrastructure unavailable, retry shortly" — not
|
||||
// "internal server error in our application logic". The structured
|
||||
// `code` lets the canvas distinguish this from generic 5xx and
|
||||
// surface a dedicated diagnostic ("Postgres/Redis unreachable —
|
||||
// check local services") instead of a confusing
|
||||
// `auth check failed` toast.
|
||||
//
|
||||
// `where` is included in the log line so the operator can grep
|
||||
// which call site fired (WorkspaceAuth vs AdminAuth, the
|
||||
// HasAnyLiveTokenGlobal probe vs orgtoken.Validate). The
|
||||
// user-visible body deliberately does NOT include the underlying
|
||||
// error string — that could leak DB hostnames, connection-string
|
||||
// fragments, or internal code paths.
|
||||
func abortAuthLookupError(c *gin.Context, where string, err error) {
|
||||
log.Printf("wsauth: %s: datastore lookup failed (returning 503): %v", where, err)
|
||||
c.AbortWithStatusJSON(http.StatusServiceUnavailable, gin.H{
|
||||
"error": "platform datastore unavailable — retry shortly",
|
||||
"code": "platform_unavailable",
|
||||
})
|
||||
}
|
||||
|
||||
// WorkspaceAuth returns a Gin middleware that enforces per-workspace bearer-token
|
||||
// authentication on /workspaces/:id/* sub-routes.
|
||||
//
|
||||
@ -73,8 +97,7 @@ func WorkspaceAuth(database *sql.DB) gin.HandlerFunc {
|
||||
c.Next()
|
||||
return
|
||||
} else if !errors.Is(err, orgtoken.ErrInvalidToken) {
|
||||
log.Printf("wsauth: WorkspaceAuth: orgtoken.Validate: %v", err)
|
||||
c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": "auth check failed"})
|
||||
abortAuthLookupError(c, "WorkspaceAuth: orgtoken.Validate", err)
|
||||
return
|
||||
}
|
||||
// Per-workspace token — narrowest scope, bound to this :id.
|
||||
@ -136,8 +159,7 @@ func AdminAuth(database *sql.DB) gin.HandlerFunc {
|
||||
|
||||
hasLive, err := wsauth.HasAnyLiveTokenGlobal(ctx, database)
|
||||
if err != nil {
|
||||
log.Printf("wsauth: AdminAuth: HasAnyLiveTokenGlobal failed: %v", err)
|
||||
c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": "auth check failed"})
|
||||
abortAuthLookupError(c, "AdminAuth: HasAnyLiveTokenGlobal", err)
|
||||
return
|
||||
}
|
||||
if !hasLive {
|
||||
@ -214,8 +236,7 @@ func AdminAuth(database *sql.DB) gin.HandlerFunc {
|
||||
return
|
||||
} else if !errors.Is(err, orgtoken.ErrInvalidToken) {
|
||||
// DB error — fail closed and log. Don't expose DB text.
|
||||
log.Printf("wsauth: AdminAuth: orgtoken.Validate: %v", err)
|
||||
c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": "auth check failed"})
|
||||
abortAuthLookupError(c, "AdminAuth: orgtoken.Validate", err)
|
||||
return
|
||||
}
|
||||
|
||||
|
||||
@ -2,8 +2,11 @@ package middleware
|
||||
|
||||
import (
|
||||
"crypto/sha256"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/DATA-DOG/go-sqlmock"
|
||||
@ -1699,3 +1702,57 @@ func TestAdminAuth_684_SpecificRoutes_NoBearer_Returns401(t *testing.T) {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// ==================== platform-unavailable classification ====================
|
||||
//
|
||||
// abortAuthLookupError replaces the prior opaque
|
||||
// `500 {"error":"auth check failed"}` with a 503 + structured code so
|
||||
// the canvas can render a dedicated diagnostic instead of a confusing
|
||||
// toast. Pin both the status code and the body shape against
|
||||
// regression — this is the contract the canvas's
|
||||
// PlatformUnavailableError classifier reads at api.ts.
|
||||
|
||||
func TestAdminAuth_DatastoreError_Returns503PlatformUnavailable(t *testing.T) {
|
||||
mockDB, mock, err := sqlmock.New()
|
||||
if err != nil {
|
||||
t.Fatalf("sqlmock.New: %v", err)
|
||||
}
|
||||
defer mockDB.Close()
|
||||
|
||||
// Simulate Postgres being down — HasAnyLiveTokenGlobal's COUNT
|
||||
// query returns a connection error.
|
||||
mock.ExpectQuery(hasAnyLiveTokenGlobalQuery).
|
||||
WillReturnError(errors.New("dial tcp [::1]:5432: connect: connection refused"))
|
||||
|
||||
r := gin.New()
|
||||
r.GET("/workspaces", AdminAuth(mockDB), func(c *gin.Context) {
|
||||
c.JSON(http.StatusOK, gin.H{"ok": true})
|
||||
})
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
req, _ := http.NewRequest(http.MethodGet, "/workspaces", nil)
|
||||
r.ServeHTTP(w, req)
|
||||
|
||||
if w.Code != http.StatusServiceUnavailable {
|
||||
t.Errorf("expected 503, got %d: %s", w.Code, w.Body.String())
|
||||
}
|
||||
var resp map[string]interface{}
|
||||
if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
|
||||
t.Fatalf("response body must be JSON: %v (body=%s)", err, w.Body.String())
|
||||
}
|
||||
if resp["code"] != "platform_unavailable" {
|
||||
t.Errorf("response code = %v, want platform_unavailable (canvas reads this for the dedicated diagnostic)", resp["code"])
|
||||
}
|
||||
if _, ok := resp["error"].(string); !ok {
|
||||
t.Errorf("response must include human-readable error string, got %v", resp["error"])
|
||||
}
|
||||
// The body must NOT leak the underlying DB error string —
|
||||
// production hostnames / connection-string fragments could land
|
||||
// in an error toast otherwise.
|
||||
if errStr, _ := resp["error"].(string); strings.Contains(errStr, "dial tcp") {
|
||||
t.Errorf("response leaks underlying DB error: %q", errStr)
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Errorf("unmet sqlmock expectations: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
@ -51,6 +51,19 @@ type HeartbeatPayload struct {
|
||||
// a previously-reported spend value. Any non-zero value is clamped to
|
||||
// [0, maxMonthlySpend] before the DB write. (#615)
|
||||
MonthlySpend int64 `json:"monthly_spend"`
|
||||
// RuntimeState is a self-reported runtime health flag separate from
|
||||
// "is the heartbeat task firing at all". The heartbeat task lives in
|
||||
// its own asyncio task and keeps pinging even when the agent runtime
|
||||
// is wedged (e.g. claude_agent_sdk's `Control request timeout:
|
||||
// initialize` leaves the SDK in a permanent error state for the
|
||||
// process lifetime). RuntimeState is how the workspace tells the
|
||||
// platform "I'm alive but my Claude runtime is broken — flip me to
|
||||
// degraded so the canvas can show a Restart hint."
|
||||
//
|
||||
// Empty string = healthy / no signal. The only currently-recognised
|
||||
// non-empty value is "wedged"; future values can extend this without
|
||||
// migration.
|
||||
RuntimeState string `json:"runtime_state"`
|
||||
}
|
||||
|
||||
type UpdateCardPayload struct {
|
||||
|
||||
@ -17,6 +17,7 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/docker/docker/api/types/container"
|
||||
"github.com/docker/docker/api/types/filters"
|
||||
dockerimage "github.com/docker/docker/api/types/image"
|
||||
"github.com/docker/docker/api/types/network"
|
||||
"github.com/docker/docker/api/types/volume"
|
||||
@ -143,6 +144,62 @@ func ContainerName(workspaceID string) string {
|
||||
return fmt.Sprintf("ws-%s", id)
|
||||
}
|
||||
|
||||
// containerNamePrefix is the shared prefix every workspace container
|
||||
// name carries (`ws-`). Used by ListWorkspaceContainerIDPrefixes for
|
||||
// the Docker name-filter, and by the orphan sweeper to recognise our
|
||||
// own containers vs. anything else on the host.
|
||||
const containerNamePrefix = "ws-"
|
||||
|
||||
// ListWorkspaceContainerIDPrefixes returns the 12-char workspace ID
|
||||
// prefixes of every running ws-* container the Docker daemon knows
|
||||
// about. The 12-char form matches ContainerName's truncation, so the
|
||||
// orphan sweeper can intersect this set against `SELECT
|
||||
// substring(id::text, 1, 12) FROM workspaces WHERE status = 'removed'`
|
||||
// without an extra round-trip per row.
|
||||
//
|
||||
// Returns an empty slice on any Docker error (sweeper treats that as
|
||||
// "skip this round" — better than a partial scan that misses leaks).
|
||||
func (p *Provisioner) ListWorkspaceContainerIDPrefixes(ctx context.Context) ([]string, error) {
|
||||
if p == nil || p.cli == nil {
|
||||
return nil, nil
|
||||
}
|
||||
containers, err := p.cli.ContainerList(ctx, container.ListOptions{
|
||||
// All=true catches stopped-but-not-removed containers too —
|
||||
// those still hold their volume references and would block
|
||||
// RemoveVolume just like a running container would.
|
||||
All: true,
|
||||
Filters: filters.NewArgs(filters.Arg("name", containerNamePrefix)),
|
||||
})
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
prefixes := make([]string, 0, len(containers))
|
||||
for _, c := range containers {
|
||||
// Container names from the API include a leading slash:
|
||||
// "/ws-abc123def456". Strip both the slash and our prefix
|
||||
// to recover the 12-char workspace ID.
|
||||
//
|
||||
// The Docker name filter is a SUBSTRING match (not a prefix
|
||||
// match), so something like "my-ws-thing" would also be
|
||||
// returned. The HasPrefix check below is load-bearing:
|
||||
// without it those false positives would flow into the
|
||||
// orphan sweeper's DB query as bogus LIKE patterns.
|
||||
for _, name := range c.Names {
|
||||
n := strings.TrimPrefix(name, "/")
|
||||
if !strings.HasPrefix(n, containerNamePrefix) {
|
||||
continue
|
||||
}
|
||||
id := strings.TrimPrefix(n, containerNamePrefix)
|
||||
if id == "" {
|
||||
continue
|
||||
}
|
||||
prefixes = append(prefixes, id)
|
||||
break // one name is enough; multiple aliases would dup
|
||||
}
|
||||
}
|
||||
return prefixes, nil
|
||||
}
|
||||
|
||||
// InternalURL returns the Docker-internal URL for a workspace container.
|
||||
func InternalURL(workspaceID string) string {
|
||||
return fmt.Sprintf("http://%s:%s", ContainerName(workspaceID), DefaultPort)
|
||||
@ -832,6 +889,14 @@ func (p *Provisioner) RemoveVolume(ctx context.Context, workspaceID string) erro
|
||||
// restart policy: if we ContainerStop first, the restart policy can
|
||||
// respawn the container before ContainerRemove runs, leaving a zombie
|
||||
// that re-registers via heartbeat after deletion.
|
||||
//
|
||||
// Returns nil on success AND on "container does not exist" (the cleanup
|
||||
// goal is achieved either way). Returns the underlying Docker error
|
||||
// only when the daemon actually failed to remove a live container —
|
||||
// callers that follow Stop with RemoveVolume MUST check the return
|
||||
// and skip volume removal on a real error, otherwise the volume
|
||||
// removal will fail with "volume in use" because the container is
|
||||
// still alive.
|
||||
func (p *Provisioner) Stop(ctx context.Context, workspaceID string) error {
|
||||
if p == nil || p.cli == nil {
|
||||
return ErrNoBackend
|
||||
@ -839,15 +904,23 @@ func (p *Provisioner) Stop(ctx context.Context, workspaceID string) error {
|
||||
name := ContainerName(workspaceID)
|
||||
|
||||
// Force-remove kills and removes in one atomic operation, bypassing
|
||||
// the restart policy entirely. If the container doesn't exist, the
|
||||
// error is harmless.
|
||||
if err := p.cli.ContainerRemove(ctx, name, container.RemoveOptions{Force: true}); err != nil {
|
||||
// Container may already be gone — log but don't fail.
|
||||
log.Printf("Provisioner: force-remove warning for %s: %v", name, err)
|
||||
// the restart policy entirely.
|
||||
err := p.cli.ContainerRemove(ctx, name, container.RemoveOptions{Force: true})
|
||||
if err == nil {
|
||||
log.Printf("Provisioner: stopped and removed container %s", name)
|
||||
return nil
|
||||
}
|
||||
|
||||
log.Printf("Provisioner: stopped and removed container %s", name)
|
||||
return nil
|
||||
if isContainerNotFound(err) {
|
||||
// Container was already gone — the post-condition we want is
|
||||
// satisfied. Don't surface as an error.
|
||||
log.Printf("Provisioner: container %s already gone (no-op)", name)
|
||||
return nil
|
||||
}
|
||||
// Real failure: daemon timeout, socket EOF, ctx cancellation, etc.
|
||||
// Caller (workspace_crud.stopAndRemove, orphan_sweeper.sweepOnce)
|
||||
// must propagate this so they can skip the follow-up RemoveVolume.
|
||||
log.Printf("Provisioner: force-remove failed for %s: %v", name, err)
|
||||
return fmt.Errorf("force-remove %s: %w", name, err)
|
||||
}
|
||||
|
||||
// IsRunning checks if a workspace container is currently running.
|
||||
|
||||
186
workspace-server/internal/registry/orphan_sweeper.go
Normal file
186
workspace-server/internal/registry/orphan_sweeper.go
Normal file
@ -0,0 +1,186 @@
|
||||
package registry
|
||||
|
||||
// orphan_sweeper.go — periodic reconcile pass that cleans up Docker
|
||||
// containers whose corresponding workspace row in Postgres has
|
||||
// status='removed'. Defence in depth on top of the inline cleanup
|
||||
// in handlers/workspace_crud.go.
|
||||
//
|
||||
// Why this exists: the inline cleanup is one-shot — if Docker hiccups
|
||||
// (daemon restart, host load, transient API error), the container
|
||||
// silently stays alive while the DB row is already 'removed'. Without
|
||||
// a reconcile pass those leaks accumulate forever. With one, every
|
||||
// missed cleanup heals on the next sweep.
|
||||
//
|
||||
// Cost: O(running containers) per cycle, not O(historical removed
|
||||
// rows). The Docker name filter trims the candidate set to ws-* only
|
||||
// (typically the same handful as ContainerList without filter on a
|
||||
// dev host); the DB lookup is one indexed query against the
|
||||
// idx_workspaces_status btree.
|
||||
|
||||
import (
|
||||
"context"
|
||||
"log"
|
||||
"time"
|
||||
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
|
||||
"github.com/lib/pq"
|
||||
)
|
||||
|
||||
// OrphanReaper is the dependency the sweeper takes from provisioner.
// Extracted as an interface so the sweeper is unit-testable without
// a real Docker daemon — matches the ContainerChecker pattern in
// healthsweep.go. *provisioner.Provisioner satisfies this naturally.
type OrphanReaper interface {
	// ListWorkspaceContainerIDPrefixes returns the workspace-ID
	// prefixes (container name with the ws- prefix trimmed) of the
	// currently running workspace containers.
	ListWorkspaceContainerIDPrefixes(ctx context.Context) ([]string, error)
	// Stop force-removes the container for workspaceID. The sweeper
	// relies on implementations treating "container not found" as
	// success and returning non-nil only for real daemon failures.
	Stop(ctx context.Context, workspaceID string) error
	// RemoveVolume deletes the workspace's volume. The sweeper only
	// calls it after Stop succeeded, to avoid "volume in use" errors.
	RemoveVolume(ctx context.Context, workspaceID string) error
}
|
||||
|
||||
// isLikelyWorkspaceID reports whether s looks like a workspace UUID
// prefix: non-empty and built exclusively from hex digits (either
// case) and `-`. Workspace IDs are full UUIDs and the container-name
// truncation keeps the hex prefix intact, so anything outside this
// alphabet is not one of ours and must be skipped. The check doubles
// as a SQL LIKE wildcard guard (rejects `_` and `%`).
func isLikelyWorkspaceID(s string) bool {
	if len(s) == 0 {
		return false
	}
	for i := 0; i < len(s); i++ {
		c := s[i]
		isDigit := c >= '0' && c <= '9'
		isLowerHex := c >= 'a' && c <= 'f'
		isUpperHex := c >= 'A' && c <= 'F'
		if !isDigit && !isLowerHex && !isUpperHex && c != '-' {
			return false
		}
	}
	return true
}
|
||||
|
||||
// OrphanSweepInterval is the cadence of the reconcile loop. 60s
// matches the heartbeat cadence (30s) × 2 — a single missed cleanup
// surfaces within ~90s end-to-end (canvas delete → next sweep tick →
// container gone). Faster cycles would just pay Docker API cost for
// no UX win; slower would let leaks linger long enough to compound
// CPU pressure on dev hosts.
const OrphanSweepInterval = 60 * time.Second

// orphanSweepDeadline bounds a single sweep cycle. A daemon at the
// edge of timing out shouldn't accumulate goroutines. 30s is generous
// for a dev host with dozens of containers and a busy daemon.
// Applied per cycle via context.WithTimeout in sweepOnce, so Docker
// list + DB query + the whole reap loop share one budget.
const orphanSweepDeadline = 30 * time.Second
|
||||
|
||||
// StartOrphanSweeper runs the reconcile loop until ctx is cancelled.
|
||||
// nil reaper makes the loop a no-op (matches handlers'
|
||||
// nil-provisioner-tolerant pattern — some test harnesses run without
|
||||
// Docker available).
|
||||
func StartOrphanSweeper(ctx context.Context, reaper OrphanReaper) {
|
||||
if reaper == nil {
|
||||
log.Println("Orphan sweeper: reaper is nil — sweeper disabled")
|
||||
return
|
||||
}
|
||||
log.Printf("Orphan sweeper started — reconciling every %s", OrphanSweepInterval)
|
||||
ticker := time.NewTicker(OrphanSweepInterval)
|
||||
defer ticker.Stop()
|
||||
// Run once immediately so a platform restart cleans up any
|
||||
// containers leaked while we were down — don't make the user
|
||||
// wait 60s for the first reconcile.
|
||||
sweepOnce(ctx, reaper)
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
log.Println("Orphan sweeper: shutdown")
|
||||
return
|
||||
case <-ticker.C:
|
||||
sweepOnce(ctx, reaper)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func sweepOnce(parent context.Context, reaper OrphanReaper) {
|
||||
ctx, cancel := context.WithTimeout(parent, orphanSweepDeadline)
|
||||
defer cancel()
|
||||
|
||||
prefixes, err := reaper.ListWorkspaceContainerIDPrefixes(ctx)
|
||||
if err != nil {
|
||||
log.Printf("Orphan sweeper: ListWorkspaceContainerIDPrefixes failed: %v — skipping cycle", err)
|
||||
return
|
||||
}
|
||||
if len(prefixes) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
// Resolve each prefix to a full workspace_id whose status is
|
||||
// 'removed'. The platform's workspace IDs are full UUIDs but
|
||||
// container names are truncated to 12 chars — an UPPER BOUND
|
||||
// of one match per prefix is guaranteed by the DB (UUID v4
|
||||
// collisions in the first 12 chars across active rows are
|
||||
// statistically negligible). Use a single IN-style query so
|
||||
// the cost is one round-trip regardless of leak count.
|
||||
//
|
||||
// Defence: drop any prefix whose contents fall outside the
|
||||
// hex-and-dash UUID alphabet. Workspace IDs are UUIDs, so
|
||||
// container names follow ws-<12 hex chars>. Anything else is
|
||||
// either a non-workspace container that slipped past the
|
||||
// substring-match Docker filter (workspace-runner, etc.) or a
|
||||
// malformed entry — neither should be turned into a LIKE
|
||||
// pattern. Also blocks SQL LIKE wildcards (`_` and `%`) from
|
||||
// reaching the query, even though Docker's container-name
|
||||
// validation would already have rejected them upstream.
|
||||
likes := make([]string, 0, len(prefixes))
|
||||
for _, p := range prefixes {
|
||||
if !isLikelyWorkspaceID(p) {
|
||||
continue
|
||||
}
|
||||
likes = append(likes, p+"%")
|
||||
}
|
||||
if len(likes) == 0 {
|
||||
return
|
||||
}
|
||||
rows, err := db.DB.QueryContext(ctx, `
|
||||
SELECT id::text
|
||||
FROM workspaces
|
||||
WHERE status = 'removed'
|
||||
AND id::text LIKE ANY($1::text[])
|
||||
`, pq.Array(likes))
|
||||
if err != nil {
|
||||
log.Printf("Orphan sweeper: DB query failed: %v — skipping cycle", err)
|
||||
return
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
var orphanIDs []string
|
||||
for rows.Next() {
|
||||
var id string
|
||||
if scanErr := rows.Scan(&id); scanErr != nil {
|
||||
log.Printf("Orphan sweeper: row scan failed: %v", scanErr)
|
||||
continue
|
||||
}
|
||||
orphanIDs = append(orphanIDs, id)
|
||||
}
|
||||
if err := rows.Err(); err != nil {
|
||||
log.Printf("Orphan sweeper: rows iteration failed: %v", err)
|
||||
return
|
||||
}
|
||||
|
||||
for _, id := range orphanIDs {
|
||||
log.Printf("Orphan sweeper: stopping leaked container for removed workspace %s", id)
|
||||
if stopErr := reaper.Stop(ctx, id); stopErr != nil {
|
||||
// Stop returns the wrapped Docker error (treating
|
||||
// "container not found" as nil-success via
|
||||
// isContainerNotFound), so a non-nil here means the
|
||||
// container is genuinely still alive — daemon timeout,
|
||||
// ctx cancellation, or a transient socket EOF.
|
||||
// Skip RemoveVolume so we don't fall into the same
|
||||
// Stop-failed-then-volume-in-use trap that motivated
|
||||
// this sweeper. The next cycle (60s out) retries Stop.
|
||||
log.Printf("Orphan sweeper: Stop failed for %s: %v — leaving volume for next cycle", id, stopErr)
|
||||
continue
|
||||
}
|
||||
if rmErr := reaper.RemoveVolume(ctx, id); rmErr != nil {
|
||||
log.Printf("Orphan sweeper: RemoveVolume warning for %s: %v", id, rmErr)
|
||||
}
|
||||
}
|
||||
}
|
||||
255
workspace-server/internal/registry/orphan_sweeper_test.go
Normal file
255
workspace-server/internal/registry/orphan_sweeper_test.go
Normal file
@ -0,0 +1,255 @@
|
||||
package registry
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/DATA-DOG/go-sqlmock"
|
||||
)
|
||||
|
||||
// fakeReaper is a hand-rolled OrphanReaper for the sweeper tests.
// Records every Stop / RemoveVolume call so tests can assert which
// workspace IDs got reconciled.
type fakeReaper struct {
	mu sync.Mutex // guards the recorded call slices below
	// Canned ListWorkspaceContainerIDPrefixes result.
	listResponse []string
	// When non-nil, ListWorkspaceContainerIDPrefixes fails with this.
	listErr error
	// Per-workspace-ID injected failures for Stop / RemoveVolume.
	stopErr      map[string]error
	removeVolErr map[string]error
	// Workspace IDs passed to Stop / RemoveVolume, in call order.
	stopCalls      []string
	removeVolCalls []string
}
|
||||
|
||||
func (f *fakeReaper) ListWorkspaceContainerIDPrefixes(_ context.Context) ([]string, error) {
|
||||
if f.listErr != nil {
|
||||
return nil, f.listErr
|
||||
}
|
||||
return f.listResponse, nil
|
||||
}
|
||||
|
||||
func (f *fakeReaper) Stop(_ context.Context, wsID string) error {
|
||||
f.mu.Lock()
|
||||
defer f.mu.Unlock()
|
||||
f.stopCalls = append(f.stopCalls, wsID)
|
||||
return f.stopErr[wsID]
|
||||
}
|
||||
|
||||
func (f *fakeReaper) RemoveVolume(_ context.Context, wsID string) error {
|
||||
f.mu.Lock()
|
||||
defer f.mu.Unlock()
|
||||
f.removeVolCalls = append(f.removeVolCalls, wsID)
|
||||
return f.removeVolErr[wsID]
|
||||
}
|
||||
|
||||
// TestSweepOnce_ReconcilesRunningRemovedRows — the core reconcile
// behavior: a container running for a workspace whose DB row is
// 'removed' gets stopped + volume removed.
func TestSweepOnce_ReconcilesRunningRemovedRows(t *testing.T) {
	mock := setupTestDB(t)
	setupTestRedis(t)

	// Docker reports two ws-* containers; one's row is 'removed'
	// (the leak), the other must NOT be reaped.
	//
	// NOTE(review): "xyz789ghi012" contains non-hex chars (g/h/i),
	// so it is actually dropped by isLikelyWorkspaceID BEFORE the
	// query builds — the DB never sees it. The test outcome is the
	// same either way, but a hex-only prefix would exercise the
	// "status is 'online' so the WHERE clause excludes it" path the
	// original comment describes — worth confirming the intent.
	reaper := &fakeReaper{
		listResponse: []string{"abc123def456", "xyz789ghi012"},
	}

	// The query asks for status='removed' rows whose id matches the
	// LIKE patterns built from the running container prefixes. Mock
	// returns only the leaked one as a UUID-shaped full id.
	mock.ExpectQuery(`SELECT id::text\s+FROM workspaces`).
		WillReturnRows(sqlmock.NewRows([]string{"id"}).
			AddRow("abc123def456-0000-0000-0000-000000000000"))

	sweepOnce(context.Background(), reaper)

	if len(reaper.stopCalls) != 1 || reaper.stopCalls[0] != "abc123def456-0000-0000-0000-000000000000" {
		t.Errorf("Stop calls = %v, want exactly the leaked id", reaper.stopCalls)
	}
	if len(reaper.removeVolCalls) != 1 || reaper.removeVolCalls[0] != "abc123def456-0000-0000-0000-000000000000" {
		t.Errorf("RemoveVolume calls = %v, want exactly the leaked id", reaper.removeVolCalls)
	}
	if err := mock.ExpectationsWereMet(); err != nil {
		t.Errorf("unmet sqlmock expectations: %v", err)
	}
}
|
||||
|
||||
// TestSweepOnce_NoRunningContainers — Docker returns nothing, sweeper
// short-circuits without a DB query (no leak possible if no
// containers exist).
func TestSweepOnce_NoRunningContainers(t *testing.T) {
	mock := setupTestDB(t)
	setupTestRedis(t)

	// nil list response models an empty Docker host.
	reaper := &fakeReaper{listResponse: nil}

	// No DB query expected — if sweepOnce makes one anyway the
	// sqlmock will fail "unexpected query".
	sweepOnce(context.Background(), reaper)

	if len(reaper.stopCalls) != 0 {
		t.Errorf("Stop should not fire when no containers exist; got %v", reaper.stopCalls)
	}
	if err := mock.ExpectationsWereMet(); err != nil {
		t.Errorf("unmet sqlmock expectations: %v", err)
	}
}
|
||||
|
||||
// TestSweepOnce_DockerListErrorSkipsCycle — a Docker daemon hiccup
// must not cascade into a DB query (otherwise we'd reap based on
// stale information). Skip the cycle, retry next tick.
func TestSweepOnce_DockerListErrorSkipsCycle(t *testing.T) {
	mock := setupTestDB(t)
	setupTestRedis(t)

	// Injected list failure; no sqlmock expectation is registered,
	// so any DB query would fail the test as "unexpected query".
	reaper := &fakeReaper{listErr: errors.New("daemon unreachable")}
	sweepOnce(context.Background(), reaper)

	if len(reaper.stopCalls) != 0 {
		t.Errorf("Stop must not fire when Docker list failed; got %v", reaper.stopCalls)
	}
	if err := mock.ExpectationsWereMet(); err != nil {
		t.Errorf("unmet sqlmock expectations: %v", err)
	}
}
|
||||
|
||||
// TestSweepOnce_StopFailureLeavesVolume — if Stop fails, RemoveVolume
|
||||
// MUST NOT fire. This is the same trap that motivated the sweeper:
|
||||
// removing a volume held by a still-running container always errors
|
||||
// with "volume in use", and we'd accumulate noise in the log without
|
||||
// actually fixing anything. Leave the volume for the next sweep
|
||||
// (which will retry Stop).
|
||||
func TestSweepOnce_StopFailureLeavesVolume(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
setupTestRedis(t)
|
||||
|
||||
reaper := &fakeReaper{
|
||||
listResponse: []string{"abc123def456"},
|
||||
stopErr: map[string]error{
|
||||
"abc123def456-0000-0000-0000-000000000000": errors.New("docker daemon timeout"),
|
||||
},
|
||||
}
|
||||
mock.ExpectQuery(`SELECT id::text\s+FROM workspaces`).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"id"}).
|
||||
AddRow("abc123def456-0000-0000-0000-000000000000"))
|
||||
|
||||
sweepOnce(context.Background(), reaper)
|
||||
|
||||
if len(reaper.stopCalls) != 1 {
|
||||
t.Errorf("Stop should have been attempted exactly once, got %v", reaper.stopCalls)
|
||||
}
|
||||
if len(reaper.removeVolCalls) != 0 {
|
||||
t.Errorf("RemoveVolume must not fire when Stop failed; got %v", reaper.removeVolCalls)
|
||||
}
|
||||
}
|
||||
|
||||
// TestSweepOnce_VolumeRemoveErrorIsNonFatal — RemoveVolume failures
|
||||
// are logged but don't prevent processing other orphans in the same
|
||||
// cycle. Belt + braces against a transient daemon issue mid-loop.
|
||||
func TestSweepOnce_VolumeRemoveErrorIsNonFatal(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
setupTestRedis(t)
|
||||
|
||||
reaper := &fakeReaper{
|
||||
listResponse: []string{"aaa111bbb222", "ccc333ddd444"},
|
||||
removeVolErr: map[string]error{
|
||||
"aaa111bbb222-0000-0000-0000-000000000000": errors.New("volume not found"),
|
||||
},
|
||||
}
|
||||
mock.ExpectQuery(`SELECT id::text\s+FROM workspaces`).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"id"}).
|
||||
AddRow("aaa111bbb222-0000-0000-0000-000000000000").
|
||||
AddRow("ccc333ddd444-0000-0000-0000-000000000000"))
|
||||
|
||||
sweepOnce(context.Background(), reaper)
|
||||
|
||||
if len(reaper.stopCalls) != 2 {
|
||||
t.Errorf("both orphans should have been Stopped; got %v", reaper.stopCalls)
|
||||
}
|
||||
if len(reaper.removeVolCalls) != 2 {
|
||||
t.Errorf("both orphans should have had RemoveVolume attempted; got %v", reaper.removeVolCalls)
|
||||
}
|
||||
}
|
||||
|
||||
// TestSweepOnce_FiltersNonWorkspacePrefixes — the Docker name filter
// is a SUBSTRING match so containers like "my-ws-thing" can slip
// through. The HasPrefix check in the provisioner trims those, but
// the in-sweeper isLikelyWorkspaceID guard is the second line of
// defence: anything outside the UUID alphabet (hex + dashes) is
// rejected before being turned into a SQL LIKE pattern. Locks in
// that no DB query fires when every prefix is filtered out.
func TestSweepOnce_FiltersNonWorkspacePrefixes(t *testing.T) {
	mock := setupTestDB(t)
	setupTestRedis(t)

	// Every entry below fails isLikelyWorkspaceID for the stated
	// reason, so the likes slice ends up empty.
	reaper := &fakeReaper{
		listResponse: []string{
			"not_a_uuid_at_all", // underscore not in UUID alphabet
			"contains%wildcard", // SQL LIKE wildcard — must not reach the query
			"contains_wildcard", // SQL LIKE single-char wildcard
			"", // empty
			"valid-but-non-workspace-name", // dash + lowercase letters that aren't hex
		},
	}

	// No DB query expected — every prefix is rejected before the
	// query builds, so we short-circuit. sqlmock fails on any
	// unexpected query.
	sweepOnce(context.Background(), reaper)

	if len(reaper.stopCalls) != 0 {
		t.Errorf("Stop must not fire when all prefixes filtered; got %v", reaper.stopCalls)
	}
	if err := mock.ExpectationsWereMet(); err != nil {
		t.Errorf("unmet sqlmock expectations: %v", err)
	}
}
|
||||
|
||||
// TestIsLikelyWorkspaceID — pin the alphabet directly. This is the
|
||||
// guard that prevents SQL LIKE wildcards (`%`, `_`) from reaching
|
||||
// the sweeper's query.
|
||||
func TestIsLikelyWorkspaceID(t *testing.T) {
|
||||
cases := []struct {
|
||||
in string
|
||||
want bool
|
||||
}{
|
||||
{"abc123def456", true},
|
||||
{"abcdef-1234-5678-90ab-cdef00112233", true},
|
||||
{"ABC123DEF456", true}, // uppercase hex still allowed
|
||||
{"", false},
|
||||
{"abc_123", false}, // underscore (SQL LIKE single-char wildcard)
|
||||
{"abc%123", false}, // percent (SQL LIKE multi-char wildcard)
|
||||
{"hello world", false}, // space, non-hex letters
|
||||
{"valid-but-not", false}, // 'l', 't', 'n' aren't hex
|
||||
{"abc 123", false},
|
||||
{".../escape", false},
|
||||
}
|
||||
for _, tc := range cases {
|
||||
got := isLikelyWorkspaceID(tc.in)
|
||||
if got != tc.want {
|
||||
t.Errorf("isLikelyWorkspaceID(%q) = %v, want %v", tc.in, got, tc.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestStartOrphanSweeper_NilReaperIsNoOp — tolerance for the
|
||||
// nil-provisioner path used by some test harnesses.
|
||||
func TestStartOrphanSweeper_NilReaperIsNoOp(t *testing.T) {
|
||||
// Should return immediately without panicking. Wrap in a goroutine
|
||||
// + done-channel so we can assert it didn't block.
|
||||
done := make(chan struct{})
|
||||
go func() {
|
||||
StartOrphanSweeper(context.Background(), nil)
|
||||
close(done)
|
||||
}()
|
||||
select {
|
||||
case <-done:
|
||||
// expected
|
||||
case <-time.After(500 * time.Millisecond):
|
||||
t.Fatal("StartOrphanSweeper(nil) blocked instead of returning immediately")
|
||||
}
|
||||
}
|
||||
@ -308,6 +308,7 @@ func Setup(hub *ws.Hub, broadcaster *events.Broadcaster, prov *provisioner.Provi
|
||||
wsAuth.PUT("/secrets", sech.Set)
|
||||
wsAuth.DELETE("/secrets/:key", sech.Delete)
|
||||
wsAuth.GET("/model", sech.GetModel)
|
||||
wsAuth.PUT("/model", sech.SetModel)
|
||||
|
||||
// Token usage metrics — cost transparency (#593).
|
||||
// WorkspaceAuth middleware (on wsAuth) binds the bearer to :id.
|
||||
@ -481,6 +482,14 @@ func Setup(hub *ws.Hub, broadcaster *events.Broadcaster, prov *provisioner.Provi
|
||||
wsAuth.PUT("/files/*path", tmplh.WriteFile)
|
||||
wsAuth.DELETE("/files/*path", tmplh.DeleteFile)
|
||||
|
||||
// Chat attachments — file upload (user → agent) and binary-safe
|
||||
// streaming download (agent → user). Namespaced under /chat/ so
|
||||
// the security model is obviously distinct from /files/* (which
|
||||
// handles workspace config/templates and has a different caller).
|
||||
chatfh := handlers.NewChatFilesHandler(tmplh)
|
||||
wsAuth.POST("/chat/uploads", chatfh.Upload)
|
||||
wsAuth.GET("/chat/download", chatfh.Download)
|
||||
|
||||
// Plugins
|
||||
pluginsDir := findPluginsDir(configsDir)
|
||||
// Runtime lookup lets the plugins handler filter the registry to plugins
|
||||
|
||||
@ -0,0 +1,24 @@
|
||||
-- 043_workspace_status_enum.down.sql
--
-- Reverse 043_workspace_status_enum.up.sql: convert workspaces.status
-- back to plain TEXT and drop the workspace_status enum type.

BEGIN;

-- Symmetric with the up migration: a rollback under the same load
-- that motivated the up-file's 5s lock_timeout would otherwise stall
-- writers indefinitely.
SET LOCAL lock_timeout = '5s';

-- Same two-step default dance as the up migration: the existing
-- DEFAULT is typed as the enum, so it must be dropped before the
-- column type can change, then re-set as plain text.
ALTER TABLE workspaces
ALTER COLUMN status DROP DEFAULT;

ALTER TABLE workspaces
ALTER COLUMN status TYPE TEXT USING status::TEXT;

ALTER TABLE workspaces
ALTER COLUMN status SET DEFAULT 'provisioning';

-- Safe only after the column no longer references the type; inside
-- the same transaction nothing can re-acquire a dependency on it.
DROP TYPE workspace_status;

COMMIT;
|
||||
84
workspace-server/migrations/043_workspace_status_enum.up.sql
Normal file
84
workspace-server/migrations/043_workspace_status_enum.up.sql
Normal file
@ -0,0 +1,84 @@
|
||||
-- 043_workspace_status_enum.up.sql
--
-- Convert workspaces.status from free-form TEXT to a real Postgres
-- ENUM type. The previous shape (TEXT DEFAULT 'provisioning' with no
-- CHECK constraint, set by 001_workspaces.sql) let any handler write
-- any string, including typos and stale values from older code paths.
-- Locking the value set forces every writer to use one of the agreed
-- states and lets us add a new state (`degraded`, used by the SDK
-- wedge detector landing in this same change) without losing type
-- safety on the column.
--
-- Value set covers every status the production codebase actually writes:
--
--   provisioning — workspace row exists, container is being created
--                  (initial INSERT default)
--   online       — heartbeat fresh + last response was successful
--   offline      — Redis liveness key expired (ws-side dead) or
--                  the proxy detected an unreachable upstream
--   degraded     — runtime is alive but reporting trouble (heartbeat
--                  error_rate >= 0.5, OR new in this change:
--                  workspace explicitly reported runtime_state="wedged")
--   failed       — provisioning never completed, or workspace marked
--                  itself failed via bundle import / runtime crash
--   removed      — soft-delete tombstone; the row stays so foreign-
--                  key references survive but no operations target it
--   paused       — operator-initiated suspend via workspace_restart's
--                  pause path (workspace_restart.go:406)
--   hibernated   — auto-suspended after idle threshold; container
--                  stopped but row preserved (workspace_restart.go:283,
--                  introduced by migration 029_workspace_hibernation)
--
-- Sweep of every `UPDATE workspaces SET status = 'X'` in the
-- workspace-server/internal/ tree (excluding tests) verified the
-- value set. Adding a new state in the future requires both updating
-- this enum (a separate `ALTER TYPE … ADD VALUE` migration) AND any
-- writers — the enum will reject unknown strings at insert/update
-- time, which is the exact failure mode this migration is meant to
-- give us.
--
-- Deployment: `ALTER TABLE … ALTER COLUMN TYPE` takes ACCESS
-- EXCLUSIVE on workspaces. A long-running SELECT against the table
-- will block the migration; the migration will then block every
-- writer behind it. `SET lock_timeout` aborts the migration in 5s
-- if it can't acquire the lock — preferable to stalling the whole
-- workspace fleet behind one slow query.

BEGIN;

-- LOCAL scopes the timeout to this transaction only.
SET LOCAL lock_timeout = '5s';

CREATE TYPE workspace_status AS ENUM (
    'provisioning',
    'online',
    'offline',
    'degraded',
    'failed',
    'removed',
    'paused',
    'hibernated'
);

-- The two-step ALTER (DROP DEFAULT then change type then SET DEFAULT)
-- is required because Postgres rejects an ALTER COLUMN TYPE on a
-- column that has a DEFAULT whose expression doesn't match the new
-- type. The intermediate moment with no default is fine — no INSERT
-- happens between these statements inside the same transaction.
--
-- The `USING status::workspace_status` cast is the type-conversion
-- expression Postgres needs when the source and target types aren't
-- assignment-compatible. If any existing row has a status value
-- outside the enum's set, this statement aborts the transaction and
-- the migration leaves the table untouched — that's the correct
-- behavior (we'd want to know about the rogue value before locking
-- the type).
ALTER TABLE workspaces
ALTER COLUMN status DROP DEFAULT;

ALTER TABLE workspaces
ALTER COLUMN status TYPE workspace_status USING status::workspace_status;

ALTER TABLE workspaces
ALTER COLUMN status SET DEFAULT 'provisioning'::workspace_status;

COMMIT;
|
||||
@ -10,7 +10,7 @@ import uuid
|
||||
|
||||
import httpx
|
||||
|
||||
from platform_auth import auth_headers
|
||||
from platform_auth import auth_headers, self_source_headers
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@ -56,9 +56,15 @@ async def send_a2a_message(target_url: str, message: str) -> str:
|
||||
timeout=httpx.Timeout(connect=30.0, read=300.0, write=30.0, pool=30.0)
|
||||
) as client:
|
||||
try:
|
||||
# self_source_headers() includes X-Workspace-ID so the
|
||||
# platform's a2a_receive logger records source_id =
|
||||
# WORKSPACE_ID. Otherwise peer-A2A messages — including
|
||||
# the case where target_url resolves to this workspace's
|
||||
# own /a2a — get logged with source_id=NULL and surface
|
||||
# in the recipient's My Chat tab as user-typed input.
|
||||
resp = await client.post(
|
||||
target_url,
|
||||
headers=auth_headers(),
|
||||
headers=self_source_headers(WORKSPACE_ID),
|
||||
json={
|
||||
"jsonrpc": "2.0",
|
||||
"id": str(uuid.uuid4()),
|
||||
@ -81,10 +87,40 @@ async def send_a2a_message(target_url: str, message: str) -> str:
|
||||
return f"{_A2A_ERROR_PREFIX}{text}"
|
||||
return text
|
||||
elif "error" in data:
|
||||
return f"{_A2A_ERROR_PREFIX}{data['error'].get('message', 'unknown')}"
|
||||
return str(data)
|
||||
err = data["error"]
|
||||
msg = (err.get("message") or "").strip()
|
||||
code = err.get("code")
|
||||
if msg and code is not None:
|
||||
detail = f"{msg} (code={code})"
|
||||
elif msg:
|
||||
detail = msg
|
||||
elif code is not None:
|
||||
detail = f"JSON-RPC error with no message (code={code})"
|
||||
else:
|
||||
detail = "JSON-RPC error with no message"
|
||||
return f"{_A2A_ERROR_PREFIX}{detail} [target={target_url}]"
|
||||
return f"{_A2A_ERROR_PREFIX}unexpected response shape (no result, no error): {str(data)[:200]} [target={target_url}]"
|
||||
except Exception as e:
|
||||
return f"{_A2A_ERROR_PREFIX}{e}"
|
||||
# Some httpx exceptions stringify to empty (RemoteProtocolError,
|
||||
# ConnectionReset variants) — the canvas would then render
|
||||
# "[A2A_ERROR] " with no detail and the operator has no signal
|
||||
# to act on. Always include the exception class name and the
|
||||
# target URL so the activity log + Agent Comms panel have
|
||||
# actionable information without a trip through container logs.
|
||||
msg = str(e).strip()
|
||||
type_name = type(e).__name__
|
||||
if not msg:
|
||||
detail = f"{type_name} (no message — likely connection reset or silent timeout)"
|
||||
elif msg.startswith(f"{type_name}:") or msg.startswith(f"{type_name} "):
|
||||
# Already prefixed with the type — don't double-prefix.
|
||||
# Prefix-anchored check (not substring) so a message that
|
||||
# happens to mention some OTHER class name mid-string
|
||||
# (e.g. "got OSError on read") doesn't suppress our own
|
||||
# type prefix and lose the diagnostic signal.
|
||||
detail = msg
|
||||
else:
|
||||
detail = f"{type_name}: {msg}"
|
||||
return f"{_A2A_ERROR_PREFIX}{detail} [target={target_url}]"
|
||||
|
||||
|
||||
async def get_peers() -> list[dict]:
|
||||
|
||||
@ -48,6 +48,10 @@ from shared_runtime import (
|
||||
brief_task,
|
||||
set_current_task,
|
||||
)
|
||||
from executor_helpers import (
|
||||
collect_outbound_files,
|
||||
extract_attached_files,
|
||||
)
|
||||
from builtin_tools.telemetry import (
|
||||
A2A_TASK_ID,
|
||||
GEN_AI_OPERATION_NAME,
|
||||
@ -211,6 +215,18 @@ class LangGraphA2AExecutor(AgentExecutor):
|
||||
3. Message(final_text) — terminal event
|
||||
"""
|
||||
user_input = extract_message_text(context)
|
||||
# Pull attached files from A2A message parts (kind: "file") and
|
||||
# append a manifest to the prompt so the agent knows they exist.
|
||||
# LangGraph tools (filesystem, bash, skills) can then open the
|
||||
# files by path — without this the agent silently ignores the
|
||||
# attachments and replies "I'm not sure what you're referring to".
|
||||
_attached_files = extract_attached_files(getattr(context, "message", None))
|
||||
if _attached_files:
|
||||
_manifest = "\n\nAttached files:\n" + "\n".join(
|
||||
f"- {f['name']} ({f['mime_type'] or 'unknown type'}) at {f['path']}"
|
||||
for f in _attached_files
|
||||
)
|
||||
user_input = (user_input + _manifest) if user_input else _manifest.lstrip()
|
||||
if not user_input:
|
||||
parts = getattr(getattr(context, "message", None), "parts", None)
|
||||
logger.warning("A2A execute: no text content in message parts: %s", parts)
|
||||
@ -415,7 +431,38 @@ class LangGraphA2AExecutor(AgentExecutor):
|
||||
# Non-streaming: ResultAggregator.consume_all() returns this
|
||||
# immediately as the response (a2a_client.py reads .parts[0].text).
|
||||
# Streaming: yielded as the last SSE event in the stream.
|
||||
msg = new_agent_text_message(final_text, task_id=task_id, context_id=context_id)
|
||||
#
|
||||
# If the reply mentions /workspace/... paths, stage each one
|
||||
# and emit as FileParts alongside the text so the canvas can
|
||||
# render a download button. Same contract the hermes executor
|
||||
# uses — every runtime going through this code path (langgraph,
|
||||
# deepagents, future ReAct variants) inherits it.
|
||||
_outbound = collect_outbound_files(final_text)
|
||||
if _outbound:
|
||||
# NOTE: do NOT re-import `Part` here. It is already imported
|
||||
# at module scope (line 42). A function-scope `from a2a.types
|
||||
# import ... Part ...` would mark `Part` as a local name
|
||||
# throughout this function under Python's scoping rules,
|
||||
# making the earlier `Part(text=text)` call (line ~358, inside
|
||||
# the astream_events loop) raise UnboundLocalError because
|
||||
# the local binding is not yet in scope at that point.
|
||||
from a2a.types import FilePart, FileWithUri, Message, Role, TextPart
|
||||
_parts: list[Part] = [Part(root=TextPart(text=final_text))] if final_text else []
|
||||
for f in _outbound:
|
||||
_parts.append(Part(root=FilePart(file=FileWithUri(
|
||||
uri="workspace:" + f["path"],
|
||||
name=f["name"],
|
||||
mimeType=f["mime_type"],
|
||||
))))
|
||||
msg = Message(
|
||||
messageId=uuid.uuid4().hex,
|
||||
role=Role.agent,
|
||||
parts=_parts,
|
||||
taskId=task_id,
|
||||
contextId=context_id,
|
||||
)
|
||||
else:
|
||||
msg = new_agent_text_message(final_text, task_id=task_id, context_id=context_id)
|
||||
# Attach tool_trace via metadata when supported. Guarded with
|
||||
# hasattr because some test mocks return a plain string here.
|
||||
if tool_trace and hasattr(msg, "metadata"):
|
||||
|
||||
@ -112,7 +112,7 @@ def _auth_headers_for_heartbeat() -> dict[str, str]:
|
||||
|
||||
async def report_activity(
|
||||
activity_type: str, target_id: str = "", summary: str = "", status: str = "ok",
|
||||
task_text: str = "", response_text: str = "",
|
||||
task_text: str = "", response_text: str = "", error_detail: str = "",
|
||||
):
|
||||
"""Report activity to the platform for live progress tracking."""
|
||||
try:
|
||||
@ -129,6 +129,13 @@ async def report_activity(
|
||||
payload["request_body"] = {"task": task_text}
|
||||
if response_text:
|
||||
payload["response_body"] = {"result": response_text}
|
||||
if error_detail:
|
||||
# error_detail is a top-level activity row column on the
|
||||
# platform (handlers/activity.go). Surfacing the cleaned
|
||||
# exception string here lets the Activity tab render a
|
||||
# red error chip + the cause without forcing the user
|
||||
# to scroll into the raw response_body JSON.
|
||||
payload["error_detail"] = error_detail
|
||||
await client.post(
|
||||
f"{PLATFORM_URL}/workspaces/{WORKSPACE_ID}/activity",
|
||||
json=payload,
|
||||
@ -178,11 +185,23 @@ async def tool_delegate_task(workspace_id: str, task: str) -> str:
|
||||
# Detect delegation failures — wrap them clearly so the calling agent
|
||||
# can decide to retry, use another peer, or handle the task itself.
|
||||
is_error = result.startswith(_A2A_ERROR_PREFIX)
|
||||
# Strip the sentinel prefix so error_detail is the human-readable
|
||||
# cause directly. The Activity tab's red error chip surfaces this
|
||||
# without the user having to scroll into the raw response JSON.
|
||||
#
|
||||
# Cap at 4096 chars before sending — the platform's
|
||||
# activity_logs.error_detail column is unbounded TEXT and a
|
||||
# malicious or buggy peer could otherwise stream an arbitrarily
|
||||
# large error message into the caller's activity log. 4096 is
|
||||
# comfortably above any real exception traceback we've seen and
|
||||
# well below an obvious-DoS threshold.
|
||||
error_detail = result[len(_A2A_ERROR_PREFIX):].strip()[:4096] if is_error else ""
|
||||
await report_activity(
|
||||
"a2a_receive", workspace_id,
|
||||
f"{peer_name} responded ({len(result)} chars)" if not is_error else f"{peer_name} failed",
|
||||
f"{peer_name} responded ({len(result)} chars)" if not is_error else f"{peer_name} failed: {error_detail[:120]}",
|
||||
task_text=task, response_text=result,
|
||||
status="error" if is_error else "ok",
|
||||
error_detail=error_detail,
|
||||
)
|
||||
if is_error:
|
||||
return (
|
||||
|
||||
@ -42,10 +42,15 @@ async def delegate_task(workspace_id: str, task: str) -> str:
|
||||
except Exception as e:
|
||||
return f"Error discovering workspace: {e}"
|
||||
|
||||
# Send A2A message
|
||||
# Send A2A message. X-Workspace-ID identifies us as the source —
|
||||
# without it the platform's a2a_receive logger writes
|
||||
# source_id=NULL and the recipient's My Chat tab renders the
|
||||
# delegation as if a human user typed it. Same hazard fixed
|
||||
# in heartbeat.py / a2a_client.py / main.py initial+idle flows.
|
||||
try:
|
||||
a2a_resp = await client.post(
|
||||
target_url,
|
||||
headers={"X-Workspace-ID": WORKSPACE_ID},
|
||||
json={
|
||||
"jsonrpc": "2.0",
|
||||
"id": str(uuid.uuid4()),
|
||||
|
||||
@ -29,7 +29,7 @@ import asyncio
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
from collections.abc import AsyncIterator
|
||||
from collections.abc import AsyncIterator, Callable
|
||||
from dataclasses import dataclass
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
@ -47,7 +47,9 @@ from executor_helpers import (
|
||||
WORKSPACE_MOUNT,
|
||||
auto_push_hook,
|
||||
brief_summary,
|
||||
collect_outbound_files,
|
||||
commit_memory,
|
||||
extract_attached_files,
|
||||
extract_message_text,
|
||||
get_a2a_instructions,
|
||||
get_hma_instructions,
|
||||
@ -85,6 +87,180 @@ _RETRYABLE_PATTERNS = (
|
||||
"try again",
|
||||
)
|
||||
|
||||
# Module-level SDK-wedge flag. When claude_agent_sdk's `query.initialize()`
|
||||
# raises `Control request timeout: initialize`, the SDK's internal client-
|
||||
# process state is corrupted for the rest of the Python process — every
|
||||
# subsequent `_run_query()` call hits the same wedge and re-throws. The
|
||||
# executor itself can't auto-recover (the underlying CLI subprocess and
|
||||
# its read pipe are in an unrecoverable state); only a workspace restart
|
||||
# clears it.
|
||||
#
|
||||
# The heartbeat task reads these helpers and reports
|
||||
# `runtime_state="wedged"` to the platform, which flips the workspace to
|
||||
# `degraded` so the canvas surfaces a Restart hint instead of leaving
|
||||
# the user staring at a green dot while every chat hangs.
|
||||
#
|
||||
# Module scope (not instance scope) is deliberate: the wedge is a
|
||||
# property of the Python process, not the executor. A future per-org
|
||||
# multi-executor design could move this to a shared registry, but with
|
||||
# one executor per workspace process today the simplest lock-free
|
||||
# read+write fits.
|
||||
_sdk_wedged_reason: str | None = None
|
||||
|
||||
|
||||
def is_wedged() -> bool:
|
||||
"""True if the Claude SDK has hit a non-recoverable init wedge in
|
||||
this process. Sticky until process restart."""
|
||||
return _sdk_wedged_reason is not None
|
||||
|
||||
|
||||
def wedge_reason() -> str:
|
||||
"""Human-readable description of the wedge cause, or empty string
|
||||
when not wedged. Surfaced to the canvas via heartbeat sample_error."""
|
||||
return _sdk_wedged_reason or ""
|
||||
|
||||
|
||||
def _mark_sdk_wedged(reason: str) -> None:
|
||||
"""Internal — flag the SDK as wedged. Only the first call wins
|
||||
(subsequent identical wedges shouldn't overwrite a more specific
|
||||
reason). Tests use `_reset_sdk_wedge_for_test()` to clear."""
|
||||
global _sdk_wedged_reason
|
||||
if _sdk_wedged_reason is None:
|
||||
_sdk_wedged_reason = reason
|
||||
logger.error("SDK wedge detected: %s — workspace will report degraded until a successful query clears it", reason)
|
||||
|
||||
|
||||
def _clear_sdk_wedge_on_success() -> None:
|
||||
"""Auto-recovery — called from _run_query after a successful
|
||||
completion. The original wedge could be transient (a single network
|
||||
blip during the SDK's first-message handshake), and a sticky-only
|
||||
flag would lock the workspace into degraded forever even after the
|
||||
SDK started working again. Clearing on observed success means the
|
||||
next heartbeat after a working query reports `runtime_state` empty
|
||||
and the platform flips status back to online.
|
||||
|
||||
No-op when not wedged (the common case)."""
|
||||
global _sdk_wedged_reason
|
||||
if _sdk_wedged_reason is not None:
|
||||
logger.info("SDK wedge cleared after successful query — workspace will recover to online on next heartbeat")
|
||||
_sdk_wedged_reason = None
|
||||
|
||||
|
||||
def _reset_sdk_wedge_for_test() -> None:
|
||||
"""Test-only escape hatch. Production code clears the wedge via
|
||||
`_clear_sdk_wedge_on_success` when a query succeeds; this helper
|
||||
is for unit tests that need to reset between cases."""
|
||||
global _sdk_wedged_reason
|
||||
_sdk_wedged_reason = None
|
||||
|
||||
|
||||
# Per-tool-use summarizers. Reads the most-useful argument from each
|
||||
# tool's input dict so the canvas progress feed shows
|
||||
# `🛠 Read /tmp/foo` instead of the bare tool name. Anything not in the
|
||||
# table falls through to a generic "🛠 <tool>(…)" line. Order keys by
|
||||
# tool frequency so a future contributor can see the high-traffic
|
||||
# tools first.
|
||||
_TOOL_USE_SUMMARIZERS: dict[str, Callable[[dict], str]] = {
|
||||
"Read": lambda i: f"📄 Read {i.get('file_path', '?')}",
|
||||
"Write": lambda i: f"✍️ Write {i.get('file_path', '?')}",
|
||||
"Edit": lambda i: f"✏️ Edit {i.get('file_path', '?')}",
|
||||
"Bash": lambda i: f"⚡ Bash: {(i.get('command') or '')[:80]}",
|
||||
"Glob": lambda i: f"🔍 Glob {i.get('pattern', '?')}",
|
||||
"Grep": lambda i: f"🔍 Grep {i.get('pattern', '?')}",
|
||||
"WebFetch": lambda i: f"🌐 WebFetch {i.get('url', '?')}",
|
||||
"WebSearch": lambda i: f"🌐 WebSearch {i.get('query', '?')}",
|
||||
"Task": lambda i: f"🤖 Task: {(i.get('description') or '')[:60]}",
|
||||
"TodoWrite": lambda _i: "📝 TodoWrite",
|
||||
}
|
||||
|
||||
|
||||
def _summarize_tool_use(tool_name: str, tool_input: dict) -> str:
|
||||
summarizer = _TOOL_USE_SUMMARIZERS.get(tool_name)
|
||||
if summarizer:
|
||||
try:
|
||||
return summarizer(tool_input or {})[:200]
|
||||
except Exception:
|
||||
pass
|
||||
# Generic fallback. Truncated so a tool with a giant input dict
|
||||
# doesn't write a 10kB activity row per call.
|
||||
return f"🛠 {tool_name}(…)"[:200]
|
||||
|
||||
|
||||
async def _report_tool_use(block: Any) -> None:
|
||||
"""Fire-and-forget agent_log activity row per tool the SDK invoked,
|
||||
so the canvas's MyChat live-progress feed can render each step
|
||||
Claude is doing instead of staring at a single spinner.
|
||||
|
||||
Posts directly to /workspaces/:id/activity rather than through
|
||||
a2a_tools.report_activity — that helper also pushes a current_task
|
||||
heartbeat which would duplicate as a TASK_UPDATED line in the
|
||||
chat feed. The workspace card's current_task is already set
|
||||
once per turn by the executor's set_current_task(brief_summary)
|
||||
call, so the per-tool telemetry stays a chat-only signal.
|
||||
|
||||
Best-effort — any failure (network blip, platform unreachable, the
|
||||
block didn't have the attrs we expected) is swallowed silently.
|
||||
The tool will still execute regardless; only the progress
|
||||
telemetry is lost. Deliberately does NOT raise — a malformed
|
||||
block must not abort the message-stream iteration in
|
||||
`_run_query`.
|
||||
"""
|
||||
try:
|
||||
# Lazy imports to keep this helper non-essential — the
|
||||
# executor must still run when the workspace's network/auth
|
||||
# plumbing isn't fully set up (e.g. unit tests).
|
||||
import httpx
|
||||
from a2a_client import PLATFORM_URL, WORKSPACE_ID
|
||||
from platform_auth import auth_headers
|
||||
except Exception:
|
||||
return
|
||||
try:
|
||||
tool_name = getattr(block, "name", "") or ""
|
||||
tool_input = getattr(block, "input", {}) or {}
|
||||
if not tool_name:
|
||||
return
|
||||
summary = _summarize_tool_use(tool_name, tool_input)
|
||||
# 5s budget — long enough to absorb a single platform GC
|
||||
# pause, short enough that a wedged platform doesn't slow
|
||||
# the tool-iteration cadence beyond noticeable.
|
||||
async with httpx.AsyncClient(timeout=5.0) as client:
|
||||
await client.post(
|
||||
f"{PLATFORM_URL}/workspaces/{WORKSPACE_ID}/activity",
|
||||
json={
|
||||
"activity_type": "agent_log",
|
||||
"source_id": WORKSPACE_ID,
|
||||
# target_id == source for self-actions. Matches the
|
||||
# convention other self-logged activity rows use
|
||||
# (a2a_receive when the workspace logs its own
|
||||
# outbound reply) so DB consumers joining on
|
||||
# target_id see a well-defined value.
|
||||
"target_id": WORKSPACE_ID,
|
||||
"summary": summary,
|
||||
"status": "ok",
|
||||
"method": tool_name,
|
||||
},
|
||||
headers=auth_headers(),
|
||||
)
|
||||
except Exception:
|
||||
# Telemetry failures must not break the conversation.
|
||||
return
|
||||
|
||||
|
||||
# Substring patterns that classify an exception as the specific
|
||||
# claude_agent_sdk init-timeout wedge (vs. a rate-limit, transient
|
||||
# subprocess crash, etc.). Match is case-insensitive on the formatted
|
||||
# error string. Adding a new pattern here MUST come with a test in
|
||||
# tests/test_claude_sdk_executor.py — false-positives lock the
|
||||
# workspace into degraded until the next successful query clears it.
|
||||
#
|
||||
# `:initialize` suffix-anchored — the SDK can theoretically time out
|
||||
# on later control messages (in-flight tool callbacks), but those
|
||||
# don't leave the SDK in the unrecoverable post-init state we're
|
||||
# trying to detect. Limit the pattern to the specific wedge.
|
||||
_WEDGE_ERROR_PATTERNS = (
|
||||
"control request timeout: initialize",
|
||||
)
|
||||
|
||||
|
||||
_SWALLOWED_STDERR_MARKER = "Check stderr output for details"
|
||||
|
||||
@ -344,6 +520,14 @@ class ClaudeSDKExecutor(AgentExecutor):
|
||||
for block in message.content:
|
||||
if isinstance(block, sdk.TextBlock):
|
||||
assistant_chunks.append(block.text)
|
||||
else:
|
||||
# ToolUseBlock / ServerToolUseBlock are present
|
||||
# on the real SDK but not on the conftest stub —
|
||||
# check by class name to avoid an isinstance()
|
||||
# against a class the stub doesn't define.
|
||||
cls = type(block).__name__
|
||||
if cls in ("ToolUseBlock", "ServerToolUseBlock"):
|
||||
await _report_tool_use(block)
|
||||
elif isinstance(message, sdk.ResultMessage):
|
||||
sid = getattr(message, "session_id", None)
|
||||
if sid:
|
||||
@ -352,6 +536,20 @@ class ClaudeSDKExecutor(AgentExecutor):
|
||||
finally:
|
||||
self._active_stream = None
|
||||
text = result_text if result_text is not None else "".join(assistant_chunks)
|
||||
# Auto-recover the wedge flag — if a previous query() left this
|
||||
# process in `_sdk_wedged` and THIS query just completed
|
||||
# cleanly, the SDK clearly works again. Clear so the next
|
||||
# heartbeat reports runtime_state empty and the platform flips
|
||||
# status degraded → online without a manual restart.
|
||||
#
|
||||
# Gate on actual content from the stream so a degenerate
|
||||
# "iterator returned without raising but emitted nothing"
|
||||
# case (possible from a partial stream or a stub SDK) doesn't
|
||||
# falsely advertise recovery. A real successful query yields
|
||||
# at least a ResultMessage (sets result_text) or one
|
||||
# AssistantMessage TextBlock (populates assistant_chunks).
|
||||
if result_text is not None or assistant_chunks:
|
||||
_clear_sdk_wedge_on_success()
|
||||
return QueryResult(text=text, session_id=session_id)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
@ -365,6 +563,18 @@ class ClaudeSDKExecutor(AgentExecutor):
|
||||
workspace queue rather than racing on `_session_id` / `_active_stream`.
|
||||
"""
|
||||
user_input = extract_message_text(context.message)
|
||||
# Surface attached files to claude-code via a manifest in the prompt.
|
||||
# Claude Code reads files through its own Read/Glob tools by path —
|
||||
# as long as the prompt names the path, the CLI will open them on
|
||||
# demand. Same contract every platform runtime uses so the UX is
|
||||
# identical across hermes / langgraph / claude-code.
|
||||
attached = extract_attached_files(context.message)
|
||||
if attached:
|
||||
manifest = "\n\nAttached files:\n" + "\n".join(
|
||||
f"- {f['name']} ({f['mime_type'] or 'unknown type'}) at {f['path']}"
|
||||
for f in attached
|
||||
)
|
||||
user_input = (user_input + manifest) if user_input else manifest.lstrip()
|
||||
if not user_input:
|
||||
await event_queue.enqueue_event(new_agent_text_message(_NO_TEXT_MSG))
|
||||
return
|
||||
@ -375,7 +585,26 @@ class ClaudeSDKExecutor(AgentExecutor):
|
||||
# Enqueue outside the lock so the next queued turn can start
|
||||
# preparing its prompt while this turn's response ships. Event
|
||||
# ordering is preserved per-queue by the A2A server, so no races.
|
||||
await event_queue.enqueue_event(new_agent_text_message(response_text))
|
||||
# If the response mentions /workspace/... files, stage each and
|
||||
# emit FileParts alongside the text so the canvas can download.
|
||||
outbound = collect_outbound_files(response_text)
|
||||
if outbound:
|
||||
from a2a.types import FilePart, FileWithUri, Message, Part, Role, TextPart
|
||||
import uuid as _uuid
|
||||
parts: list = [Part(root=TextPart(text=response_text))] if response_text else []
|
||||
for f in outbound:
|
||||
parts.append(Part(root=FilePart(file=FileWithUri(
|
||||
uri="workspace:" + f["path"],
|
||||
name=f["name"],
|
||||
mimeType=f["mime_type"],
|
||||
))))
|
||||
await event_queue.enqueue_event(Message(
|
||||
messageId=_uuid.uuid4().hex,
|
||||
role=Role.agent,
|
||||
parts=parts,
|
||||
))
|
||||
else:
|
||||
await event_queue.enqueue_event(new_agent_text_message(response_text))
|
||||
|
||||
@staticmethod
|
||||
def _is_retryable(exc: BaseException) -> bool:
|
||||
@ -473,6 +702,19 @@ class ClaudeSDKExecutor(AgentExecutor):
|
||||
# subprocess died.
|
||||
logger.error("SDK agent error [claude-code]: %s", formatted)
|
||||
logger.exception("SDK agent error [claude-code] — full traceback follows")
|
||||
# Detect the specific claude_agent_sdk init-wedge case
|
||||
# so the heartbeat task can flip the workspace to
|
||||
# `degraded`. Match on the lowercased formatted error;
|
||||
# `formatted` is whatever _format_process_error built,
|
||||
# which already includes both the message and the
|
||||
# exception class name.
|
||||
formatted_lc = formatted.lower()
|
||||
for pat in _WEDGE_ERROR_PATTERNS:
|
||||
if pat in formatted_lc:
|
||||
_mark_sdk_wedged(
|
||||
f"claude_agent_sdk wedge: {formatted[:200]} — restart workspace to recover"
|
||||
)
|
||||
break
|
||||
response_text = sanitize_agent_error(exc)
|
||||
break
|
||||
finally:
|
||||
|
||||
@ -10,16 +10,22 @@ Provides:
|
||||
- Brief task summary extraction (markdown-aware)
|
||||
- Error message sanitization (exception classes and subprocess categories)
|
||||
- Shared workspace path constants and the MCP server path resolver
|
||||
- Attached-file extraction and outbound-file staging (platform-wide chat
|
||||
attachments — every runtime routes through these helpers so the
|
||||
drag-dropped image / returned report experience is identical)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import base64
|
||||
import json
|
||||
import logging
|
||||
import mimetypes
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import uuid as _uuid
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
@ -582,3 +588,276 @@ async def auto_push_hook(cwd: str | None = None) -> None:
|
||||
await asyncio.to_thread(_auto_push_and_pr_sync, cwd)
|
||||
except Exception:
|
||||
logger.exception("auto_push_hook: failed (non-fatal)")
|
||||
|
||||
|
||||
# ========================================================================
|
||||
# Chat attachments — platform-level support for drag-drop uploads and
|
||||
# agent-returned files. Every runtime executor routes inbound file parts
|
||||
# through ``extract_attached_files`` + ``build_user_content_with_files``
|
||||
# and post-processes replies through ``collect_outbound_files`` so a file
|
||||
# attached in the canvas shows up correctly across hermes, claude-code,
|
||||
# langgraph, CLI runtimes, etc. Living here (not in any one executor)
|
||||
# keeps the attachment contract in one place — match canvas/ChatTab.tsx
|
||||
# and workspace-server/internal/handlers/chat_files.go, and every runtime
|
||||
# benefits at once.
|
||||
# ========================================================================
|
||||
|
||||
# Matches CHAT_UPLOAD_DIR in workspace-server/internal/handlers/chat_files.go.
|
||||
# The canvas uploads files here; outbound files get staged here so the
|
||||
# download endpoint (which whitelists this directory) can serve them.
|
||||
CHAT_UPLOADS_DIR = f"{WORKSPACE_MOUNT}/.molecule/chat-uploads"
|
||||
|
||||
|
||||
def ensure_workspace_writable() -> None:
|
||||
"""Make /workspace (and the chat-uploads dir) writable by whoever the
|
||||
agent will run as.
|
||||
|
||||
Docker's default for a new named volume is root-owned 755 — that
|
||||
bricks the agent→user "write a file, hand it to the user" flow for
|
||||
every template whose agent runs under a non-root user (hermes uses
|
||||
`agent`, most others use some dedicated UID too). Each Dockerfile
|
||||
solving this individually was the anti-pattern; this helper belongs
|
||||
to the platform so every runtime picks up the fix by calling into
|
||||
``molecule_runtime`` during boot.
|
||||
|
||||
Runs best-effort: if molecule-runtime itself started as non-root
|
||||
(rare, but possible in some CP configurations), the chmod silently
|
||||
no-ops — the template's own start.sh is expected to have already
|
||||
handled perms in that case. We prefer silent degradation to a hard
|
||||
boot failure because misconfigured perms are recoverable (user gets
|
||||
a clear "permission denied" from the agent) but an uncatchable
|
||||
exception here would wedge the whole workspace in `provisioning`.
|
||||
"""
|
||||
# 777 matches the intent: one container, one tenant, anyone in the
|
||||
# container can read/write workspace files. Cross-tenant isolation
|
||||
# happens at the Docker boundary, not inside the volume.
|
||||
for path in (WORKSPACE_MOUNT, CHAT_UPLOADS_DIR):
|
||||
try:
|
||||
os.makedirs(path, exist_ok=True)
|
||||
os.chmod(path, 0o777)
|
||||
except PermissionError:
|
||||
logger.info(
|
||||
"ensure_workspace_writable: lacking root (non-fatal) for %s", path
|
||||
)
|
||||
except OSError as exc:
|
||||
logger.warning(
|
||||
"ensure_workspace_writable: %s for %s", exc, path
|
||||
)
|
||||
|
||||
# Cap image inlining so a 25MB PNG doesn't blow past provider context
|
||||
# limits. Images larger than this fall back to a path mention only —
|
||||
# the agent can still read them via file_read / bash tools.
|
||||
MAX_INLINE_ATTACHMENT_BYTES = 8 * 1024 * 1024
|
||||
|
||||
# Absolute /workspace/... paths the agent may mention in its reply.
|
||||
# Leading boundary prevents matching the middle of URLs like
|
||||
# https://example.com/workspace/foo while allowing markdown emphasis
|
||||
# wrappers (**, *, _, `, (, [) so "**/workspace/x.pdf**" still matches.
|
||||
# Trailing '.' is stripped post-capture (see collect_outbound_files).
|
||||
_WORKSPACE_PATH_RE = re.compile(
|
||||
r"(?:^|[\s`\"'*_(\[])(/workspace/[A-Za-z0-9_./\-]+)"
|
||||
)
|
||||
_UNSAFE_NAME_RE = re.compile(r"[^A-Za-z0-9._\-]")
|
||||
|
||||
|
||||
def resolve_attachment_uri(uri: str) -> str | None:
|
||||
"""Resolve a canvas-issued attachment URI to an in-container path.
|
||||
|
||||
Accepted shapes (matches canvas uploads.ts + chat_files.go):
|
||||
- ``workspace:/workspace/.molecule/chat-uploads/<name>`` (canonical)
|
||||
- ``file:///workspace/...`` (legacy)
|
||||
- ``/workspace/...`` (bare)
|
||||
|
||||
Anything resolving outside ``/workspace`` is refused. ``Path.resolve``
|
||||
collapses ``..`` segments so a crafted ``workspace:/workspace/../etc/passwd``
|
||||
returns None instead of leaking the real filesystem.
|
||||
"""
|
||||
if not uri:
|
||||
return None
|
||||
path: str | None = None
|
||||
if uri.startswith("workspace:"):
|
||||
path = uri[len("workspace:"):]
|
||||
elif uri.startswith("file://"):
|
||||
path = uri[len("file://"):]
|
||||
elif uri.startswith("/"):
|
||||
path = uri
|
||||
if not path:
|
||||
return None
|
||||
try:
|
||||
resolved = str(Path(path).resolve())
|
||||
except (OSError, RuntimeError):
|
||||
return None
|
||||
if not (resolved == WORKSPACE_MOUNT or resolved.startswith(WORKSPACE_MOUNT + "/")):
|
||||
return None
|
||||
return resolved
|
||||
|
||||
|
||||
def extract_attached_files(message: Any) -> list[dict[str, str]]:
|
||||
"""Pull ``{name, mime_type, path}`` dicts out of an A2A message.
|
||||
|
||||
Handles the discriminated-union shape ``part.root.file`` that a2a-sdk
|
||||
produces via Pydantic RootModel, and the flatter ``part.file`` shape
|
||||
hand-built callers sometimes emit. Non-file parts and files with
|
||||
unresolvable URIs are skipped — the caller sees an empty list rather
|
||||
than a mix of valid and broken entries.
|
||||
"""
|
||||
if message is None:
|
||||
return []
|
||||
parts = getattr(message, "parts", None) or []
|
||||
out: list[dict[str, str]] = []
|
||||
for part in parts:
|
||||
root = getattr(part, "root", part)
|
||||
if getattr(root, "kind", None) != "file":
|
||||
continue
|
||||
f = getattr(root, "file", None)
|
||||
if f is None:
|
||||
continue
|
||||
uri = getattr(f, "uri", "") or ""
|
||||
name = getattr(f, "name", "") or ""
|
||||
mime = getattr(f, "mimeType", None) or getattr(f, "mime_type", None) or ""
|
||||
path = resolve_attachment_uri(uri)
|
||||
if not path or not os.path.isfile(path):
|
||||
logger.warning("skipping attached file with unresolvable uri=%r", uri)
|
||||
continue
|
||||
out.append({"name": name, "mime_type": mime, "path": path})
|
||||
return out
|
||||
|
||||
|
||||
def _read_as_data_url(path: str, mime_type: str) -> str | None:
|
||||
"""Return ``data:<mime>;base64,<...>`` or None if too large / unreadable."""
|
||||
try:
|
||||
size = os.path.getsize(path)
|
||||
except OSError:
|
||||
return None
|
||||
if size > MAX_INLINE_ATTACHMENT_BYTES:
|
||||
logger.info(
|
||||
"attachment %s too large to inline (%d bytes > cap)", path, size
|
||||
)
|
||||
return None
|
||||
try:
|
||||
with open(path, "rb") as fh:
|
||||
b64 = base64.b64encode(fh.read()).decode("ascii")
|
||||
except OSError as exc:
|
||||
logger.warning("failed to read attachment %s: %s", path, exc)
|
||||
return None
|
||||
return f"data:{mime_type or 'application/octet-stream'};base64,{b64}"
|
||||
|
||||
|
||||
def build_user_content_with_files(
|
||||
user_text: str, attached: list[dict[str, str]]
|
||||
) -> Any:
|
||||
"""Combine text + attachments into an OpenAI-compat ``content`` field.
|
||||
|
||||
- No attachments → plain string (preserves simple shape for non-vision
|
||||
models).
|
||||
- Any image attachment → list-of-parts with text + image_url entries
|
||||
(multi-modal; vision-capable models see the image bytes). Skipped
|
||||
when ``MOLECULE_DISABLE_IMAGE_INLINING`` is truthy — some provider/
|
||||
model combos (e.g. MiniMax's hermes-agent adapter as of 2026-04)
|
||||
claim vision support but hang indefinitely on image payloads, and
|
||||
the caller may prefer manifest-only so the agent can still use its
|
||||
file_read tool instead of stalling the whole request.
|
||||
- Non-image attachments → manifest appended to the text so the agent
|
||||
knows the filenames + absolute paths and can inspect via its
|
||||
file_read / bash tools.
|
||||
|
||||
This is the platform's one-line fix for "agent didn't know I attached
|
||||
a file": any executor that calls it gets attachment awareness for
|
||||
free, regardless of which LLM provider is behind it.
|
||||
"""
|
||||
if not attached:
|
||||
return user_text
|
||||
|
||||
manifest_lines = [
|
||||
f"- {f['name']} ({f['mime_type'] or 'unknown type'}) at {f['path']}"
|
||||
for f in attached
|
||||
]
|
||||
manifest = "Attached files:\n" + "\n".join(manifest_lines)
|
||||
combined = f"{user_text}\n\n{manifest}" if user_text else manifest
|
||||
|
||||
disable_inline = os.environ.get("MOLECULE_DISABLE_IMAGE_INLINING", "").lower() in (
|
||||
"1", "true", "yes", "on",
|
||||
)
|
||||
if disable_inline or not any(
|
||||
(f["mime_type"] or "").startswith("image/") for f in attached
|
||||
):
|
||||
return combined
|
||||
|
||||
content: list[dict[str, Any]] = [{"type": "text", "text": combined}]
|
||||
for f in attached:
|
||||
mt = f["mime_type"] or ""
|
||||
if not mt.startswith("image/"):
|
||||
continue
|
||||
data_url = _read_as_data_url(f["path"], mt)
|
||||
if data_url is not None:
|
||||
content.append({"type": "image_url", "image_url": {"url": data_url}})
|
||||
return content
|
||||
|
||||
|
||||
def _sanitize_attachment_name(name: str) -> str:
|
||||
cleaned = _UNSAFE_NAME_RE.sub("_", name) or "file"
|
||||
return cleaned[:100]
|
||||
|
||||
|
||||
def _guess_mime(path: str) -> str:
|
||||
mt, _ = mimetypes.guess_type(path)
|
||||
return mt or "application/octet-stream"
|
||||
|
||||
|
||||
def stage_outbound_file(src_path: str) -> dict[str, str] | None:
|
||||
"""Copy ``src_path`` into ``CHAT_UPLOADS_DIR`` (unless already there)
|
||||
and return ``{name, mime_type, path}`` so the caller can attach it to
|
||||
the A2A reply.
|
||||
|
||||
Files already in the chat-uploads directory are attached as-is;
|
||||
anything elsewhere under /workspace gets a uuid-prefixed copy so
|
||||
basenames can't collide with existing uploads and the original
|
||||
workspace layout stays untouched. Returns None on I/O failure.
|
||||
"""
|
||||
try:
|
||||
os.makedirs(CHAT_UPLOADS_DIR, exist_ok=True)
|
||||
except OSError as exc:
|
||||
logger.warning("cannot ensure chat-uploads dir: %s", exc)
|
||||
return None
|
||||
name = os.path.basename(src_path)
|
||||
mime = _guess_mime(src_path)
|
||||
if os.path.dirname(src_path) == CHAT_UPLOADS_DIR:
|
||||
return {"name": name, "mime_type": mime, "path": src_path}
|
||||
try:
|
||||
stored = f"{_uuid.uuid4().hex[:16]}-{_sanitize_attachment_name(name)}"
|
||||
dst = os.path.join(CHAT_UPLOADS_DIR, stored)
|
||||
with open(src_path, "rb") as fin, open(dst, "wb") as fout:
|
||||
fout.write(fin.read())
|
||||
except OSError as exc:
|
||||
logger.warning("failed to stage %s → chat-uploads: %s", src_path, exc)
|
||||
return None
|
||||
return {"name": name, "mime_type": mime, "path": dst}
|
||||
|
||||
|
||||
def collect_outbound_files(reply_text: str) -> list[dict[str, str]]:
|
||||
"""Detect /workspace/... paths the agent mentioned in its reply and
|
||||
stage each one so it can be returned to the canvas as a file part.
|
||||
|
||||
Each unique, readable file goes through ``stage_outbound_file`` — the
|
||||
download endpoint only serves files from whitelisted directories, so
|
||||
a reply referencing /workspace/private/secret.pem still can't be
|
||||
exfiltrated via the chat download link unless we've explicitly
|
||||
copied it under the chat-uploads dir.
|
||||
"""
|
||||
if not reply_text:
|
||||
return []
|
||||
seen: set[str] = set()
|
||||
out: list[dict[str, str]] = []
|
||||
for match in _WORKSPACE_PATH_RE.finditer(reply_text):
|
||||
# Trim trailing sentence punctuation that the character class
|
||||
# greedily swallowed — "wrote /workspace/x.txt." would otherwise
|
||||
# resolve to "x.txt." which doesn't exist.
|
||||
raw = match.group(1).rstrip(".")
|
||||
resolved = resolve_attachment_uri(raw)
|
||||
if not resolved or resolved in seen or not os.path.isfile(resolved):
|
||||
continue
|
||||
seen.add(resolved)
|
||||
staged = stage_outbound_file(resolved)
|
||||
if staged is not None:
|
||||
out.append(staged)
|
||||
return out
|
||||
|
||||
@ -17,7 +17,31 @@ from pathlib import Path
|
||||
|
||||
import httpx
|
||||
|
||||
from platform_auth import auth_headers, refresh_cache
|
||||
from platform_auth import auth_headers, refresh_cache, self_source_headers
|
||||
|
||||
|
||||
def _runtime_state_payload() -> dict:
|
||||
"""Build the {runtime_state, sample_error} portion of the heartbeat
|
||||
body when the Claude SDK has hit a wedge. Returns an empty dict
|
||||
when the runtime is healthy so the heartbeat payload doesn't grow
|
||||
fields the platform doesn't need.
|
||||
|
||||
Imported lazily so workspaces running non-Claude runtimes (where
|
||||
`claude_sdk_executor` may not be importable at all) keep working —
|
||||
a missing import means "no Claude wedge possible here, healthy."
|
||||
"""
|
||||
try:
|
||||
from claude_sdk_executor import is_wedged, wedge_reason
|
||||
except Exception:
|
||||
return {}
|
||||
if not is_wedged():
|
||||
return {}
|
||||
return {
|
||||
"runtime_state": "wedged",
|
||||
# sample_error doubles as the human-readable banner text on the
|
||||
# canvas's degraded card — keep it short and actionable.
|
||||
"sample_error": wedge_reason(),
|
||||
}
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@ -85,16 +109,23 @@ class HeartbeatLoop:
|
||||
while True:
|
||||
# 1. Send heartbeat (Phase 30.1: include auth header if token known)
|
||||
try:
|
||||
body = {
|
||||
"workspace_id": self.workspace_id,
|
||||
"error_rate": self.error_rate,
|
||||
"sample_error": self.sample_error,
|
||||
"active_tasks": self.active_tasks,
|
||||
"current_task": self.current_task,
|
||||
"uptime_seconds": int(time.time() - self.start_time),
|
||||
}
|
||||
# Layer the runtime-wedge fields on top so a
|
||||
# non-empty sample_error from the wedge wins
|
||||
# over the (typically empty) heartbeat
|
||||
# sample_error field. The platform reads
|
||||
# runtime_state to flip status → degraded.
|
||||
body.update(_runtime_state_payload())
|
||||
await client.post(
|
||||
f"{self.platform_url}/registry/heartbeat",
|
||||
json={
|
||||
"workspace_id": self.workspace_id,
|
||||
"error_rate": self.error_rate,
|
||||
"sample_error": self.sample_error,
|
||||
"active_tasks": self.active_tasks,
|
||||
"current_task": self.current_task,
|
||||
"uptime_seconds": int(time.time() - self.start_time),
|
||||
},
|
||||
json=body,
|
||||
headers=auth_headers(),
|
||||
)
|
||||
self.error_count = 0
|
||||
@ -113,16 +144,18 @@ class HeartbeatLoop:
|
||||
logger.warning("Heartbeat 401 for %s — refreshing token cache and retrying once", self.workspace_id)
|
||||
refresh_cache()
|
||||
try:
|
||||
retry_body = {
|
||||
"workspace_id": self.workspace_id,
|
||||
"error_rate": self.error_rate,
|
||||
"sample_error": self.sample_error,
|
||||
"active_tasks": self.active_tasks,
|
||||
"current_task": self.current_task,
|
||||
"uptime_seconds": int(time.time() - self.start_time),
|
||||
}
|
||||
retry_body.update(_runtime_state_payload())
|
||||
await client.post(
|
||||
f"{self.platform_url}/registry/heartbeat",
|
||||
json={
|
||||
"workspace_id": self.workspace_id,
|
||||
"error_rate": self.error_rate,
|
||||
"sample_error": self.sample_error,
|
||||
"active_tasks": self.active_tasks,
|
||||
"current_task": self.current_task,
|
||||
"uptime_seconds": int(time.time() - self.start_time),
|
||||
},
|
||||
json=retry_body,
|
||||
headers=auth_headers(),
|
||||
)
|
||||
self._consecutive_failures = 0
|
||||
@ -284,6 +317,9 @@ class HeartbeatLoop:
|
||||
else:
|
||||
self._last_self_message_time = now
|
||||
try:
|
||||
# self_source_headers() adds X-Workspace-ID so the
|
||||
# platform tags this row source=agent, not canvas
|
||||
# — see platform_auth.py for the full rationale.
|
||||
await client.post(
|
||||
f"{self.platform_url}/workspaces/{self.workspace_id}/a2a",
|
||||
json={
|
||||
@ -295,7 +331,7 @@ class HeartbeatLoop:
|
||||
},
|
||||
},
|
||||
},
|
||||
headers=auth_headers(),
|
||||
headers=self_source_headers(self.workspace_id),
|
||||
timeout=120.0,
|
||||
)
|
||||
logger.info("Heartbeat: self-message sent to process delegation results")
|
||||
|
||||
@ -33,7 +33,7 @@ from initial_prompt import (
|
||||
mark_initial_prompt_attempted,
|
||||
resolve_initial_prompt_marker,
|
||||
)
|
||||
from platform_auth import auth_headers
|
||||
from platform_auth import auth_headers, self_source_headers
|
||||
|
||||
|
||||
def get_machine_ip() -> str: # pragma: no cover
|
||||
@ -69,6 +69,15 @@ async def main(): # pragma: no cover
|
||||
# 0. Initialise OpenTelemetry (no-op if packages not installed)
|
||||
setup_telemetry(service_name=workspace_id)
|
||||
|
||||
# 0a. Fix /workspace perms before any agent code runs. Docker ships
|
||||
# named volumes as root:root 755 — without this the non-root agent
|
||||
# user can't write files the user asked it to produce, and the
|
||||
# "agent → file → user downloads" flow dead-ends at a bash "permission
|
||||
# denied". Best-effort: no-ops silently if molecule-runtime itself
|
||||
# isn't root (template's own start.sh should have handled it there).
|
||||
from executor_helpers import ensure_workspace_writable
|
||||
ensure_workspace_writable()
|
||||
|
||||
# 1. Load config
|
||||
config = load_config(config_path)
|
||||
port = config.a2a.port
|
||||
@ -430,7 +439,15 @@ async def main(): # pragma: no cover
|
||||
# silently rejected once any workspace has a live token on
|
||||
# file. Without this, initial_prompt 401s in multi-tenant
|
||||
# mode exactly like /registry/register did in #215.
|
||||
headers = {"Content-Type": "application/json", **auth_headers()}
|
||||
# X-Workspace-ID via self_source_headers() so the platform
|
||||
# tags the row source=agent — without it the canvas's
|
||||
# My Chat tab renders the initial_prompt as if the user
|
||||
# had typed it. See platform_auth.py for the full
|
||||
# explanation.
|
||||
headers = {
|
||||
"Content-Type": "application/json",
|
||||
**self_source_headers(workspace_id),
|
||||
}
|
||||
|
||||
# Retry with backoff — the platform proxy may not be able to
|
||||
# reach us yet (container networking takes a moment to settle).
|
||||
@ -522,7 +539,13 @@ async def main(): # pragma: no cover
|
||||
# actual outcome instead of a bare "post failed" line.
|
||||
# #220: include auth_headers() on every idle fire. Without
|
||||
# this, the idle loop 401s in multi-tenant mode.
|
||||
headers = {"Content-Type": "application/json", **auth_headers()}
|
||||
# self_source_headers() adds X-Workspace-ID so the
|
||||
# platform classifies the idle fire as source=agent
|
||||
# rather than user-typed canvas input.
|
||||
headers = {
|
||||
"Content-Type": "application/json",
|
||||
**self_source_headers(workspace_id),
|
||||
}
|
||||
try:
|
||||
req = _urlreq.Request(
|
||||
f"{platform_url}/workspaces/{workspace_id}/a2a",
|
||||
|
||||
@ -98,6 +98,26 @@ def auth_headers() -> dict[str, str]:
|
||||
return {"Authorization": f"Bearer {tok}"}
|
||||
|
||||
|
||||
def self_source_headers(workspace_id: str) -> dict[str, str]:
|
||||
"""Return auth headers PLUS X-Workspace-ID identifying this workspace
|
||||
as the source of the request.
|
||||
|
||||
Use this for any POST the workspace's own runtime fires against the
|
||||
platform's A2A endpoints — heartbeat self-messages, initial_prompt,
|
||||
idle-loop fires, peer-to-peer A2A from runtime tools. Without the
|
||||
X-Workspace-ID header the platform's a2a_receive logger writes
|
||||
source_id=NULL, which the canvas's My Chat tab interprets as a
|
||||
user-typed message and renders the internal prompt to the user.
|
||||
See workspace-server/internal/handlers/a2a_proxy.go:184 for the
|
||||
server-side classification rule.
|
||||
|
||||
Centralised here so adding a new system header (e.g. a per-fire
|
||||
correlation ID) only touches one place — and so that any
|
||||
workspace→A2A POST that doesn't use this helper stands out in
|
||||
review as a probable bug."""
|
||||
return {**auth_headers(), "X-Workspace-ID": workspace_id}
|
||||
|
||||
|
||||
def clear_cache() -> None:
|
||||
"""Reset the in-memory cache. Used by tests that write fresh token
|
||||
files between cases."""
|
||||
|
||||
@ -199,10 +199,34 @@ class TestSendA2AMessage:
|
||||
result = await a2a_client.send_a2a_message("http://target/a2a", "task")
|
||||
|
||||
assert result.startswith(a2a_client._A2A_ERROR_PREFIX)
|
||||
assert "unknown" in result
|
||||
# The error includes the JSON-RPC code so the operator can look it
|
||||
# up; "no message" surfaces the missing-message condition explicitly
|
||||
# instead of the previous opaque "unknown".
|
||||
assert "code=-32600" in result
|
||||
assert "no message" in result.lower()
|
||||
# Target URL is included so chained delegations are traceable.
|
||||
assert "target=http://target/a2a" in result
|
||||
|
||||
async def test_neither_result_nor_error_returns_str_of_data(self):
|
||||
"""Response with neither 'result' nor 'error' → str(data)."""
|
||||
async def test_jsonrpc_error_with_code_zero_includes_code_in_detail(self):
|
||||
"""JSON-RPC error code=0 is technically not valid in the spec,
|
||||
but a malformed peer can still send it — make sure the code is
|
||||
preserved in the detail rather than collapsing into the
|
||||
no-code path. Locks in the `code is not None` semantics over
|
||||
the truthy-check shortcut."""
|
||||
import a2a_client
|
||||
|
||||
resp = _make_response(200, {"error": {"code": 0, "message": "weird"}})
|
||||
mock_client = _make_mock_client(post_resp=resp)
|
||||
|
||||
with patch("a2a_client.httpx.AsyncClient", return_value=mock_client):
|
||||
result = await a2a_client.send_a2a_message("http://target/a2a", "task")
|
||||
|
||||
assert result.startswith(a2a_client._A2A_ERROR_PREFIX)
|
||||
assert "code=0" in result
|
||||
assert "weird" in result
|
||||
|
||||
async def test_neither_result_nor_error_returns_a2a_error_with_payload(self):
|
||||
"""Response with neither 'result' nor 'error' → A2A_ERROR + payload context."""
|
||||
import a2a_client
|
||||
|
||||
payload = {"jsonrpc": "2.0", "id": "abc123"}
|
||||
@ -212,7 +236,14 @@ class TestSendA2AMessage:
|
||||
with patch("a2a_client.httpx.AsyncClient", return_value=mock_client):
|
||||
result = await a2a_client.send_a2a_message("http://target/a2a", "task")
|
||||
|
||||
assert result == str(payload)
|
||||
# Pre-fix this returned bare str(payload) which the canvas
|
||||
# rendered as a confusing "looks like a successful response"
|
||||
# block. Now it's tagged so downstream UI / delegate_task
|
||||
# routes it through the error path.
|
||||
assert result.startswith(a2a_client._A2A_ERROR_PREFIX)
|
||||
assert "unexpected response shape" in result
|
||||
assert "abc123" in result # snippet of payload included for context
|
||||
assert "target=http://target/a2a" in result
|
||||
|
||||
async def test_exception_returns_error_prefix_and_message(self):
|
||||
"""Network exception → returns _A2A_ERROR_PREFIX + exception text."""
|
||||
@ -225,6 +256,39 @@ class TestSendA2AMessage:
|
||||
|
||||
assert result.startswith(a2a_client._A2A_ERROR_PREFIX)
|
||||
assert "connection refused" in result
|
||||
# Exception class name is prepended when the message doesn't
|
||||
# already include it — gives the operator a typed handle to
|
||||
# search for in container logs.
|
||||
assert "ConnectionError" in result
|
||||
assert "target=http://target/a2a" in result
|
||||
|
||||
async def test_empty_stringifying_exception_falls_back_to_class_name(self):
|
||||
"""The user's reported bug: httpx.RemoteProtocolError and similar
|
||||
exceptions can stringify to "" — pre-fix the canvas rendered
|
||||
"[A2A_ERROR] " with no detail. Verify the empty path now
|
||||
produces an actionable message including the exception type
|
||||
and the target URL."""
|
||||
import a2a_client
|
||||
|
||||
# Subclass Exception with __str__ → "" to simulate the
|
||||
# silent-exception variants without depending on a specific
|
||||
# httpx version's behavior.
|
||||
class _SilentRemoteProtocolError(Exception):
|
||||
def __str__(self) -> str:
|
||||
return ""
|
||||
|
||||
mock_client = _make_mock_client(post_exc=_SilentRemoteProtocolError())
|
||||
|
||||
with patch("a2a_client.httpx.AsyncClient", return_value=mock_client):
|
||||
result = await a2a_client.send_a2a_message("http://target/a2a", "task")
|
||||
|
||||
# Must NOT be just the bare prefix — that's the regression.
|
||||
assert result != a2a_client._A2A_ERROR_PREFIX.strip()
|
||||
assert result != f"{a2a_client._A2A_ERROR_PREFIX}"
|
||||
# Must include the class name + something explanatory.
|
||||
assert "_SilentRemoteProtocolError" in result
|
||||
assert "no message" in result.lower()
|
||||
assert "target=http://target/a2a" in result
|
||||
|
||||
async def test_result_text_part_missing_text_key_returns_empty(self):
|
||||
"""Part dict without 'text' key → falls back to '' (empty string returned)."""
|
||||
|
||||
@ -114,11 +114,11 @@ class TestDelegateTask:
|
||||
async def __aexit__(self, *a): pass
|
||||
|
||||
async def get(self, url, headers=None):
|
||||
calls.append(("get", url))
|
||||
calls.append(("get", url, headers))
|
||||
return _FakeResponse(200, {"url": "http://target.test/a2a"})
|
||||
|
||||
async def post(self, url, json=None):
|
||||
calls.append(("post", url))
|
||||
async def post(self, url, json=None, headers=None):
|
||||
calls.append(("post", url, headers))
|
||||
return _FakeResponse(200, {
|
||||
"result": {
|
||||
"parts": [{"kind": "text", "text": "Task done!"}]
|
||||
@ -130,7 +130,17 @@ class TestDelegateTask:
|
||||
result = await mod.delegate_task("ws-target", "do something")
|
||||
assert result == "Task done!"
|
||||
assert any(c[0] == "get" for c in calls)
|
||||
assert any(c[0] == "post" for c in calls)
|
||||
post_calls = [c for c in calls if c[0] == "post"]
|
||||
assert post_calls, "delegate_task must POST to the target's /a2a endpoint"
|
||||
# Regression: peer A2A POSTs MUST include X-Workspace-ID so
|
||||
# the platform's a2a_receive logger writes source_id correctly
|
||||
# — without it the recipient's My Chat tab would render the
|
||||
# delegation as user-typed input. Same hazard fixed in
|
||||
# heartbeat.py / a2a_client.py / main.py initial+idle flows.
|
||||
post_headers = post_calls[0][2] or {}
|
||||
assert post_headers.get("X-Workspace-ID"), (
|
||||
f"delegate_task POST must include X-Workspace-ID; got headers={post_headers!r}"
|
||||
)
|
||||
|
||||
async def test_delegate_task_success_empty_parts(self, monkeypatch):
|
||||
"""Result with empty parts list falls back to str(result)."""
|
||||
@ -144,7 +154,7 @@ class TestDelegateTask:
|
||||
async def get(self, url, headers=None):
|
||||
return _FakeResponse(200, {"url": "http://target.test/a2a"})
|
||||
|
||||
async def post(self, url, json=None):
|
||||
async def post(self, url, json=None, headers=None):
|
||||
return _FakeResponse(200, {"result": {"parts": []}})
|
||||
|
||||
monkeypatch.setattr(mod.httpx, "AsyncClient", FakeClient)
|
||||
@ -217,7 +227,7 @@ class TestDelegateTask:
|
||||
async def get(self, url, headers=None):
|
||||
return _FakeResponse(200, {"url": "http://target.test/a2a"})
|
||||
|
||||
async def post(self, url, json=None):
|
||||
async def post(self, url, json=None, headers=None):
|
||||
return _FakeResponse(200, {
|
||||
"error": {"code": -32603, "message": "Internal error"}
|
||||
})
|
||||
@ -240,7 +250,7 @@ class TestDelegateTask:
|
||||
async def get(self, url, headers=None):
|
||||
return _FakeResponse(200, {"url": "http://target.test/a2a"})
|
||||
|
||||
async def post(self, url, json=None):
|
||||
async def post(self, url, json=None, headers=None):
|
||||
return _FakeResponse(200, {"jsonrpc": "2.0", "id": "123"})
|
||||
|
||||
monkeypatch.setattr(mod.httpx, "AsyncClient", FakeClient)
|
||||
@ -262,7 +272,7 @@ class TestDelegateTask:
|
||||
async def get(self, url, headers=None):
|
||||
return _FakeResponse(200, {"url": "http://target.test/a2a"})
|
||||
|
||||
async def post(self, url, json=None):
|
||||
async def post(self, url, json=None, headers=None):
|
||||
call_count["n"] += 1
|
||||
raise ConnectionError("target down")
|
||||
|
||||
|
||||
@ -21,7 +21,25 @@ _FakeTextBlock = _sdk_stub.TextBlock
|
||||
_FakeAssistantMessage = _sdk_stub.AssistantMessage
|
||||
_FakeResultMessage = _sdk_stub.ResultMessage
|
||||
|
||||
from claude_sdk_executor import ClaudeSDKExecutor, QueryResult # noqa: E402
|
||||
from claude_sdk_executor import ( # noqa: E402
|
||||
ClaudeSDKExecutor,
|
||||
QueryResult,
|
||||
_mark_sdk_wedged,
|
||||
_reset_sdk_wedge_for_test,
|
||||
is_wedged,
|
||||
wedge_reason,
|
||||
)
|
||||
|
||||
# Module alias used by the wedge tests below — they read
|
||||
# `_executor_mod.<helper>` to make the module-state vs function-state
|
||||
# distinction explicit at the call site, separate from the names
|
||||
# imported above. Hoisted to the top-of-file imports because the late
|
||||
# binding (originally at line ~1248) was invisible to @pytest.mark.asyncio
|
||||
# wrappers under coverage instrumentation (--cov, added by #1817):
|
||||
# sys.settrace + the asyncio wrapper combination caused a
|
||||
# `NameError: name '_executor_mod' is not defined` on every async wedge
|
||||
# test. Hoisting the alias fixes that scope-resolution issue.
|
||||
import claude_sdk_executor as _executor_mod # noqa: E402
|
||||
|
||||
|
||||
# ---------- Helpers ----------
|
||||
@ -1221,3 +1239,170 @@ def test_load_config_dict_empty_file_returns_empty(tmp_path):
|
||||
e = ClaudeSDKExecutor(system_prompt=None, config_path=str(tmp_path), heartbeat=None)
|
||||
result = e._load_config_dict()
|
||||
assert result == {}
|
||||
|
||||
|
||||
# ==================== SDK wedge detector ====================
|
||||
#
|
||||
# Exercises the module-level _sdk_wedged_reason flag set when the
|
||||
# claude_agent_sdk init handshake times out. The flag is sticky — the
|
||||
# heartbeat task reads it via is_wedged() / wedge_reason() and reports
|
||||
# runtime_state="wedged" so the platform flips status → degraded.
|
||||
|
||||
|
||||
|
||||
def test_wedge_helpers_default_clean():
|
||||
"""Fresh module: no wedge."""
|
||||
_reset_sdk_wedge_for_test()
|
||||
assert is_wedged() is False
|
||||
assert wedge_reason() == ""
|
||||
|
||||
|
||||
def test_mark_sdk_wedged_sets_flag_and_reason():
|
||||
"""First mark wins and sets both is_wedged() and the reason text."""
|
||||
_reset_sdk_wedge_for_test()
|
||||
_mark_sdk_wedged("init timeout — restart")
|
||||
try:
|
||||
assert is_wedged() is True
|
||||
assert "init timeout" in wedge_reason()
|
||||
finally:
|
||||
_reset_sdk_wedge_for_test()
|
||||
|
||||
|
||||
def test_mark_sdk_wedged_sticky_first_wins():
|
||||
"""A second wedge call with a different reason does NOT overwrite
|
||||
the first. The first cause is the one the user needs to see; later
|
||||
knock-on errors from the same wedge would otherwise mask it."""
|
||||
_reset_sdk_wedge_for_test()
|
||||
_mark_sdk_wedged("first cause — Control request timeout")
|
||||
_mark_sdk_wedged("noise from a downstream symptom")
|
||||
try:
|
||||
assert wedge_reason() == "first cause — Control request timeout"
|
||||
finally:
|
||||
_reset_sdk_wedge_for_test()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_execute_marks_wedge_on_control_request_timeout():
|
||||
"""End-to-end: when _run_query raises an exception whose formatted
|
||||
error contains 'Control request timeout' (case-insensitive), the
|
||||
executor's catch block flags the SDK as wedged. Subsequent
|
||||
is_wedged() reads return True until process restart (or the
|
||||
test-only reset)."""
|
||||
_executor_mod._reset_sdk_wedge_for_test()
|
||||
e = _make_executor()
|
||||
ctx = _make_context(["test prompt"])
|
||||
eq = _make_event_queue()
|
||||
|
||||
async def boom(prompt, options):
|
||||
# Match the literal exception claude_agent_sdk raises in the
|
||||
# observed wedge path.
|
||||
raise Exception("Control request timeout: initialize")
|
||||
yield # pragma: no cover — make this an async generator
|
||||
|
||||
with patch("claude_sdk_executor.recall_memories", new=AsyncMock(return_value="")), \
|
||||
patch("claude_sdk_executor.read_delegation_results", return_value=""), \
|
||||
patch("claude_sdk_executor.commit_memory", new=AsyncMock()), \
|
||||
patch("claude_sdk_executor.set_current_task", new=AsyncMock()), \
|
||||
patch("claude_agent_sdk.query", new=boom):
|
||||
try:
|
||||
await e.execute(ctx, eq)
|
||||
assert _executor_mod.is_wedged() is True, "wedge flag must be set"
|
||||
assert "Control request timeout" in _executor_mod.wedge_reason()
|
||||
finally:
|
||||
_executor_mod._reset_sdk_wedge_for_test()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_execute_does_not_mark_wedge_on_unrelated_error():
|
||||
"""Sanity: a generic non-wedge exception (e.g. ValueError) MUST
|
||||
NOT trigger the wedge flag. False-positives lock the workspace
|
||||
into degraded for the whole process lifetime."""
|
||||
_executor_mod._reset_sdk_wedge_for_test()
|
||||
e = _make_executor()
|
||||
ctx = _make_context(["test prompt"])
|
||||
eq = _make_event_queue()
|
||||
|
||||
async def boom(prompt, options):
|
||||
raise ValueError("ordinary tool failure, not a wedge")
|
||||
yield # pragma: no cover
|
||||
|
||||
with patch("claude_sdk_executor.recall_memories", new=AsyncMock(return_value="")), \
|
||||
patch("claude_sdk_executor.read_delegation_results", return_value=""), \
|
||||
patch("claude_sdk_executor.commit_memory", new=AsyncMock()), \
|
||||
patch("claude_sdk_executor.set_current_task", new=AsyncMock()), \
|
||||
patch("claude_agent_sdk.query", new=boom):
|
||||
try:
|
||||
await e.execute(ctx, eq)
|
||||
assert _executor_mod.is_wedged() is False, "non-wedge error must not flip the flag"
|
||||
finally:
|
||||
_executor_mod._reset_sdk_wedge_for_test()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_execute_clears_wedge_on_successful_query():
|
||||
"""Auto-recovery: a process that previously hit a wedge should be
|
||||
able to recover when the SDK starts working again. _run_query
|
||||
calls _clear_sdk_wedge_on_success at the end of a clean
|
||||
completion; the flag flips back to None and the next heartbeat
|
||||
reports runtime_state empty so the platform recovers status →
|
||||
online without forcing the user to restart the workspace."""
|
||||
# Pre-set the wedge as if a prior call had tripped it.
|
||||
_executor_mod._reset_sdk_wedge_for_test()
|
||||
_executor_mod._mark_sdk_wedged("transient: Control request timeout: initialize")
|
||||
assert _executor_mod.is_wedged() is True
|
||||
|
||||
e = _make_executor()
|
||||
ctx = _make_context(["test prompt"])
|
||||
eq = _make_event_queue()
|
||||
|
||||
async def good_query(prompt, options):
|
||||
# Working SDK — yield one normal assistant message + result.
|
||||
yield _FakeAssistantMessage([_FakeTextBlock("hello back")])
|
||||
yield _FakeResultMessage(session_id="recovered-sess")
|
||||
|
||||
with patch("claude_sdk_executor.recall_memories", new=AsyncMock(return_value="")), \
|
||||
patch("claude_sdk_executor.read_delegation_results", return_value=""), \
|
||||
patch("claude_sdk_executor.commit_memory", new=AsyncMock()), \
|
||||
patch("claude_sdk_executor.set_current_task", new=AsyncMock()), \
|
||||
patch("claude_agent_sdk.query", new=good_query):
|
||||
try:
|
||||
await e.execute(ctx, eq)
|
||||
assert _executor_mod.is_wedged() is False, "wedge flag must clear after a successful query"
|
||||
assert _executor_mod.wedge_reason() == ""
|
||||
finally:
|
||||
_executor_mod._reset_sdk_wedge_for_test()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_execute_does_not_clear_wedge_on_empty_stream():
|
||||
"""Regression for the gate added in 3c4eef49: a stream that
|
||||
iterates without raising but emits NEITHER an AssistantMessage
|
||||
NOR a ResultMessage (degenerate or stub-driven shape) must NOT
|
||||
clear the wedge flag. A real successful query yields at least
|
||||
one of those; treating an empty stream as "recovered" would
|
||||
falsely flip the workspace back to online without any evidence
|
||||
the SDK is actually working."""
|
||||
_executor_mod._reset_sdk_wedge_for_test()
|
||||
_executor_mod._mark_sdk_wedged("pre-existing wedge — must not clear on empty stream")
|
||||
assert _executor_mod.is_wedged() is True
|
||||
|
||||
e = _make_executor()
|
||||
ctx = _make_context(["test prompt"])
|
||||
eq = _make_event_queue()
|
||||
|
||||
async def empty_query(prompt, options):
|
||||
# Iterator returns without yielding — the degenerate case.
|
||||
if False:
|
||||
yield # pragma: no cover
|
||||
|
||||
with patch("claude_sdk_executor.recall_memories", new=AsyncMock(return_value="")), \
|
||||
patch("claude_sdk_executor.read_delegation_results", return_value=""), \
|
||||
patch("claude_sdk_executor.commit_memory", new=AsyncMock()), \
|
||||
patch("claude_sdk_executor.set_current_task", new=AsyncMock()), \
|
||||
patch("claude_agent_sdk.query", new=empty_query):
|
||||
try:
|
||||
await e.execute(ctx, eq)
|
||||
assert _executor_mod.is_wedged() is True, \
|
||||
"wedge must persist when the stream emitted no content"
|
||||
finally:
|
||||
_executor_mod._reset_sdk_wedge_for_test()
|
||||
|
||||
@ -654,3 +654,255 @@ def test_classify_subprocess_error_generic_fallback():
|
||||
assert classify_subprocess_error("generic unknown failure", None) == "subprocess_error"
|
||||
# exit_code=0 with no keyword match also lands here
|
||||
assert classify_subprocess_error("mysterious but zero exit", 0) == "subprocess_error"
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Chat attachment helpers (drag-drop file + agent-returned file)
|
||||
# ============================================================================
|
||||
|
||||
|
||||
def test_resolve_attachment_uri_all_schemes(tmp_path, monkeypatch):
|
||||
"""All three canvas-issued URI shapes resolve to the same container path.
|
||||
|
||||
The canvas mints ``workspace:`` but the download endpoint used to accept
|
||||
``file:///`` and bare ``/workspace/…`` for legacy agents — the helper has
|
||||
to handle all three so agents don't have to normalize before calling us.
|
||||
"""
|
||||
from executor_helpers import resolve_attachment_uri, WORKSPACE_MOUNT
|
||||
|
||||
# Use a real path that starts with WORKSPACE_MOUNT. resolve() enforces
|
||||
# the containment check — anything outside /workspace/ must return None.
|
||||
ws_path = f"{WORKSPACE_MOUNT}/foo.txt"
|
||||
assert resolve_attachment_uri(f"workspace:{ws_path}") == ws_path
|
||||
assert resolve_attachment_uri(f"file://{ws_path}") == ws_path
|
||||
assert resolve_attachment_uri(ws_path) == ws_path
|
||||
|
||||
# Out-of-tree is refused even when the raw path shape looks right.
|
||||
# CWE-22 regression: a crafted "workspace:/workspace/../etc/passwd"
|
||||
# must NOT return "/etc/passwd" just because resolve() normalizes it.
|
||||
assert resolve_attachment_uri("/etc/passwd") is None
|
||||
assert resolve_attachment_uri("workspace:/workspace/../etc/passwd") is None
|
||||
assert resolve_attachment_uri("") is None
|
||||
assert resolve_attachment_uri("https://example.com/x") is None
|
||||
|
||||
|
||||
def test_extract_attached_files_skips_unresolvable():
|
||||
"""Files with URIs that don't resolve to an existing file are dropped.
|
||||
|
||||
A crafted A2A message can include any uri it wants; we must not hand
|
||||
non-existent or out-of-tree paths to downstream code as if they were
|
||||
real attachments.
|
||||
"""
|
||||
from types import SimpleNamespace
|
||||
from executor_helpers import extract_attached_files
|
||||
|
||||
msg = SimpleNamespace(parts=[
|
||||
SimpleNamespace(kind="file", file=SimpleNamespace(
|
||||
uri="workspace:/etc/passwd", name="x", mimeType="text/plain"
|
||||
)),
|
||||
SimpleNamespace(root=SimpleNamespace(kind="file", file=SimpleNamespace(
|
||||
uri="/workspace/does-not-exist", name="y", mimeType="text/plain"
|
||||
))),
|
||||
SimpleNamespace(kind="text", text="ignored"),
|
||||
])
|
||||
assert extract_attached_files(msg) == []
|
||||
|
||||
|
||||
def test_extract_attached_files_accepts_both_shapes(tmp_path, monkeypatch):
|
||||
"""a2a-sdk emits ``part.root.file`` via RootModel; some callers still
|
||||
build ``part.file`` directly. Both shapes have to yield the same
|
||||
dict structure — runtimes can pick either without surprise."""
|
||||
from types import SimpleNamespace
|
||||
from executor_helpers import extract_attached_files
|
||||
|
||||
# Stage two real files under a fake /workspace for the resolver
|
||||
real_a = tmp_path / "a.txt"
|
||||
real_b = tmp_path / "b.txt"
|
||||
real_a.write_text("A")
|
||||
real_b.write_text("B")
|
||||
# Point the helper's containment check at tmp_path instead of /workspace
|
||||
monkeypatch.setattr("executor_helpers.WORKSPACE_MOUNT", str(tmp_path))
|
||||
|
||||
msg = SimpleNamespace(parts=[
|
||||
SimpleNamespace(kind="file", file=SimpleNamespace(
|
||||
uri=f"workspace:{real_a}", name="a.txt", mimeType="text/plain"
|
||||
)),
|
||||
SimpleNamespace(root=SimpleNamespace(kind="file", file=SimpleNamespace(
|
||||
uri=f"workspace:{real_b}", name="b.txt", mimeType="text/plain"
|
||||
))),
|
||||
])
|
||||
out = extract_attached_files(msg)
|
||||
assert len(out) == 2
|
||||
assert {f["name"] for f in out} == {"a.txt", "b.txt"}
|
||||
|
||||
|
||||
def test_build_user_content_with_files_no_attachments_is_string():
|
||||
"""Zero attachments → plain string so models without multi-modal
|
||||
support (most non-vision LLMs) see the same payload shape they always
|
||||
did. Regressing this would break every runtime that assumed
|
||||
content is a string."""
|
||||
from executor_helpers import build_user_content_with_files
|
||||
|
||||
out = build_user_content_with_files("hello", [])
|
||||
assert out == "hello"
|
||||
|
||||
|
||||
def test_build_user_content_with_files_non_image_is_string_with_manifest():
|
||||
"""Non-image attachments append a manifest line so the agent knows the
|
||||
filename and absolute path. Without this the agent had no signal that
|
||||
anything was attached — see canvas/src/components/tabs/ChatTab.tsx
|
||||
and the "I'm not sure what you're referring to" user report."""
|
||||
from executor_helpers import build_user_content_with_files
|
||||
|
||||
content = build_user_content_with_files("read this", [
|
||||
{"name": "app.log", "mime_type": "text/plain", "path": "/workspace/app.log"},
|
||||
])
|
||||
assert isinstance(content, str)
|
||||
assert "app.log" in content and "/workspace/app.log" in content
|
||||
assert "read this" in content
|
||||
|
||||
|
||||
def test_build_user_content_with_files_image_is_multimodal(tmp_path):
|
||||
"""Image attachments yield the OpenAI-compat list-of-parts shape so
|
||||
vision models see the bytes. Data URL check covers the common
|
||||
regression where an empty/missing file silently drops the image part."""
|
||||
from executor_helpers import build_user_content_with_files
|
||||
|
||||
# Minimal 1x1 PNG
|
||||
png = tmp_path / "x.png"
|
||||
png.write_bytes(bytes.fromhex(
|
||||
"89504e470d0a1a0a0000000d49484452000000010000000108060000001f"
|
||||
"15c4890000000a49444154789c6300010000000500010d0a2db40000000049454e44ae426082"
|
||||
))
|
||||
content = build_user_content_with_files("describe", [
|
||||
{"name": "x.png", "mime_type": "image/png", "path": str(png)},
|
||||
])
|
||||
assert isinstance(content, list)
|
||||
assert len(content) == 2
|
||||
assert content[0]["type"] == "text"
|
||||
assert content[1]["type"] == "image_url"
|
||||
assert content[1]["image_url"]["url"].startswith("data:image/png;base64,")
|
||||
|
||||
|
||||
def test_build_user_content_with_files_large_image_skipped(tmp_path, monkeypatch):
|
||||
"""Images over the inline cap don't break the request — the manifest
|
||||
still carries the path so the agent can read via its file_read tool
|
||||
without blowing past provider context limits with a 50MB base64 blob."""
|
||||
from executor_helpers import build_user_content_with_files
|
||||
monkeypatch.setattr("executor_helpers.MAX_INLINE_ATTACHMENT_BYTES", 10)
|
||||
|
||||
big = tmp_path / "big.png"
|
||||
big.write_bytes(b"x" * 100)
|
||||
content = build_user_content_with_files("describe", [
|
||||
{"name": "big.png", "mime_type": "image/png", "path": str(big)},
|
||||
])
|
||||
# Image too large → no image_url entry, but the text manifest still mentions it
|
||||
assert isinstance(content, list)
|
||||
# Only the text part — the image_url was skipped
|
||||
assert all(c["type"] == "text" for c in content)
|
||||
|
||||
|
||||
def test_collect_outbound_files_stages_workspace_paths(tmp_path, monkeypatch):
    """Agent reply mentioning a /workspace/… path → each unique existing
    file becomes an attachment, staged under chat-uploads. A crafted
    reply referencing a path outside the workspace (here the decoy
    secret.txt, standing in for /etc/passwd) must NOT escape."""
    from pathlib import Path as _Path
    from executor_helpers import collect_outbound_files

    # Point the chat-uploads dir and the workspace root at a sandboxed tmp.
    # resolve() normalizes macOS /var → /private/var so the helper's
    # containment check (which also resolve()s) sees identical prefixes.
    ws_root = _Path(str(tmp_path / "workspace"))
    ws_root.mkdir()
    ws_root = ws_root.resolve()
    uploads = ws_root / ".molecule" / "chat-uploads"
    uploads.mkdir(parents=True)
    monkeypatch.setattr("executor_helpers.WORKSPACE_MOUNT", str(ws_root))
    monkeypatch.setattr("executor_helpers.CHAT_UPLOADS_DIR", str(uploads))
    # Rebuild the regex against the overridden mount (module caches it).
    # FIX: re.escape() the mount path — pytest tmp dirs can contain regex
    # metacharacters (e.g. "." in macOS /private/var/folders/...), and an
    # unescaped "." would match any character, silently loosening the test.
    import re as _re
    monkeypatch.setattr(
        "executor_helpers._WORKSPACE_PATH_RE",
        _re.compile(rf"(?:^|[\s`(\[])({_re.escape(str(ws_root))}/[A-Za-z0-9_./\-]+)"),
    )

    # A real file inside the fake workspace
    report = ws_root / "report.txt"
    report.write_text("data")
    # A decoy outside the workspace — must be ignored even if mentioned
    (tmp_path / "secret.txt").write_text("leaked")

    reply = f"Saved to {report} — also see {tmp_path}/secret.txt for extras."
    out = collect_outbound_files(reply)
    assert len(out) == 1
    assert out[0]["name"] == "report.txt"
    # Staged copy lives under chat-uploads (the download endpoint's whitelist)
    assert out[0]["path"].startswith(str(uploads))
|
||||
|
||||
|
||||
def test_ensure_workspace_writable_chmods_777(tmp_path, monkeypatch):
    """The platform-level hook opens /workspace and chat-uploads to mode 777
    so agents running as any non-root user can write files the user will
    then download — the single point of fix for what used to require a
    chmod in every template's Dockerfile."""
    import stat
    from executor_helpers import ensure_workspace_writable

    workspace = tmp_path / "workspace"
    workspace.mkdir(mode=0o755)
    chat_uploads = workspace / ".molecule" / "chat-uploads"
    # Deliberately not pre-created — the helper must makedirs it itself.
    monkeypatch.setattr("executor_helpers.WORKSPACE_MOUNT", str(workspace))
    monkeypatch.setattr("executor_helpers.CHAT_UPLOADS_DIR", str(chat_uploads))

    ensure_workspace_writable()

    assert chat_uploads.is_dir(), "chat-uploads dir should be created"
    # Both the mount root and the uploads dir end up world-writable.
    for opened_dir in (workspace, chat_uploads):
        assert stat.S_IMODE(opened_dir.stat().st_mode) == 0o777
|
||||
|
||||
|
||||
def test_ensure_workspace_writable_tolerates_non_root(tmp_path, monkeypatch, caplog):
    """When molecule-runtime isn't root (rare CP configurations) the chmod
    must no-op instead of crashing boot: a misconfigured permission is
    recoverable, while a SystemExit here would wedge the workspace in
    provisioning forever."""
    import logging
    from executor_helpers import ensure_workspace_writable

    workspace = tmp_path / "workspace"
    workspace.mkdir()
    monkeypatch.setattr("executor_helpers.WORKSPACE_MOUNT", str(workspace))
    monkeypatch.setattr("executor_helpers.CHAT_UPLOADS_DIR", str(workspace / "x"))

    # Simulate an unprivileged process: every chmod attempt is denied.
    def _deny(*_args, **_kwargs):
        raise PermissionError("Operation not permitted")

    monkeypatch.setattr("executor_helpers.os.chmod", _deny)
    with caplog.at_level(logging.INFO, logger="executor_helpers"):
        ensure_workspace_writable()  # must not raise
|
||||
|
||||
|
||||
def test_collect_outbound_files_deduplicates(tmp_path, monkeypatch):
    """A reply mentioning the same workspace path twice should attach the
    file only once."""
    from pathlib import Path as _Path
    from executor_helpers import collect_outbound_files

    # Sandbox the workspace mount and staging dir under tmp_path;
    # resolve() keeps prefixes consistent with the helper's own resolve().
    ws_root = _Path(str(tmp_path / "workspace"))
    ws_root.mkdir()
    ws_root = ws_root.resolve()
    uploads = ws_root / ".molecule" / "chat-uploads"
    uploads.mkdir(parents=True)
    monkeypatch.setattr("executor_helpers.WORKSPACE_MOUNT", str(ws_root))
    monkeypatch.setattr("executor_helpers.CHAT_UPLOADS_DIR", str(uploads))
    # Rebuild the cached regex against the overridden mount.
    # FIX: re.escape() the mount path — tmp dirs can contain regex
    # metacharacters (e.g. "." on macOS), and an unescaped path would
    # silently change what the pattern matches.
    import re as _re
    monkeypatch.setattr(
        "executor_helpers._WORKSPACE_PATH_RE",
        _re.compile(rf"(?:^|[\s`(\[])({_re.escape(str(ws_root))}/[A-Za-z0-9_./\-]+)"),
    )

    report = ws_root / "report.txt"
    report.write_text("data")
    reply = f"Wrote {report}. Again at {report}."
    out = collect_outbound_files(reply)
    assert len(out) == 1
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user