Merge branch 'staging' into feat/external-runtime-first-class

This commit is contained in:
Hongming Wang 2026-04-26 10:40:15 -07:00 committed by GitHub
commit 78afa0f544
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
51 changed files with 3775 additions and 118 deletions

113
.github/workflows/auto-tag-runtime.yml vendored Normal file
View File

@ -0,0 +1,113 @@
name: auto-tag-runtime
# Auto-tag runtime releases on every merge to main that touches workspace/.
# This is the entry point of the runtime CD chain:
#
# merge PR → auto-tag-runtime (this) → publish-runtime → cascade → template
# image rebuilds → repull on hosts.
#
# Default bump is patch. Override via PR label `release:minor` or
# `release:major` BEFORE merging — the label is read off the merged PR
# associated with the push commit.
#
# Skips when:
# - The push isn't to main (other branches don't auto-release).
# - The merge commit message contains `[skip-release]` (escape hatch
# for cleanup PRs that touch workspace/ but shouldn't ship).
on:
push:
branches: [main]
paths:
- "workspace/**"
- "scripts/build_runtime_package.py"
- ".github/workflows/auto-tag-runtime.yml"
- ".github/workflows/publish-runtime.yml"
permissions:
contents: write # to push the new tag
pull-requests: read # to read labels off the merged PR
concurrency:
# Serialize tag bumps so two near-simultaneous merges can't both think
# they're 0.1.6 and race to push the same tag.
group: auto-tag-runtime
cancel-in-progress: false
jobs:
tag:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0 # need full tag history for `git describe` / sort
- name: Skip when commit asks
id: skip
run: |
MSG=$(git log -1 --format=%B "${{ github.sha }}")
if echo "$MSG" | grep -qiE '\[skip-release\]|\[no-release\]'; then
echo "skip=true" >> "$GITHUB_OUTPUT"
echo "Commit message contains [skip-release] — no tag will be created."
else
echo "skip=false" >> "$GITHUB_OUTPUT"
fi
- name: Determine bump kind from PR label
id: bump
if: steps.skip.outputs.skip != 'true'
env:
GH_TOKEN: ${{ github.token }}
run: |
# The merged PR for this push commit. `gh pr list --search` finds
# merged PRs whose merge commit matches; we take the first.
PR=$(gh pr list --state merged --search "${{ github.sha }}" --json number,labels --jq '.[0]' 2>/dev/null || echo "")
if [ -z "$PR" ] || [ "$PR" = "null" ]; then
echo "No merged PR found for ${{ github.sha }} — defaulting to patch bump."
echo "kind=patch" >> "$GITHUB_OUTPUT"
exit 0
fi
LABELS=$(echo "$PR" | jq -r '.labels[].name')
if echo "$LABELS" | grep -qx 'release:major'; then
echo "kind=major" >> "$GITHUB_OUTPUT"
elif echo "$LABELS" | grep -qx 'release:minor'; then
echo "kind=minor" >> "$GITHUB_OUTPUT"
else
echo "kind=patch" >> "$GITHUB_OUTPUT"
fi
- name: Compute next version from latest runtime-v* tag
id: version
if: steps.skip.outputs.skip != 'true'
run: |
# Find the highest runtime-vX.Y.Z tag. `sort -V` handles semver
# ordering; the `runtime-v*` glob filters to the right tag prefix.
LATEST=$(git tag --list 'runtime-v*' | sort -V | tail -1)
if [ -z "$LATEST" ]; then
# No prior tag — treat the baseline as 0.0.0 so the first bump produces 0.0.1 (or higher via a release label).
CURRENT="0.0.0"
else
CURRENT="${LATEST#runtime-v}"
fi
MAJOR=$(echo "$CURRENT" | cut -d. -f1)
MINOR=$(echo "$CURRENT" | cut -d. -f2)
PATCH=$(echo "$CURRENT" | cut -d. -f3)
case "${{ steps.bump.outputs.kind }}" in
major) MAJOR=$((MAJOR+1)); MINOR=0; PATCH=0;;
minor) MINOR=$((MINOR+1)); PATCH=0;;
patch) PATCH=$((PATCH+1));;
esac
NEW="$MAJOR.$MINOR.$PATCH"
echo "current=$CURRENT" >> "$GITHUB_OUTPUT"
echo "new=$NEW" >> "$GITHUB_OUTPUT"
echo "Bumping runtime $CURRENT → $NEW (${{ steps.bump.outputs.kind }})"
- name: Push new tag
if: steps.skip.outputs.skip != 'true'
run: |
NEW_TAG="runtime-v${{ steps.version.outputs.new }}"
git config user.name "github-actions[bot]"
git config user.email "41898282+github-actions[bot]@users.noreply.github.com"
git tag -a "$NEW_TAG" -m "runtime $NEW_TAG (auto-bump from ${{ steps.bump.outputs.kind }})"
git push origin "$NEW_TAG"
echo "Pushed $NEW_TAG — publish-runtime workflow will fire on the tag."

View File

@ -283,7 +283,9 @@ jobs:
cache: pip
cache-dependency-path: workspace/requirements.txt
- run: pip install -r requirements.txt pytest pytest-asyncio pytest-cov
- run: python -m pytest --tb=short -q --cov=. --cov-report=term-missing
# Coverage flags + fail-under floor moved into workspace/pytest.ini
# (issue #1817) so local `pytest` and CI use identical config.
- run: python -m pytest --tb=short
# SDK + plugin validation moved to standalone repo:
# github.com/Molecule-AI/molecule-sdk-python

161
.github/workflows/publish-runtime.yml vendored Normal file
View File

@ -0,0 +1,161 @@
name: publish-runtime
# Publishes molecule-ai-workspace-runtime to PyPI from monorepo workspace/.
# Monorepo workspace/ is the only source-of-truth for runtime code; this
# workflow is the bridge from monorepo edits to the PyPI artifact that
# the 8 workspace-template-* repos depend on.
#
# Triggered by:
# - Pushing a tag matching `runtime-vX.Y.Z` (the version is derived from
# the tag — `runtime-v0.1.6` publishes `0.1.6`).
# - Manual workflow_dispatch with an explicit `version` input (useful for
# dev/test releases without tagging the repo).
#
# The workflow:
# 1. Runs scripts/build_runtime_package.py to copy workspace/ →
# build/molecule_runtime/ with imports rewritten (`a2a_client` →
# `molecule_runtime.a2a_client`).
# 2. Builds wheel + sdist with `python -m build`.
# 3. Publishes to PyPI via twine + repo secret PYPI_TOKEN.
#
# After publish: the 8 template repos pick up the new version on their
# next image rebuild (their requirements.txt pin is
# `molecule-ai-workspace-runtime>=0.1.0`, so any new release is eligible).
# To force-pull immediately, bump the pin in each template repo's
# requirements.txt and merge — that triggers their own publish-image.yml.
on:
push:
tags:
- "runtime-v*"
workflow_dispatch:
inputs:
version:
description: "Version to publish (e.g. 0.1.6). Required for manual dispatch."
required: true
type: string
permissions:
contents: read
jobs:
publish:
runs-on: ubuntu-latest
environment: pypi-publish
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: "3.11"
cache: pip
- name: Derive version from tag or input
id: version
run: |
if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
VERSION="${{ inputs.version }}"
else
# Tag is `runtime-vX.Y.Z` — strip the prefix.
VERSION="${GITHUB_REF_NAME#runtime-v}"
fi
if ! echo "$VERSION" | grep -qE '^[0-9]+\.[0-9]+\.[0-9]+(\.dev[0-9]+|rc[0-9]+|a[0-9]+|b[0-9]+|\.post[0-9]+)?$'; then
echo "::error::version $VERSION does not match PEP 440"
exit 1
fi
echo "version=$VERSION" >> "$GITHUB_OUTPUT"
echo "Publishing molecule-ai-workspace-runtime $VERSION"
- name: Install build tooling
run: pip install build twine
- name: Build package from workspace/
run: |
python scripts/build_runtime_package.py \
--version "${{ steps.version.outputs.version }}" \
--out "${{ runner.temp }}/runtime-build"
- name: Build wheel + sdist
working-directory: ${{ runner.temp }}/runtime-build
run: python -m build
- name: Verify package contents (sanity)
working-directory: ${{ runner.temp }}/runtime-build
run: |
python -m twine check dist/*
# Smoke-import the built wheel to catch import-rewrite mistakes
# before they hit PyPI. The package depends on a2a-sdk + httpx
# via pyproject; install those so the smoke import resolves.
python -m venv /tmp/smoke
/tmp/smoke/bin/pip install --quiet dist/*.whl
WORKSPACE_ID=00000000-0000-0000-0000-000000000000 \
PLATFORM_URL=http://localhost:8080 \
/tmp/smoke/bin/python -c "
from molecule_runtime import a2a_client, a2a_tools
from molecule_runtime.builtin_tools import memory
from molecule_runtime.adapters import get_adapter, BaseAdapter, AdapterConfig
assert a2a_client._A2A_QUEUED_PREFIX, 'queued prefix missing — chat-leak fix not in build'
print('✓ smoke import passed')
"
- name: Publish to PyPI
working-directory: ${{ runner.temp }}/runtime-build
env:
TWINE_USERNAME: __token__
TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
run: python -m twine upload dist/*
cascade:
# After PyPI accepts the upload, fan out a repository_dispatch to each
# template repo so they rebuild their image against the new runtime.
# Each template's `runtime-published.yml` receiver picks up the event,
# pulls the new PyPI version (their requirements.txt pin is `>=`), and
# republishes ghcr.io/molecule-ai/workspace-template-<runtime>:latest.
#
# Soft-fail per repo: if one template's dispatch fails (perms missing,
# repo archived, etc.) we still try the others and surface the failures
# in the workflow summary instead of aborting the whole cascade.
needs: publish
runs-on: ubuntu-latest
steps:
- name: Fan out repository_dispatch
env:
# Fine-grained PAT with `actions:write` on the 8 template repos.
# GITHUB_TOKEN can't fire dispatches across repos — needs an explicit
# token. Stored as a repo secret; rotate per the standard schedule.
DISPATCH_TOKEN: ${{ secrets.TEMPLATE_DISPATCH_TOKEN }}
run: |
set +e # don't abort on a single repo failure — collect them all
if [ -z "$DISPATCH_TOKEN" ]; then
echo "::warning::TEMPLATE_DISPATCH_TOKEN secret not set — skipping cascade. PyPI was published; templates will pick up the new version on their own next rebuild."
exit 0
fi
# The publish job doesn't expose a version output, so derive the
# version here from the tag (or the workflow_dispatch input).
VERSION="${GITHUB_REF_NAME#runtime-v}"
if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
VERSION="${{ inputs.version }}"
fi
TEMPLATES="claude-code langgraph crewai autogen deepagents hermes gemini-cli openclaw"
FAILED=""
for tpl in $TEMPLATES; do
REPO="Molecule-AI/molecule-ai-workspace-template-$tpl"
STATUS=$(curl -sS -o /tmp/dispatch.out -w "%{http_code}" \
-X POST "https://api.github.com/repos/$REPO/dispatches" \
-H "Authorization: Bearer $DISPATCH_TOKEN" \
-H "Accept: application/vnd.github+json" \
-H "X-GitHub-Api-Version: 2022-11-28" \
-d "{\"event_type\":\"runtime-published\",\"client_payload\":{\"runtime_version\":\"$VERSION\"}}")
if [ "$STATUS" = "204" ]; then
echo "✓ dispatched $tpl ($VERSION)"
else
echo "::warning::✗ failed to dispatch $tpl: HTTP $STATUS — $(cat /tmp/dispatch.out)"
FAILED="$FAILED $tpl"
fi
done
if [ -n "$FAILED" ]; then
echo "::warning::Cascade incomplete. Failed templates:$FAILED"
# Don't fail the whole job — PyPI publish already succeeded;
# operators can retry the failed templates manually.
fi

View File

@ -0,0 +1,77 @@
name: Runtime Pin Compatibility
# CI gate that prevents the 5-hour staging outage from 2026-04-24 from
# recurring (controlplane#253). The original failure mode:
# 1. molecule-ai-workspace-runtime 0.1.13 declared `a2a-sdk<1.0` in its
# requires_dist metadata (incorrect — it actually imports
# a2a.server.routes which only exists in a2a-sdk 1.0+)
# 2. `pip install molecule-ai-workspace-runtime` resolved cleanly
# 3. `from molecule_runtime.main import main_sync` raised ImportError
# 4. Every tenant workspace crashed; the canary tenant caught it but
# only after 5 hours of degraded staging
#
# This workflow installs the runtime in a fresh Python venv from PyPI
# and tries the same import the EC2 user-data does. If pip resolution
# silently produces a broken combo, this gate fails before the tenant
# image gets published.
on:
push:
branches: [main, staging]
paths:
- 'workspace/requirements.txt'
- '.github/workflows/runtime-pin-compat.yml'
pull_request:
branches: [main, staging]
paths:
- 'workspace/requirements.txt'
- '.github/workflows/runtime-pin-compat.yml'
# Daily catch for upstream PyPI publishes that break the pin combo
# without any change in our repo (e.g. someone re-yanks an a2a-sdk
# release or molecule-ai-workspace-runtime publishes a bad bump).
schedule:
- cron: '0 13 * * *' # 06:00 PT
workflow_dispatch:
# Required-check support: when this becomes a branch-protection gate,
# merge_group runs let the queue green-check this in addition to PRs.
merge_group:
types: [checks_requested]
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
default-install:
name: Default install + import smoke
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: '3.11'
cache: pip
cache-dependency-path: workspace/requirements.txt
- name: Install runtime + workspace requirements
# Install order is load-bearing: install the runtime FIRST so pip
# honors whatever a2a-sdk constraint the runtime metadata declares
# (this is the surface that broke on 2026-04-24 — the runtime declared
# `a2a-sdk<1.0` but actually needed >=1.0). The follow-up install
# of workspace/requirements.txt then upgrades a2a-sdk to the
# constraint our runtime image actually pins. The import smoke
# below verifies the upgraded combination is consistent.
run: |
python -m venv /tmp/venv
/tmp/venv/bin/pip install --upgrade pip
/tmp/venv/bin/pip install molecule-ai-workspace-runtime
/tmp/venv/bin/pip install -r workspace/requirements.txt
/tmp/venv/bin/pip show molecule-ai-workspace-runtime a2a-sdk \
| grep -E '^(Name|Version):'
- name: Smoke import — fail if metadata declares deps that don't satisfy real imports
# WORKSPACE_ID is validated at import time by platform_auth.py — EC2
# user-data sets it from the cloud-init template; set a placeholder
# here so the import smoke doesn't trip on the env-var guard.
env:
WORKSPACE_ID: 00000000-0000-0000-0000-000000000001
run: |
/tmp/venv/bin/python -c "from molecule_runtime.main import main_sync; print('runtime imports OK')"

124
.github/workflows/sweep-cf-orphans.yml vendored Normal file
View File

@ -0,0 +1,124 @@
name: Sweep stale Cloudflare DNS records
# Janitor for Cloudflare DNS records whose backing tenant/workspace no
# longer exists. Without this loop, every short-lived E2E or canary
# leaves a CF record on the moleculesai.app zone — the zone has a
# 200-record quota (controlplane#239 hit it 2026-04-23+) and provisions
# start failing with code 81045 once exhausted.
#
# Why a separate workflow vs sweep-stale-e2e-orgs.yml:
# - That workflow operates at the CP layer (DELETE /cp/admin/tenants/:slug
# drives the cascade). It assumes CP has the org row to drive the
# deprovision from. It doesn't catch records left behind when CP
# itself never knew about the tenant (canary scratch, manual ops
# experiments) or when the cascade's CF-delete branch failed.
# - sweep-cf-orphans.sh enumerates the CF zone directly and matches
# each record against live CP slugs + AWS EC2 names. It catches
# leaks the CP-driven sweep can't.
#
# Safety: the script's own MAX_DELETE_PCT gate refuses to nuke more
# than 50% of records in a single run. If something has gone weird
# (CP admin endpoint returns no orgs → every tenant looks orphan) the
# gate halts before damage. Decision-function unit tests in
# scripts/ops/test_sweep_cf_decide.py (#2027) cover the rule
# classifier.
on:
schedule:
# Hourly. Mirrors sweep-stale-e2e-orgs cadence so the two janitors
# converge on the same tick. CF API rate budget is generous (1200
# req/5min); a single sweep makes ~1 list + N deletes (N<=quota/2).
- cron: '15 * * * *' # offset from sweep-stale-e2e-orgs (top of hour)
workflow_dispatch:
inputs:
dry_run:
description: "Dry run only — list what would be deleted, no deletion"
required: false
type: boolean
default: true
max_delete_pct:
description: "Override safety gate (default 50, set higher only for major cleanup)"
required: false
default: "50"
# No `merge_group:` trigger on purpose. This is a janitor — it doesn't
# need to gate merges, and including it as written before #2088 fired
# the full sweep job (or its secret-check) on every PR going through
# the merge queue, generating one red CI run per merge-queue eval. If
# this workflow is ever wired up as a required check, re-add
# merge_group: { types: [checks_requested] }
# AND gate the sweep step with `if: github.event_name != 'merge_group'`
# so merge-queue evals report success without actually running.
# Don't let two sweeps race the same zone. workflow_dispatch during a
# scheduled run would otherwise issue duplicate DELETE calls.
concurrency:
group: sweep-cf-orphans
cancel-in-progress: false
permissions:
contents: read
jobs:
sweep:
name: Sweep CF orphans
runs-on: ubuntu-latest
# 3 min surfaces hangs (CF API stall, AWS describe-instances stuck)
# within one cron interval instead of burning a full tick. Realistic
# worst case is ~2 min: 4 sequential curls + 1 aws + N×CF-DELETE
# each individually capped at 10s by the script's curl -m flag.
timeout-minutes: 3
env:
CF_API_TOKEN: ${{ secrets.CF_API_TOKEN }}
CF_ZONE_ID: ${{ secrets.CF_ZONE_ID }}
CP_PROD_ADMIN_TOKEN: ${{ secrets.CP_PROD_ADMIN_TOKEN }}
CP_STAGING_ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_TOKEN }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
AWS_DEFAULT_REGION: us-east-2
MAX_DELETE_PCT: ${{ github.event.inputs.max_delete_pct || '50' }}
steps:
- uses: actions/checkout@v4
- name: Verify required secrets present
id: verify
# Soft skip when secrets aren't configured. The 6 secrets have
# to be set on the repo manually before this workflow can do
# real work; until they are, the schedule is a no-op rather
# than a recurring red CI run. workflow_dispatch surfaces a
# warning so an operator running it ad-hoc sees the gap.
run: |
missing=()
for var in CF_API_TOKEN CF_ZONE_ID CP_PROD_ADMIN_TOKEN CP_STAGING_ADMIN_TOKEN AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY; do
if [ -z "${!var:-}" ]; then
missing+=("$var")
fi
done
if [ ${#missing[@]} -gt 0 ]; then
echo "::warning::skipping sweep — secrets not yet configured: ${missing[*]}"
echo "skip=true" >> "$GITHUB_OUTPUT"
exit 0
fi
echo "All required secrets present ✓"
echo "skip=false" >> "$GITHUB_OUTPUT"
- name: Run sweep
if: steps.verify.outputs.skip != 'true'
# Schedule-vs-dispatch dry-run asymmetry (intentional):
# - Scheduled runs: github.event.inputs.dry_run is empty →
# defaults to "false" below → script runs with --execute
# (the whole point of an hourly janitor).
# - Manual workflow_dispatch: input default is true (line 38)
# so an ad-hoc operator-triggered run is dry-run by default;
# they have to flip the toggle to actually delete.
# The script's MAX_DELETE_PCT gate (default 50%) is the second
# line of defense regardless of mode.
run: |
set -euo pipefail
if [ "${{ github.event.inputs.dry_run || 'false' }}" = "true" ]; then
echo "Running in dry-run mode — no deletions"
bash scripts/ops/sweep-cf-orphans.sh
else
echo "Running with --execute — will delete identified orphans"
bash scripts/ops/sweep-cf-orphans.sh --execute
fi

36
.github/workflows/test-ops-scripts.yml vendored Normal file
View File

@ -0,0 +1,36 @@
name: Ops Scripts Tests
# Runs the unittest suite for scripts/ops/ on every PR + push that touches
# the directory. Kept separate from the main CI so a script-only change
# doesn't trigger the heavier Go/Canvas/Python pipelines.
on:
push:
branches: [main, staging]
paths:
- 'scripts/ops/**'
- '.github/workflows/test-ops-scripts.yml'
pull_request:
branches: [main, staging]
paths:
- 'scripts/ops/**'
- '.github/workflows/test-ops-scripts.yml'
merge_group:
types: [checks_requested]
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
test:
name: Ops scripts (unittest)
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: '3.11'
- name: Run unittest
working-directory: scripts/ops
run: python -m unittest discover -p 'test_*.py' -v

View File

@ -46,7 +46,16 @@ const TENANT_DOMAIN = process.env.STAGING_TENANT_DOMAIN || "staging.moleculesai.
// were blocking staging→main syncs on 2026-04-24.
const PROVISION_TIMEOUT_MS = 20 * 60 * 1000;
const WORKSPACE_ONLINE_TIMEOUT_MS = 20 * 60 * 1000;
const TLS_TIMEOUT_MS = 3 * 60 * 1000;
// TLS readiness depends on (1) Cloudflare DNS propagation through the
// edge, (2) the tenant's CF Tunnel registering the new hostname, (3)
// CF's edge ACME cert provisioning + cache. Each of these layers can
// add 1-3 min on its own under heavy staging load. The original 3-min
// cap blocked four cycles of staging→main PRs across 2026-04-24+.
// 10 min stays inside the 20-min PROVISION_TIMEOUT envelope (so a
// genuinely-stuck tenant still fails-loud at the provision step) but
// absorbs the realistic worst case for a one-shot tenant TLS handshake.
const TLS_TIMEOUT_MS = 10 * 60 * 1000;
async function jsonFetch(
url: string,

View File

@ -71,15 +71,22 @@ export function ProvisioningTimeout({
// Runtime included so the timeout threshold can be resolved per-node
// (hermes cold-boot legitimately takes 8-13 min vs 30-90s for docker
// runtimes — a single threshold would false-alarm on one or the other).
// Separator: `|` between fields, `,` between nodes. Names may contain
// anything the user typed; strip `|` and `,` so serialization round-trips.
// provisionTimeoutMs added by #2054 — server-declared per-workspace
// override that wins over the runtime profile when present.
// Separator: `|` between fields, `,` between nodes. Only `name` is
// user-typed (gets sanitized below); the other fields are
// primitive-typed (id is a UUID, runtime is a [a-z-]+ slug,
// provisionTimeoutMs is numeric). If a future field is string-typed,
// extend the sanitize step to strip `|` + `,` from it too.
// Empty-string sentinels for missing values so split/index stays positional.
const provisioningNodes = useCanvasStore((s) => {
const result = s.nodes
.filter((n) => n.data.status === "provisioning")
.map((n) => {
const safeName = (n.data.name ?? "").replace(/[|,]/g, " ");
const runtime = n.data.runtime ?? "";
return `${n.id}|${safeName}|${runtime}`;
const provisionTimeoutMs = n.data.provisionTimeoutMs ?? "";
return `${n.id}|${safeName}|${runtime}|${provisionTimeoutMs}`;
});
return result.join(",");
});
@ -87,8 +94,14 @@ export function ProvisioningTimeout({
() =>
provisioningNodes
? provisioningNodes.split(",").map((entry) => {
const [id, name, runtime] = entry.split("|");
return { id, name, runtime };
const [id, name, runtime, provisionTimeoutMs] = entry.split("|");
const ptms = provisionTimeoutMs ? Number(provisionTimeoutMs) : undefined;
return {
id,
name,
runtime,
provisionTimeoutMs: Number.isFinite(ptms) ? ptms : undefined,
};
})
: [],
[provisioningNodes],
@ -138,10 +151,19 @@ export function ProvisioningTimeout({
// default), then scales by concurrent-provisioning count. A
// hermes workspace in a batch alongside two langgraph workspaces
// gets hermes's 12-min base, not langgraph's 2-min base.
//
// Resolution priority (most specific wins):
// 1. node.provisionTimeoutMs — server-declared per-workspace
// override (#2054, sourced from template manifest)
// 2. timeoutMs prop — single-threshold test override
// 3. runtime profile in @/lib/runtimeProfiles
// 4. DEFAULT_RUNTIME_PROFILE
for (const node of parsedProvisioningNodes) {
const startedAt = tracking.get(node.id);
if (!startedAt) continue;
const base = timeoutMs ?? provisionTimeoutForRuntime(node.runtime);
const base = provisionTimeoutForRuntime(node.runtime, {
provisionTimeoutMs: node.provisionTimeoutMs ?? timeoutMs,
});
const effective = effectiveTimeoutMs(
base,
parsedProvisioningNodes.length,

View File

@ -287,5 +287,60 @@ describe("ProvisioningTimeout", () => {
);
});
});
// #2054 — per-workspace server override threading from socket
// payload through node-data into ProvisioningTimeout's resolver.
// Doesn't render the component; verifies the data path lands the
// value where ProvisioningTimeout reads it from.
describe("server-side per-workspace override (#2054)", () => {
it("hydrate carries provision_timeout_ms onto node.data.provisionTimeoutMs", () => {
useCanvasStore.getState().hydrate([
makeWS({
id: "ws-slow",
name: "Slow",
status: "provisioning",
runtime: "future-runtime",
provision_timeout_ms: 600_000,
}),
]);
const node = useCanvasStore
.getState()
.nodes.find((n) => n.id === "ws-slow");
expect(node?.data.provisionTimeoutMs).toBe(600_000);
});
it("absent provision_timeout_ms hydrates to null (falls through to runtime profile)", () => {
useCanvasStore.getState().hydrate([
makeWS({ id: "ws-default", name: "Default", status: "provisioning", runtime: "hermes" }),
]);
const node = useCanvasStore
.getState()
.nodes.find((n) => n.id === "ws-default");
expect(node?.data.provisionTimeoutMs).toBeNull();
// And the resolver still returns hermes' profile value when
// no override is supplied — proves the fall-through stays intact.
expect(
provisionTimeoutForRuntime("hermes", {
provisionTimeoutMs: node?.data.provisionTimeoutMs ?? undefined,
}),
).toBe(RUNTIME_PROFILES.hermes.provisionTimeoutMs);
});
it("server override wins over runtime profile via the resolver path the component uses", () => {
// Mirrors ProvisioningTimeout.tsx:144 where node.provisionTimeoutMs
// is passed as overrides — verifies the resolver respects it
// even when the runtime has its own profile entry.
const override = 30_000;
expect(
provisionTimeoutForRuntime("hermes", {
provisionTimeoutMs: override,
}),
).toBe(override);
// Sanity — the runtime profile would have been much larger.
expect(RUNTIME_PROFILES.hermes.provisionTimeoutMs).toBeGreaterThan(
override,
);
});
});
});
});

View File

@ -478,6 +478,9 @@ export function buildNodesAndEdges(
needsRestart: false,
budgetLimit: ws.budget_limit ?? null,
budgetUsed: ws.budget_used ?? null,
// #2054 — server-declared per-workspace provisioning timeout.
// Falls through to the runtime profile when null/absent.
provisionTimeoutMs: ws.provision_timeout_ms ?? null,
},
};
if (hasParent) {

View File

@ -92,6 +92,12 @@ export interface WorkspaceNodeData extends Record<string, unknown> {
budgetLimit: number | null;
/** Cumulative USD spend. Present when the platform tracks spend (issue #541). */
budgetUsed?: number | null;
/** Per-workspace provisioning-timeout override in milliseconds (#2054).
* Sourced server-side from the workspace's template manifest at provision
* time. null/absent = fall through to runtime profile + default in
* @/lib/runtimeProfiles. Lets a slow runtime declare its cold-boot
* expectation without a canvas release. */
provisionTimeoutMs?: number | null;
}
export type PanelTab = "details" | "skills" | "chat" | "terminal" | "config" | "schedule" | "channels" | "files" | "memory" | "traces" | "events" | "activity" | "audit";

View File

@ -122,6 +122,13 @@ export interface WorkspaceData {
budget_limit: number | null;
/** Cumulative USD spend for this workspace. Present when the platform tracks spend. */
budget_used?: number | null;
/** Server-declared provisioning-timeout override in milliseconds (#2054).
* Sourced from the workspace's template manifest at provision time —
* lets a slow runtime declare its cold-boot expectation without a
* canvas release. Falls through to the per-runtime profile in
* `@/lib/runtimeProfiles` when absent (the default behavior for any
* template that hasn't yet declared the field). */
provision_timeout_ms?: number | null;
}
let socket: ReconnectingSocket | null = null;

View File

@ -126,6 +126,13 @@ services:
REDIS_URL: redis://redis:6379
PORT: "${PLATFORM_PORT:-8080}"
PLATFORM_URL: "http://platform:${PLATFORM_PORT:-8080}"
# Default MOLECULE_ENV=development so the WorkspaceAuth / AdminAuth
# middleware fail-open path activates when ADMIN_TOKEN is unset —
# otherwise the canvas (which runs without a bearer in pure local
# dev) gets 401 "missing workspace auth token" on every request.
# Override to "production" for SaaS/staged deploys; in those modes
# ADMIN_TOKEN must also be set or every request rejects.
MOLECULE_ENV: "${MOLECULE_ENV:-development}"
CORS_ORIGINS: ${CORS_ORIGINS:-http://localhost:${CANVAS_PUBLISH_PORT:-3000},http://127.0.0.1:${CANVAS_PUBLISH_PORT:-3000},http://localhost:3001}
RATE_LIMIT: "${RATE_LIMIT:-1000}"
CONFIGS_DIR: /configs
@ -153,6 +160,24 @@ services:
HIBERNATION_IDLE_MINUTES: "${HIBERNATION_IDLE_MINUTES:-}"
# Plugin supply chain hardening (issue #768 / PR #775). Never set in production.
PLUGIN_ALLOW_UNPINNED: "${PLUGIN_ALLOW_UNPINNED:-}"
# Force ImagePull/ContainerCreate to request linux/amd64 manifests
# for the workspace-template-* images. The templates ship single-arch
# amd64 today; without this override, an arm64 host (Apple Silicon)
# asks the daemon for linux/arm64/v8, which doesn't match the manifest
# and the pull fails with "no matching manifest". Apple Silicon will
# run the amd64 image under Rosetta — slower (~2-3×) but functional.
# Override to "" or another platform when the templates start shipping
# multi-arch (then this hardcoded amd64 becomes unnecessary).
MOLECULE_IMAGE_PLATFORM: "${MOLECULE_IMAGE_PLATFORM:-linux/amd64}"
# GHCR auth for the workspace-images refresh endpoint
# (POST /admin/workspace-images/refresh). When set, the platform's
# Docker SDK ImagePull on private workspace-template-* images
# succeeds without per-host `docker login`. GHCR_USER is the GitHub
# username; GHCR_TOKEN is a fine-grained PAT with `read:packages`
# on the Molecule-AI org. Both unset → endpoint can only pull
# public images (current state for all 8 templates).
GHCR_USER: "${GHCR_USER:-}"
GHCR_TOKEN: "${GHCR_TOKEN:-}"
volumes:
- ./workspace-configs-templates:/configs
- ./org-templates:/org-templates:ro

View File

@ -2,29 +2,67 @@
## Overview
The shared workspace runtime infrastructure lives in two places:
The shared workspace runtime infrastructure has **one editable source** and
**one published artifact**:
1. **Source of truth (monorepo):** `workspace/` — this is where all development happens
2. **Published package:** [`molecule-ai-workspace-runtime`](https://pypi.org/project/molecule-ai-workspace-runtime/) on PyPI
1. **Source of truth (monorepo, editable):** `workspace/` — every runtime
change lands here. Edit it like any other monorepo code.
2. **Published artifact (PyPI, generated):** [`molecule-ai-workspace-runtime`](https://pypi.org/project/molecule-ai-workspace-runtime/)
— produced by `.github/workflows/publish-runtime.yml` on every
`runtime-vX.Y.Z` tag push. Do NOT edit this independently — it gets
overwritten on every publish.
The legacy sibling repo `molecule-ai-workspace-runtime` (the GitHub repo, as
distinct from the PyPI package) is no longer the source-of-truth and should
be treated as a publish artifact only. It can be archived or used as a
read-only mirror.
## Why this shape
The 8 workspace template repos (claude-code, langgraph, hermes, etc.) each
build their own Docker image and `pip install molecule-ai-workspace-runtime`
from PyPI. PyPI is the right distribution channel — semver, reproducible
builds, no submodule dance per-repo. But the runtime ALSO needs to evolve
in lock-step with the platform's wire protocol (queue shape, A2A metadata,
event payloads). Shipping cross-cutting protocol changes as separate
runtime + platform PRs in two repos creates ordering pain and broken
intermediate states.
The monorepo + auto-publish split gives both: edit cross-cutting changes
in one PR, publish the runtime artifact via a tag.
## What's in the package
Everything in `workspace/` except adapter-specific code:
Everything in `workspace/*.py` plus the `adapters/`, `builtin_tools/`,
`plugins_registry/`, `policies/`, `skill_loader/` subpackages. Build
artifacts (`Dockerfile`, `*.sh`, `pytest.ini`, `requirements.txt`) are
excluded.
- `molecule_runtime/` — all shared `.py` files (main.py, config.py, heartbeat.py, etc.)
- `molecule_runtime/adapters/` — `BaseAdapter`, `AdapterConfig`, `SetupResult`, `shared_runtime`
- `molecule_runtime/builtin_tools/` — delegation, memory, approvals, sandbox, telemetry
- `molecule_runtime/skill_loader/` — skill loading + hot-reload
- `molecule_runtime/plugins_registry/` — plugin discovery and install pipeline
- `molecule_runtime/policies/` — namespace routing policies
- Console script: `molecule-runtime` → `molecule_runtime.main:main_sync`
The build script rewrites bare imports so the published package is a
proper Python namespace:
```
# In monorepo workspace/:
from a2a_client import discover_peer
from builtin_tools.memory import store
# In published molecule_runtime/ (auto-rewritten at publish time):
from molecule_runtime.a2a_client import discover_peer
from molecule_runtime.builtin_tools.memory import store
```
The closed allowlist of rewritten module names lives in
`scripts/build_runtime_package.py` (`TOP_LEVEL_MODULES` + `SUBPACKAGES`).
Add a new top-level module to workspace/? Add it to the allowlist in the
same PR.
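
A quick way to catch a missed allowlist entry before it ships is to diff the
`workspace/` top-level modules against the build script's sets. This is a
hypothetical local check, not something the repo runs — the two set names are
real (they come from `scripts/build_runtime_package.py`); everything else is
illustrative:

```python
# Hypothetical pre-publish check: every top-level .py in workspace/ should be
# covered by the rewrite allowlist, otherwise its bare imports stay unrewritten
# in the published package.
import sys
from pathlib import Path

sys.path.insert(0, "scripts")  # assumes you run this from the monorepo root
from build_runtime_package import SUBPACKAGES, TOP_LEVEL_MODULES

stems = {p.stem for p in Path("workspace").glob("*.py")}
missing = stems - TOP_LEVEL_MODULES - SUBPACKAGES
if missing:
    print("not in the allowlist:", ", ".join(sorted(missing)))
```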
## Adapter repos
Each of the 8 adapter repos now contains:
Each of the 8 adapter template repos contains:
- `adapter.py` — runtime-specific `Adapter` class
- `requirements.txt` — `molecule-ai-workspace-runtime>=0.1.0` + adapter deps
- `Dockerfile` — standalone image (no longer extends workspace-template:base)
- `requirements.txt` — `molecule-ai-workspace-runtime>=0.1.X` + adapter deps
- `Dockerfile` — standalone image with `ENV ADAPTER_MODULE=adapter` and
`ENTRYPOINT ["molecule-runtime"]`
| Adapter | Repo |
|---------|------|
@ -39,8 +77,8 @@ Each of the 8 adapter repos now contains:
## Adapter discovery (ADAPTER_MODULE)
Standalone adapter repos set `ENV ADAPTER_MODULE=adapter` in their Dockerfile.
The runtime's `get_adapter()` checks this env var first:
Standalone adapter repos set `ENV ADAPTER_MODULE=adapter` in their
Dockerfile. The runtime's `get_adapter()` checks this env var first:
```python
# In molecule_runtime/adapters/__init__.py
@ -49,25 +87,104 @@ def get_adapter(runtime: str) -> type[BaseAdapter]:
if adapter_module:
mod = importlib.import_module(adapter_module)
return getattr(mod, "Adapter")
# Fall back to built-in subdirectory scan (monorepo local dev)
...
raise KeyError(...)
```
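
A rough usage sketch of that env-var path, as a template container would
exercise it (this assumes `adapter.py` is importable from the container's
working directory, which the template Dockerfiles arrange; per the snippet
above, the `runtime` argument is only consulted by the fallback scan):

```python
# Hedged sketch — ADAPTER_MODULE is normally baked in via the Dockerfile's
# `ENV ADAPTER_MODULE=adapter`; setting it here just mimics that.
import os

os.environ.setdefault("ADAPTER_MODULE", "adapter")

from molecule_runtime.adapters import get_adapter

AdapterCls = get_adapter("claude-code")  # resolves to adapter.Adapter via importlib
```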
## Publishing a new version
```bash
cd workspace-template
# 1. Bump version in pyproject.toml
# 2. Sync to molecule-ai-workspace-runtime repo
# 3. Tag and push — CI publishes to PyPI via PYPI_TOKEN secret
# From any local checkout of the monorepo, after merging your runtime change:
git tag runtime-v0.1.6
git push origin runtime-v0.1.6
```
Or manually:
```bash
cd workspace-template
python -m build
python -m twine upload dist/*
The `publish-runtime` workflow takes over — checks out the tag, runs
`scripts/build_runtime_package.py --version 0.1.6`, builds wheel + sdist,
runs a smoke import to catch broken rewrites, and uploads to PyPI via
the `PYPI_TOKEN` repo secret.
For dev/test releases without tagging, dispatch the workflow manually
with an explicit version (e.g. `0.1.6.dev1` — PEP 440 dev/rc/post forms
are accepted).
After publish, the 8 template repos pick up the new version on their
next `:latest` rebuild. To force-pull immediately, bump the pin in each
template's `requirements.txt`.
## End-to-end CD chain
The full chain from monorepo merge → workspace containers running new code:
```
1. Merge PR with workspace/ changes to main
2. .github/workflows/auto-tag-runtime.yml fires
↓ reads PR labels (release:major/minor) or defaults to patch
↓ pushes runtime-vX.Y.Z tag
3. .github/workflows/publish-runtime.yml fires (on the tag)
↓ builds wheel via scripts/build_runtime_package.py
↓ smoke-imports the wheel
↓ uploads to PyPI
↓ cascade job fires repository_dispatch (event-type: runtime-published)
↓ to all 8 workspace-template-* repos
4. Each template's publish-image.yml fires (on repository_dispatch)
↓ rebuilds Dockerfile (which pip-installs the new PyPI version)
↓ pushes ghcr.io/molecule-ai/workspace-template-<runtime>:latest
5. Production hosts run scripts/refresh-workspace-images.sh
OR an operator hits POST /admin/workspace-images/refresh on the platform
↓ docker pull all 8 :latest tags
↓ remove + force-recreate any running ws-* containers using a refreshed image
↓ canvas re-provisions the workspaces on next interaction
```
Steps 1-4 are fully automated. Step 5 is one-click: a single curl or shell
command. SaaS deployments typically wire step 5 into their normal deploy
pipeline (every release pulls fresh images on every host); local dev fires
it manually after a runtime release lands.
### Required secrets
| Secret | Where | Why |
|---|---|---|
| `PYPI_TOKEN` | molecule-core repo | Twine upload auth (PyPI) |
| `TEMPLATE_DISPATCH_TOKEN` | molecule-core repo | Fine-grained PAT with `actions:write` on the 8 template repos. Without it the `cascade` job warns and exits clean — PyPI still publishes; templates just don't auto-rebuild. |
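
If a single template's dispatch fails, the cascade soft-fails and an operator
can re-fire just that repo by hand. A hedged Python equivalent of the
workflow's `curl` (the template slug and version below are example values; the
token needs the same `actions:write` scope as `TEMPLATE_DISPATCH_TOKEN`):

```python
# Manually re-send the runtime-published dispatch to one template repo.
# Mirrors the cascade step in publish-runtime.yml; values are examples.
import json
import os
import urllib.request

tpl = "claude-code"     # example: the template that failed in the fan-out
version = "0.1.6"       # example: the runtime version that was just published
token = os.environ["TEMPLATE_DISPATCH_TOKEN"]

req = urllib.request.Request(
    f"https://api.github.com/repos/Molecule-AI/molecule-ai-workspace-template-{tpl}/dispatches",
    data=json.dumps({
        "event_type": "runtime-published",
        "client_payload": {"runtime_version": version},
    }).encode(),
    headers={
        "Authorization": f"Bearer {token}",
        "Accept": "application/vnd.github+json",
        "X-GitHub-Api-Version": "2022-11-28",
    },
    method="POST",
)
with urllib.request.urlopen(req) as resp:
    print(resp.status)  # 204 means GitHub accepted the dispatch
```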
### Step 5 specifics
**Local dev (compose stack):**
```bash
bash scripts/refresh-workspace-images.sh # all runtimes
bash scripts/refresh-workspace-images.sh --runtime claude-code
bash scripts/refresh-workspace-images.sh --no-recreate # pull only, leave containers
```
**Via platform admin endpoint (any deploy):**
```bash
curl -X POST "$PLATFORM/admin/workspace-images/refresh"
curl -X POST "$PLATFORM/admin/workspace-images/refresh?runtime=claude-code"
curl -X POST "$PLATFORM/admin/workspace-images/refresh?recreate=false"
```
The endpoint pulls + recreates from inside the platform container, so it
needs Docker socket access (the compose stack mounts
`/var/run/docker.sock` already) AND GHCR auth on the host's docker config
(`docker login ghcr.io` once per host). On a fresh host without GHCR auth,
the pull step warns per runtime and the response surfaces the failures.
## Local dev (build the package without publishing)
```bash
python3 scripts/build_runtime_package.py --version 0.1.0-local --out /tmp/runtime-build
cd /tmp/runtime-build
python -m build # produces dist/*.whl + dist/*.tar.gz
pip install dist/*.whl # install into a venv to test locally
```
This is the same pipeline CI runs. Use it to validate import-rewrite
correctness before pushing a `runtime-v*` tag.
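
To exercise the installed wheel, the same smoke import the publish workflow
runs works locally too — a short sketch (the two env values are throwaway
placeholders for the import-time guards, same as in CI):

```python
# Run inside the venv where dist/*.whl was installed. Mirrors the smoke
# import in publish-runtime.yml.
import os

os.environ.setdefault("WORKSPACE_ID", "00000000-0000-0000-0000-000000000000")
os.environ.setdefault("PLATFORM_URL", "http://localhost:8080")

from molecule_runtime import a2a_client, a2a_tools
from molecule_runtime.adapters import AdapterConfig, BaseAdapter, get_adapter
from molecule_runtime.builtin_tools import memory

print("smoke import OK")
```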
## Writing a new adapter
@ -75,5 +192,18 @@ python -m twine upload dist/*
2. Copy the `adapter.py` pattern from any existing adapter repo (a minimal skeleton is sketched below)
3. Change imports: `from molecule_runtime.adapters.base import BaseAdapter, AdapterConfig`
4. Create `requirements.txt` with `molecule-ai-workspace-runtime>=0.1.0` + your deps
5. Create `Dockerfile` with `ENV ADAPTER_MODULE=adapter` and `ENTRYPOINT ["molecule-runtime"]`
5. Create `Dockerfile` with `ENV ADAPTER_MODULE=adapter` and
`ENTRYPOINT ["molecule-runtime"]`
6. Register the runtime name in the platform's known runtimes list
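
A minimal `adapter.py` skeleton consistent with steps 2-5 above. The import
path and the `Adapter` class name are the load-bearing parts (that's what
`get_adapter()` looks up); the hook methods to override live in
`molecule_runtime/adapters/base.py` and are deliberately not guessed at here:

```python
# adapter.py — hypothetical skeleton for a new template repo.
from molecule_runtime.adapters.base import AdapterConfig, BaseAdapter


class Adapter(BaseAdapter):
    """Glue between the shared runtime and <your runtime>."""

    # Override whichever BaseAdapter hooks your runtime needs. The real hook
    # signatures are defined in molecule_runtime/adapters/base.py; none are
    # invented in this sketch.
```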
## Migration note
Prior to this workflow, the runtime was duplicated across monorepo
`workspace/` AND a sibling repo `molecule-ai-workspace-runtime`, with no
sync mechanism. That caused 30+ files to drift between the two trees; the
most recent chat-leak / queued-classification fixes existed only in the
monorepo copy until they were manually ported.
If you have an old local checkout of `molecule-ai-workspace-runtime`, treat
it as outdated. The monorepo `workspace/` is now authoritative; the PyPI
artifact is rebuilt from it on every `runtime-v*` tag.

298
scripts/build_runtime_package.py Executable file
View File

@ -0,0 +1,298 @@
#!/usr/bin/env python3
"""Build the molecule-ai-workspace-runtime PyPI package from monorepo workspace/.
Monorepo workspace/ is the single source-of-truth for runtime code. The PyPI
package is a publish-time mirror produced by this script, NOT a parallel
editable copy. Anyone editing the runtime should edit workspace/, never the
sibling molecule-ai-workspace-runtime repo.
What this does
--------------
1. Copies workspace/ source into build/molecule_runtime/ (note the rename:
bare modules become a real Python package).
2. Rewrites top-level imports so e.g. `from a2a_client import X` becomes
`from molecule_runtime.a2a_client import X`. The rewrite is regex-based
on a closed allowlist of modules third-party imports like `from a2a.X`
(the a2a-sdk package) are left alone because the regex is anchored on
exact module names.
3. Writes a pyproject.toml with the requested version + the README + the
py.typed marker.
4. Leaves the build dir ready for `python -m build` to produce a wheel/sdist.
Usage
-----
scripts/build_runtime_package.py --version 0.1.6 --out /tmp/runtime-build
cd /tmp/runtime-build && python -m build
python -m twine upload dist/*
The publish workflow (.github/workflows/publish-runtime.yml) drives this
on every `runtime-v*` tag push.
"""
from __future__ import annotations
import argparse
import re
import shutil
import sys
from pathlib import Path
# Top-level Python modules in workspace/ that become molecule_runtime.X.
# Anything imported as `from <name> import` or `import <name>` (where <name>
# matches one of these) gets rewritten to use the package prefix.
#
# Closed list (not "every .py we copy") because a typo in workspace/ would
# otherwise leak into a wrong rewrite. Update this when adding a new
# top-level module to workspace/.
TOP_LEVEL_MODULES = {
"a2a_cli",
"a2a_client",
"a2a_executor",
"a2a_mcp_server",
"a2a_tools",
"adapter_base",
"agent",
"agents_md",
"claude_sdk_executor",
"cli_executor",
"config",
"consolidation",
"coordinator",
"events",
"executor_helpers",
"heartbeat",
"hermes_executor",
"initial_prompt",
"main",
"molecule_ai_status",
"platform_auth",
"plugins",
"preflight",
"prompt",
"shared_runtime",
}
# Subdirectory packages — these are already real packages (they have or will
# have __init__.py) so the rewrite is `from <pkg>` → `from molecule_runtime.<pkg>`.
SUBPACKAGES = {
"adapters",
"builtin_tools",
"plugins_registry",
"policies",
"skill_loader",
}
# Files in workspace/ NOT included in the published package. These are
# build artifacts, dev scripts, or monorepo-only scaffolding.
EXCLUDE_FILES = {
"Dockerfile",
"build-all.sh",
"rebuild-runtime-images.sh",
"entrypoint.sh",
"pytest.ini",
"requirements.txt",
# Note: adapter_base.py, agents_md.py, hermes_executor.py, shared_runtime.py
# are kept (referenced by adapters/__init__.py and other modules); they get
# their imports rewritten via TOP_LEVEL_MODULES. Excluding them broke the
# smoke-test install with `ModuleNotFoundError: adapter_base`.
}
EXCLUDE_DIRS = {
"__pycache__",
"tests",
"lib",
"molecule_audit",
"scripts",
}
def build_import_rewriter() -> re.Pattern:
"""Compile a single regex matching all import statements that need
rewriting. The match groups capture the keyword + module name so the
replacement preserves whitespace and trailing punctuation.
Modules included: TOP_LEVEL_MODULES | SUBPACKAGES.
The `rest` group only accepts whitespace, '.', ',' or end-of-line after the
module name, so an allowlisted name never matches as a prefix of a longer
identifier (the concern for any future short module name that happens to be
a prefix of a real package name); third-party imports like
`from a2a.server.X import Y` stay untouched because bare `a2a` isn't in
our set.
"""
names = sorted(TOP_LEVEL_MODULES | SUBPACKAGES)
alt = "|".join(re.escape(n) for n in names)
# Matches:
# from <name>(\.|\s|import)
# import <name>(\s|$|,)
# And captures the keyword + name so we can re-emit with prefix.
pattern = (
r"(?m)^(?P<indent>\s*)" # leading whitespace (preserved)
r"(?P<kw>from|import)\s+" # 'from' or 'import'
r"(?P<mod>" + alt + r")" # the module name
r"(?P<rest>[\s.,]|$)" # what follows: '.subpath', ' import …', ',', whitespace, EOL
)
return re.compile(pattern)
def rewrite_imports(text: str, regex: re.Pattern) -> str:
"""Replace bare imports with package-prefixed ones.
`import X` → `import molecule_runtime.X as X` (preserve binding)
`from X import Y` → `from molecule_runtime.X import Y`
`from X.sub import Y` → `from molecule_runtime.X.sub import Y`
"""
def repl(m: re.Match) -> str:
indent, kw, mod, rest = m.group("indent"), m.group("kw"), m.group("mod"), m.group("rest")
if kw == "from":
# `from X` or `from X.sub` — always safe to prefix.
return f"{indent}from molecule_runtime.{mod}{rest}"
# `import X` — preserve the binding name `X` (callers do `X.foo`)
# by aliasing. `import X.sub` is uncommon for our modules and would
# need a different binding form, but isn't used in workspace/ today.
if rest.startswith("."):
# `import X.sub` — rewrite as `import molecule_runtime.X.sub` and
# leave the trailing dot pattern intact for the rest of the line.
return f"{indent}import molecule_runtime.{mod}{rest}"
# Plain `import X` — alias preserves the local name.
return f"{indent}import molecule_runtime.{mod} as {mod}{rest}"
return regex.sub(repl, text)
def copy_tree_filtered(src: Path, dst: Path) -> list[Path]:
"""Copy src/ → dst/ skipping EXCLUDE_FILES + EXCLUDE_DIRS. Returns the
list of .py files copied so the caller can run the import rewrite over
them in one pass."""
py_files: list[Path] = []
if dst.exists():
shutil.rmtree(dst)
dst.mkdir(parents=True)
for entry in src.iterdir():
if entry.is_dir():
if entry.name in EXCLUDE_DIRS:
continue
sub_py = copy_tree_filtered(entry, dst / entry.name)
py_files.extend(sub_py)
else:
if entry.name in EXCLUDE_FILES:
continue
shutil.copy2(entry, dst / entry.name)
if entry.suffix == ".py":
py_files.append(dst / entry.name)
return py_files
PYPROJECT_TEMPLATE = """\
[build-system]
requires = ["setuptools>=68.0", "wheel"]
build-backend = "setuptools.build_meta"
[project]
name = "molecule-ai-workspace-runtime"
version = "{version}"
description = "Molecule AI workspace runtime — shared infrastructure for all agent adapters"
requires-python = ">=3.11"
license = {{text = "BSL-1.1"}}
readme = "README.md"
dependencies = [
"a2a-sdk[http-server]>=1.0.0,<2.0",
"httpx>=0.27.0",
"uvicorn>=0.30.0",
"starlette>=0.38.0",
"websockets>=12.0",
"pyyaml>=6.0",
"langchain-core>=0.3.0",
"opentelemetry-api>=1.24.0",
"opentelemetry-sdk>=1.24.0",
"opentelemetry-exporter-otlp-proto-http>=1.24.0",
"temporalio>=1.7.0",
]
[project.scripts]
molecule-runtime = "molecule_runtime.main:main_sync"
[tool.setuptools.packages.find]
where = ["."]
include = ["molecule_runtime*"]
[tool.setuptools.package-data]
"molecule_runtime" = ["py.typed"]
"""
README_TEMPLATE = """\
# molecule-ai-workspace-runtime
Shared workspace runtime for [Molecule AI](https://github.com/Molecule-AI/molecule-core)
agent adapters. Installed by every workspace template image
(`workspace-template-claude-code`, `-langgraph`, `-hermes`, etc.) to provide
A2A delegation, heartbeat, memory, plugin loading, and skill management.
This package is **published from the molecule-core monorepo `workspace/`
directory** by the `publish-runtime` GitHub Actions workflow on every
`runtime-v*` tag push. **Do not edit this package directly** — edit
`workspace/` in the monorepo.
See [`docs/workspace-runtime-package.md`](https://github.com/Molecule-AI/molecule-core/blob/main/docs/workspace-runtime-package.md)
for the publish flow and architecture.
"""
def main() -> int:
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("--version", required=True, help="Package version, e.g. 0.1.6")
parser.add_argument("--out", required=True, type=Path, help="Build output directory (will be wiped)")
parser.add_argument("--source", type=Path, default=Path(__file__).resolve().parent.parent / "workspace",
help="Path to monorepo workspace/ directory (default: ../workspace from this script)")
args = parser.parse_args()
src = args.source.resolve()
out = args.out.resolve()
if not src.is_dir():
print(f"error: source not a directory: {src}", file=sys.stderr)
return 2
pkg_dir = out / "molecule_runtime"
print(f"[build] source: {src}")
print(f"[build] output: {out}")
print(f"[build] package: {pkg_dir}")
if out.exists():
shutil.rmtree(out)
out.mkdir(parents=True)
py_files = copy_tree_filtered(src, pkg_dir)
print(f"[build] copied {len(py_files)} .py files")
# Ensure top-level package marker exists. workspace/ doesn't have one
# (it's not a package in monorepo), but the published artifact must.
init = pkg_dir / "__init__.py"
if not init.exists():
init.write_text('"""Molecule AI workspace runtime."""\n')
# Touch py.typed so type-checkers in adapter consumers see the package
# as typed. Empty file is the convention.
(pkg_dir / "py.typed").touch()
# Rewrite imports in every .py file we copied + the new __init__.py.
regex = build_import_rewriter()
rewrites = 0
for f in [*py_files, init]:
original = f.read_text()
rewritten = rewrite_imports(original, regex)
if rewritten != original:
f.write_text(rewritten)
rewrites += 1
print(f"[build] rewrote imports in {rewrites} files")
# Emit pyproject.toml + README at build root.
(out / "pyproject.toml").write_text(PYPROJECT_TEMPLATE.format(version=args.version))
(out / "README.md").write_text(README_TEMPLATE)
print(f"[build] done. To publish:")
print(f" cd {out}")
print(f" python -m build")
print(f" python -m twine upload dist/*")
return 0
if __name__ == "__main__":
sys.exit(main())

View File

@ -117,62 +117,69 @@ log " CF records: $TOTAL_CF"
# 5. Anything else → keep (we only sweep patterns we understand).
export PROD_SLUGS STAGING_SLUGS EC2_NAMES TOTAL_CF
# Edits inside the CANONICAL DECIDE block below must mirror
# scripts/ops/sweep_cf_decide.py — the parity test in
# test_sweep_cf_decide.py asserts they match byte-for-byte.
DECISIONS=$(echo "$CF_JSON" | python3 -c '
import json, os, re, sys
d = json.load(sys.stdin)
prod = set(os.environ["PROD_SLUGS"].split())
staging = set(os.environ["STAGING_SLUGS"].split())
all_slugs = prod | staging
prod_slugs = set(os.environ["PROD_SLUGS"].split())
staging_slugs = set(os.environ["STAGING_SLUGS"].split())
all_slugs = prod_slugs | staging_slugs
ec2_names = set(n for n in os.environ["EC2_NAMES"].split() if n)
def decide(r):
_PLATFORM_CORE_NAMES = {
"api.moleculesai.app", "app.moleculesai.app", "doc.moleculesai.app",
"send.moleculesai.app", "status.moleculesai.app", "www.moleculesai.app",
"staging-api.moleculesai.app",
}
_WS_RE = re.compile(r"^(ws-[a-f0-9]{8}-[a-f0-9]+)(?:\.staging)?\.moleculesai\.app$")
_E2E_RE = re.compile(r"^(e2e-[^.]+)(?:\.staging)?\.moleculesai\.app$")
_TENANT_RE = re.compile(r"^([a-z0-9][a-z0-9-]*)(?:\.staging)?\.moleculesai\.app$")
# CANONICAL DECIDE BEGIN
def decide(r, all_slugs, ec2_names):
n = r["name"]
rid = r["id"]
typ = r["type"]
# Rule 1: platform core — leave alone
if n == "moleculesai.app":
return ("keep", "apex", rid, n, typ)
if n.startswith("_") or n.endswith("._domainkey.moleculesai.app"):
return ("keep", "verification/key", rid, n, typ)
if n in {"api.moleculesai.app","app.moleculesai.app","doc.moleculesai.app",
"send.moleculesai.app","status.moleculesai.app","www.moleculesai.app",
"staging-api.moleculesai.app"}:
if n in _PLATFORM_CORE_NAMES:
return ("keep", "platform-core", rid, n, typ)
# Rule 3: ws-<hex8>-<rest>.(staging.)moleculesai.app
m = re.match(r"^(ws-[a-f0-9]{8}-[a-f0-9]+)(?:\.staging)?\.moleculesai\.app$", n)
m = _WS_RE.match(n)
if m:
prefix = m.group(1)
# Live EC2 names are like "ws-d3605ef2-f7d" — same shape as DNS subdomain.
# Live EC2 names share the ws-<hex8>-<rest> shape with the DNS subdomain.
for ename in ec2_names:
if ename.startswith(prefix):
return ("keep", "live-ec2", rid, n, typ)
return ("delete", "orphan-ws", rid, n, typ)
# Rule 4: e2e-* tenants (includes canary, canvas variants)
m = re.match(r"^(e2e-[^.]+)(?:\.staging)?\.moleculesai\.app$", n)
m = _E2E_RE.match(n)
if m:
slug = m.group(1)
if slug in all_slugs:
return ("keep", "live-e2e-tenant", rid, n, typ)
return ("delete", "orphan-e2e-tenant", rid, n, typ)
# Rule 2: any other tenant subdomain (slug.moleculesai.app or slug.staging.moleculesai.app)
m = re.match(r"^([a-z0-9][a-z0-9-]*)(?:\.staging)?\.moleculesai\.app$", n)
m = _TENANT_RE.match(n)
if m:
slug = m.group(1)
if slug in all_slugs:
return ("keep", "live-tenant", rid, n, typ)
# Only flag as orphan if name looks like a tenant (not a one-off like "hermes-final-*")
# To avoid false-positive nukes on ad-hoc records, we KEEP anything that
# does not match a known pattern. Orphan only for explicit tenant-shaped names.
# KEEP unknown tenant-shaped names — avoid false-positive nukes on
# ad-hoc records (e.g. hermes-final-*) that do not match a known slug.
return ("keep", "unknown-subdomain-kept-for-safety", rid, n, typ)
return ("keep", "not-a-pattern-we-sweep", rid, n, typ)
# CANONICAL DECIDE END
for r in d["result"]:
action, reason, rid, name, typ = decide(r)
action, reason, rid, name, typ = decide(r, all_slugs, ec2_names)
print(json.dumps({"action": action, "reason": reason, "id": rid, "name": name, "type": typ}))
')

View File

@ -0,0 +1,94 @@
"""Decision logic extracted from sweep-cf-orphans.sh for unit testing (#2027).
The bash script embeds the same logic inline as a Python heredoc — this
module is a verbatim copy used by test_sweep_cf_decide.py. The parity
test (TestParityWithBashScript) reads the bash script and asserts the
canonical block in this file is present byte-for-byte, so the two
cannot drift apart silently.
If you change the rules: edit BOTH this file AND the inline block in
``scripts/ops/sweep-cf-orphans.sh`` (the canonical block runs from
``# CANONICAL DECIDE BEGIN`` to ``# CANONICAL DECIDE END`` markers in
both files; the parity check compares those slices).
Inputs to ``decide(record, all_slugs, ec2_names)``:
record Cloudflare DNS record dict {name, id, type}
all_slugs set of CP org slugs (prod ∪ staging) — the caller computes the
union once instead of per-record (decide is hot-path: 100s
to 1000s of records per sweep)
ec2_names set of live EC2 Name tags (e.g. ``ws-d3605ef2-f7d``)
Returns ``(action, reason, id, name, type)`` matching the bash heredoc.
"""
from __future__ import annotations
import re
# Pre-compile per-record regexes once at module load — saves the per-call
# pattern-cache lookup across 1000s of CF records per sweep. Mirrored at
# the same scope in sweep-cf-orphans.sh's heredoc.
_PLATFORM_CORE_NAMES = {
"api.moleculesai.app", "app.moleculesai.app", "doc.moleculesai.app",
"send.moleculesai.app", "status.moleculesai.app", "www.moleculesai.app",
"staging-api.moleculesai.app",
}
_WS_RE = re.compile(r"^(ws-[a-f0-9]{8}-[a-f0-9]+)(?:\.staging)?\.moleculesai\.app$")
_E2E_RE = re.compile(r"^(e2e-[^.]+)(?:\.staging)?\.moleculesai\.app$")
_TENANT_RE = re.compile(r"^([a-z0-9][a-z0-9-]*)(?:\.staging)?\.moleculesai\.app$")
# CANONICAL DECIDE BEGIN
def decide(r, all_slugs, ec2_names):
n = r["name"]
rid = r["id"]
typ = r["type"]
if n == "moleculesai.app":
return ("keep", "apex", rid, n, typ)
if n.startswith("_") or n.endswith("._domainkey.moleculesai.app"):
return ("keep", "verification/key", rid, n, typ)
if n in _PLATFORM_CORE_NAMES:
return ("keep", "platform-core", rid, n, typ)
m = _WS_RE.match(n)
if m:
prefix = m.group(1)
# Live EC2 names share the ws-<hex8>-<rest> shape with the DNS subdomain.
for ename in ec2_names:
if ename.startswith(prefix):
return ("keep", "live-ec2", rid, n, typ)
return ("delete", "orphan-ws", rid, n, typ)
m = _E2E_RE.match(n)
if m:
slug = m.group(1)
if slug in all_slugs:
return ("keep", "live-e2e-tenant", rid, n, typ)
return ("delete", "orphan-e2e-tenant", rid, n, typ)
m = _TENANT_RE.match(n)
if m:
slug = m.group(1)
if slug in all_slugs:
return ("keep", "live-tenant", rid, n, typ)
# KEEP unknown tenant-shaped names — avoid false-positive nukes on
# ad-hoc records (e.g. hermes-final-*) that do not match a known slug.
return ("keep", "unknown-subdomain-kept-for-safety", rid, n, typ)
return ("keep", "not-a-pattern-we-sweep", rid, n, typ)
# CANONICAL DECIDE END
def safety_gate(total: int, delete_count: int, max_delete_pct: int = 50) -> bool:
"""Return True iff the sweep is safe to execute.
Mirrors the shell-side gate: if the deletion fraction exceeds
``max_delete_pct`` the sweep refuses to run. Same integer arithmetic
as the bash script (``DELETE_COUNT*100/TOTAL``) so a future threshold
tweak only needs to land in one semantic place.
"""
if total <= 0:
return True
pct = delete_count * 100 // total
return pct <= max_delete_pct

View File

@ -0,0 +1,225 @@
"""Tests for the sweep-cf-orphans.sh decision function (#2027).
Run locally: ``python3 -m unittest scripts/ops/test_sweep_cf_decide.py -v``
Why this exists: the inline Python heredoc in sweep-cf-orphans.sh decides
which Cloudflare DNS records to delete. A misclassification could nuke a
live workspace's DNS record. These tests cover the rule priority order +
the safety gate, plus a parity check that asserts the inline block in the
shell script matches the importable module byte-for-byte (so the two
cannot drift silently).
"""
from __future__ import annotations
import os
import unittest
import sweep_cf_decide as M
# Caller responsibility (per the new decide signature): compute the union once.
ALL_SLUGS = {"acme", "globex", "initech", "e2e-test-runner", "soak", "playground"}
LIVE_EC2 = {"ws-d3605ef2-f7d", "ws-aaaaaaaa-bbb", "ws-cafef00d-dec"}
def rec(name: str, rid: str = "rid-x", typ: str = "A") -> dict:
return {"name": name, "id": rid, "type": typ}
def call(record: dict) -> tuple:
return M.decide(record, ALL_SLUGS, LIVE_EC2)
class TestPlatformCore(unittest.TestCase):
"""Apex, www, api, app, _verification keys must NEVER be touched."""
def test_apex_kept(self):
action, reason, *_ = call(rec("moleculesai.app"))
self.assertEqual((action, reason), ("keep", "apex"))
def test_underscore_records_kept(self):
for n in ("_vercel.moleculesai.app", "_railway-verify.moleculesai.app"):
with self.subTest(name=n):
action, reason, *_ = call(rec(n))
self.assertEqual((action, reason), ("keep", "verification/key"))
def test_dkim_kept(self):
action, reason, *_ = call(rec("send._domainkey.moleculesai.app"))
self.assertEqual((action, reason), ("keep", "verification/key"))
def test_platform_subdomains_kept(self):
for n in (
"api.moleculesai.app",
"app.moleculesai.app",
"doc.moleculesai.app",
"send.moleculesai.app",
"status.moleculesai.app",
"www.moleculesai.app",
"staging-api.moleculesai.app",
):
with self.subTest(name=n):
action, reason, *_ = call(rec(n))
self.assertEqual((action, reason), ("keep", "platform-core"))
class TestWsRule(unittest.TestCase):
"""ws-* DNS records keep iff a live EC2 with the same prefix exists."""
def test_live_ws_kept(self):
action, reason, *_ = call(rec("ws-d3605ef2-f7d.moleculesai.app"))
self.assertEqual((action, reason), ("keep", "live-ec2"))
def test_live_ws_kept_on_staging(self):
action, reason, *_ = call(rec("ws-aaaaaaaa-bbb.staging.moleculesai.app"))
self.assertEqual((action, reason), ("keep", "live-ec2"))
def test_dead_ws_deleted(self):
action, reason, *_ = call(rec("ws-deadbeef-fff.moleculesai.app"))
self.assertEqual((action, reason), ("delete", "orphan-ws"))
def test_dead_ws_on_staging_deleted(self):
action, reason, *_ = call(rec("ws-deadbeef-fff.staging.moleculesai.app"))
self.assertEqual((action, reason), ("delete", "orphan-ws"))
class TestE2ERule(unittest.TestCase):
def test_live_e2e_kept(self):
action, reason, *_ = call(rec("e2e-test-runner.staging.moleculesai.app"))
self.assertEqual((action, reason), ("keep", "live-e2e-tenant"))
def test_dead_e2e_deleted(self):
action, reason, *_ = call(rec("e2e-ghost-1234.staging.moleculesai.app"))
self.assertEqual((action, reason), ("delete", "orphan-e2e-tenant"))
def test_dead_e2e_on_prod_deleted(self):
action, reason, *_ = call(rec("e2e-ghost.moleculesai.app"))
self.assertEqual((action, reason), ("delete", "orphan-e2e-tenant"))
class TestTenantSubdomainRule(unittest.TestCase):
def test_live_prod_tenant_kept(self):
action, reason, *_ = call(rec("acme.moleculesai.app"))
self.assertEqual((action, reason), ("keep", "live-tenant"))
def test_live_staging_tenant_kept(self):
action, reason, *_ = call(rec("soak.staging.moleculesai.app"))
self.assertEqual((action, reason), ("keep", "live-tenant"))
def test_unknown_subdomain_kept_for_safety(self):
action, reason, *_ = call(rec("hermes-final-2.moleculesai.app"))
self.assertEqual((action, reason), ("keep", "unknown-subdomain-kept-for-safety"))
class TestNotASweepPattern(unittest.TestCase):
def test_external_domain_kept(self):
# Domain-spoofing attempt — must NOT match any of the moleculesai.app rules.
action, reason, *_ = call(rec("api.openai.com.evil.internal"))
self.assertEqual((action, reason), ("keep", "not-a-pattern-we-sweep"))
def test_unrelated_apex_kept(self):
action, reason, *_ = call(rec("example.com"))
self.assertEqual((action, reason), ("keep", "not-a-pattern-we-sweep"))
class TestRulePriority(unittest.TestCase):
"""Platform-core check must precede the tenant-subdomain regex —
e.g. ``api.moleculesai.app`` matches the tenant pattern but must
classify as platform-core."""
def test_api_subdomain_classified_as_platform_not_tenant(self):
action, reason, *_ = call(rec("api.moleculesai.app"))
self.assertEqual(reason, "platform-core")
def test_underscore_record_classified_before_tenant(self):
action, reason, *_ = call(rec("_vercel.moleculesai.app"))
self.assertEqual(reason, "verification/key")
class TestSafetyGate(unittest.TestCase):
"""The bash gate refuses to delete >MAX_DELETE_PCT (default 50%)."""
def test_under_threshold_passes(self):
self.assertTrue(M.safety_gate(total=100, delete_count=49))
self.assertTrue(M.safety_gate(total=100, delete_count=50))
def test_over_threshold_fails(self):
self.assertFalse(M.safety_gate(total=100, delete_count=51))
self.assertFalse(M.safety_gate(total=10, delete_count=10))
def test_zero_total_passes_trivially(self):
# No records → nothing to delete → gate trivially OK (no div-by-zero).
self.assertTrue(M.safety_gate(total=0, delete_count=0))
def test_custom_threshold(self):
self.assertTrue(M.safety_gate(total=100, delete_count=70, max_delete_pct=75))
self.assertFalse(M.safety_gate(total=100, delete_count=76, max_delete_pct=75))
class TestEmptyLiveSets(unittest.TestCase):
"""If the CP admin API returns no orgs (auth broken, network blip),
every tenant-shaped record looks orphan. decide() alone has no
defense that's safety_gate's job. This test pins the contract so
a future "make decide() defensive" change doesn't silently bypass
the gate."""
def test_dead_e2e_orphans_when_live_set_empty(self):
action, reason, *_ = M.decide(
rec("e2e-test-runner.staging.moleculesai.app"),
set(), set(),
)
self.assertEqual((action, reason), ("delete", "orphan-e2e-tenant"))
def test_live_ws_orphans_when_ec2_set_empty(self):
action, reason, *_ = M.decide(
rec("ws-cafef00d-dec.moleculesai.app"),
ALL_SLUGS, set(),
)
self.assertEqual((action, reason), ("delete", "orphan-ws"))
class TestParityWithBashScript(unittest.TestCase):
"""The decision logic exists in two places: the canonical block in
sweep_cf_decide.py and the inline heredoc in sweep-cf-orphans.sh.
This test asserts the two match between the
``# CANONICAL DECIDE BEGIN`` / ``# CANONICAL DECIDE END`` markers,
so an edit to one without the other fails CI loudly. The mirror-
reminder comment lives OUTSIDE the markers in the .sh file so we
don't need to special-case it here."""
@staticmethod
def _slice_canonical(text: str) -> list[str]:
"""Return the lines between the canonical markers, exclusive.
Markers are matched line-anchored (a stripped-line literal match)
so the docstring's prose mention is ignored."""
lines = text.splitlines()
begin_idx = end_idx = None
for i, line in enumerate(lines):
stripped = line.strip()
if begin_idx is None and stripped == "# CANONICAL DECIDE BEGIN":
begin_idx = i
elif begin_idx is not None and stripped == "# CANONICAL DECIDE END":
end_idx = i
break
if begin_idx is None or end_idx is None:
raise AssertionError(
"missing CANONICAL DECIDE BEGIN/END markers — "
"first 30 lines were:\n" + "\n".join(lines[:30])
)
return lines[begin_idx + 1:end_idx]
def test_blocks_match(self):
here = os.path.dirname(__file__)
with open(os.path.join(here, "sweep_cf_decide.py"), "r", encoding="utf-8") as f:
py_block = self._slice_canonical(f.read())
with open(os.path.join(here, "sweep-cf-orphans.sh"), "r", encoding="utf-8") as f:
sh_block = self._slice_canonical(f.read())
self.assertEqual(
py_block,
sh_block,
"CANONICAL DECIDE block has drifted between sweep_cf_decide.py "
"and sweep-cf-orphans.sh — re-sync them.",
)
if __name__ == "__main__":
unittest.main()

View File

@ -0,0 +1,95 @@
#!/usr/bin/env bash
# refresh-workspace-images.sh — pull the latest workspace template images
# from GHCR and recreate any running ws-* containers against the new digest.
#
# This is the local-dev / single-host equivalent of step 5 of the runtime CD
# chain (see docs/workspace-runtime-package.md). On a SaaS deployment the
# host's deploy pipeline does the pull on every release; this script is
# what to run on a local docker-compose host after a runtime release lands.
#
# Usage:
# bash scripts/refresh-workspace-images.sh # pull all 8 + recreate running ws-*
# bash scripts/refresh-workspace-images.sh --runtime claude-code # pull just one template
# bash scripts/refresh-workspace-images.sh --no-recreate # pull only, leave containers
#
# Behavior:
# - Always pulls fresh; docker is a no-op if local matches remote, so
# repeated runs are cheap.
# - Recreate is "kill + remove + let the next canvas interaction re-
# provision" — simpler than `docker stop / docker run` because the
# platform owns the run flags. Workspaces re-register on next probe.
# - If a container is mid-conversation, the kill cancels in-flight work.
# Run during a quiet window OR add --no-recreate and recreate manually
# via canvas Restart buttons (example flow below).
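# Example quiet-window flow (illustrative; ws-<id> is a placeholder name):
#   bash scripts/refresh-workspace-images.sh --no-recreate   # pull only
#   # ...wait for the affected workspaces to go idle, then per workspace:
#   docker rm -f ws-<id>        # canvas re-provisions it on next interaction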
set -euo pipefail
GREEN='\033[0;32m'
YELLOW='\033[0;33m'
RED='\033[0;31m'
NC='\033[0m'
log() { echo -e "${GREEN}[refresh]${NC} $1" >&2; }
warn() { echo -e "${YELLOW}[refresh]${NC} $1" >&2; }
err() { echo -e "${RED}[refresh]${NC} $1" >&2; }
ALL_RUNTIMES=(claude-code langgraph crewai autogen deepagents hermes gemini-cli openclaw)
RUNTIMES=("${ALL_RUNTIMES[@]}")
RECREATE=true
while [ $# -gt 0 ]; do
case "$1" in
--runtime) RUNTIMES=("$2"); shift 2;;
--no-recreate) RECREATE=false; shift;;
-h|--help) sed -n '2,30p' "$0"; exit 0;;
*) err "unknown arg: $1"; exit 2;;
esac
done
# 1. Pull fresh tags. Soft-fail per runtime — one missing image (e.g., a
# template that hasn't been published yet) shouldn't abort the others.
log "pulling latest images for: ${RUNTIMES[*]}"
PULLED=()
FAILED=()
for rt in "${RUNTIMES[@]}"; do
IMG="ghcr.io/molecule-ai/workspace-template-$rt:latest"
if docker pull "$IMG" >/dev/null 2>&1; then
log "$rt"
PULLED+=("$rt")
else
warn "$rt (pull failed — image may not exist or auth missing)"
FAILED+=("$rt")
fi
done
if [ "$RECREATE" = "false" ]; then
log "skip-recreate set — leaving containers untouched"
log "done. pulled=${#PULLED[@]} failed=${#FAILED[@]}"
exit 0
fi
# 2. Find ws-* containers whose image is one of the runtimes we pulled.
# `docker inspect` exposes the image tag/digest each was created from.
log "scanning ws-* containers for stale images..."
TO_RECREATE=()
for cn in $(docker ps -a --filter "name=ws-" --format "{{.Names}}"); do
IMG=$(docker inspect "$cn" --format '{{.Config.Image}}' 2>/dev/null || echo "")
for rt in "${PULLED[@]}"; do
if [[ "$IMG" == *"workspace-template-$rt"* ]]; then
TO_RECREATE+=("$cn")
break
fi
done
done
if [ "${#TO_RECREATE[@]}" -eq 0 ]; then
log "no running ws-* containers using a refreshed image — nothing to recreate"
exit 0
fi
# 3. Kill + remove. Canvas next-interaction will re-provision.
log "recreating ${#TO_RECREATE[@]} containers (canvas will re-provision on next interaction)"
for cn in "${TO_RECREATE[@]}"; do
docker rm -f "$cn" >/dev/null 2>&1 && log " removed $cn" || warn " failed to remove $cn"
done
log "done. open the canvas and the workspaces will re-provision against the new image."

View File

@ -195,14 +195,21 @@ TENANT_TOKEN=$(echo "$TENANT_TOKEN_RESP" | python3 -c "import json,sys; print(js
ok "Tenant admin token retrieved (len=${#TENANT_TOKEN})"
# ─── 4. Wait for tenant TLS / DNS propagation ──────────────────────────
# 10 min — same envelope as canvas/e2e/staging-setup.ts TLS_TIMEOUT_MS.
# CF DNS propagation + tunnel hostname registration + ACME cert + edge
# cache routinely take 5-7 min under staging load; the original 3-min
# cap blocked multiple staging→main PRs across 2026-04-24+. Stays
# inside the parent provision envelope so a genuinely-stuck tenant
# still fails loud at the earlier provision step rather than masquerading
# as a TLS issue.
log "4/11 Waiting for tenant TLS / DNS propagation..."
TLS_DEADLINE=$(( $(date +%s) + 180 ))
TLS_DEADLINE=$(( $(date +%s) + 600 ))
while true; do
if curl -sSfk --max-time 5 "$TENANT_URL/health" >/dev/null 2>&1; then
break
fi
if [ "$(date +%s)" -gt "$TLS_DEADLINE" ]; then
fail "Tenant URL never responded 2xx on /health within 3 min"
fail "Tenant URL never responded 2xx on /health within 10 min"
fi
sleep 5
done
@ -403,6 +410,13 @@ fi
if echo "$AGENT_TEXT" | grep -qF "Unknown provider"; then
fail "A2A — REGRESSION: install.sh set PROVIDER to a value not in hermes's registry. Run 'hermes doctor' on the workspace to see valid values. Raw: $AGENT_TEXT"
fi
# "Invalid API key" — the comment block lists this as a CP #238 race
# (tenant auth chain) signal but the grep was missing. Caller-side
# 401's containing this exact phrase don't match the generic
# "error|exception" catch-all below, so they'd slip through.
if echo "$AGENT_TEXT" | grep -qF "Invalid API key"; then
fail "A2A — REGRESSION: tenant auth chain returned 'Invalid API key'. Likely CP boot-event 401 race (CP #238) or stale OPENAI_API_KEY in the runtime env. Raw: $AGENT_TEXT"
fi
# Generic catch-all — falls through if none of the known regressions hit.
if echo "$AGENT_TEXT" | grep -qiE "error|exception"; then
fail "A2A returned an error-shaped response: $AGENT_TEXT"

View File

@ -15,6 +15,29 @@ import (
const broadcastChannel = "events:broadcast"
// EventEmitter is the contract handler code needs from a broadcaster.
// Defining it here lets tests substitute a capture-only stub instead
// of standing up the full Redis + WebSocket hub topology that the
// concrete *Broadcaster builds (and that previously blocked
// TestProvisionWorkspace_* regression tests on issue #1814).
//
// Includes BroadcastOnly because the activity-log + A2A-response paths
// inside the handler package fan out via that method — narrowing
// further would force production callers back to the concrete type.
//
// *Broadcaster satisfies this interface trivially. Production code that
// needs the wider surface (SubscribeSSE, Subscribe) keeps using the
// concrete *Broadcaster type — sse.go + cmd/server/main.go are the
// only such call sites today.
type EventEmitter interface {
RecordAndBroadcast(ctx context.Context, eventType string, workspaceID string, payload interface{}) error
BroadcastOnly(workspaceID string, eventType string, payload interface{})
}
// Compile-time assertion: a renamed/reshaped Broadcaster method that
// silently broke this interface would fail to build here.
var _ EventEmitter = (*Broadcaster)(nil)
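// A minimal capture-only stub satisfies EventEmitter in tests; sketch only
// (the handler package's tests define their own helper, and these names are
// illustrative, not part of this package):
//
//	type captureEmitter struct{ events []string }
//
//	func (c *captureEmitter) RecordAndBroadcast(ctx context.Context, eventType, workspaceID string, payload interface{}) error {
//		c.events = append(c.events, eventType) // record instead of touching Redis / the WS hub
//		return nil
//	}
//
//	func (c *captureEmitter) BroadcastOnly(workspaceID, eventType string, payload interface{}) {
//		c.events = append(c.events, eventType)
//	}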
// sseSubscription is a single in-process SSE subscriber.
// deliverToSSE writes to ch; StreamEvents reads from it.
type sseSubscription struct {

View File

@ -483,6 +483,16 @@ func normalizeA2APayload(body []byte) ([]byte, string, *proxyA2AError) {
// canvas (callerID == "") = 5 min, agent-to-agent = 30 min. Callers can
// override via the X-Timeout header (applied to ctx upstream in ProxyA2A).
func (h *WorkspaceHandler) dispatchA2A(ctx context.Context, agentURL string, body []byte, callerID string) (*http.Response, context.CancelFunc, error) {
// #1483 SSRF defense-in-depth: the primary call path through
// proxyA2ARequest → resolveAgentURL already validates via isSafeURL
// (a2a_proxy.go:424), but adding the check here closes the gap for
// any future code path that calls dispatchA2A directly without
// going through resolveAgentURL. Wrapping as proxyDispatchBuildError
// keeps the caller's error-classification path unchanged — the same
// shape it already produces a 500 for.
if err := isSafeURL(agentURL); err != nil {
return nil, nil, &proxyDispatchBuildError{err: err}
}
forwardCtx := context.WithoutCancel(ctx)
var cancel context.CancelFunc
if _, hasDeadline := ctx.Deadline(); !hasDeadline {

View File

@ -1155,6 +1155,39 @@ func TestDispatchA2A_ContextDeadline_NoCancelAdded(t *testing.T) {
}
}
// TestDispatchA2A_RejectsUnsafeURL is the #1483 defense-in-depth
// regression. setupTestDB disables SSRF for normal tests so existing
// dispatchA2A unit tests can hit httptest.NewServer (loopback) — we
// re-enable it here to verify the new in-function isSafeURL guard.
// Production callers go through resolveAgentURL which already
// validates; this test pins that dispatchA2A is now safe even when
// called directly by a future caller that skips resolveAgentURL.
func TestDispatchA2A_RejectsUnsafeURL(t *testing.T) {
setupTestDB(t)
setupTestRedis(t)
restoreSSRF := setSSRFCheckForTest(true)
t.Cleanup(restoreSSRF)
handler := NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir())
// Cloud metadata IP — must be rejected before any HTTP call goes out.
_, cancel, err := handler.dispatchA2A(
context.Background(),
"http://169.254.169.254/latest/meta-data/",
[]byte(`{}`),
"",
)
if cancel != nil {
cancel()
t.Error("cancel must be nil when the URL is rejected pre-request")
}
if err == nil {
t.Fatal("expected SSRF rejection error, got nil")
}
if _, ok := err.(*proxyDispatchBuildError); !ok {
t.Errorf("expected *proxyDispatchBuildError (caller maps to 500), got %T: %v", err, err)
}
}
// --- handleA2ADispatchError ---
func TestHandleA2ADispatchError_ContextDeadline(t *testing.T) {

View File

@ -288,7 +288,7 @@ func (h *WorkspaceHandler) DrainQueueForWorkspace(ctx context.Context, workspace
}
// logActivity=false: the original EnqueueA2A callsite already logged
// the dispatch attempt; re-logging here would double-count events.
status, _, proxyErr := h.proxyA2ARequest(ctx, workspaceID, item.Body, callerID, false)
status, respBody, proxyErr := h.proxyA2ARequest(ctx, workspaceID, item.Body, callerID, false)
// 202 Accepted = the dispatch was itself queued again (target still busy).
// That's not a failure — the queued item just stays queued naturally on
@ -321,4 +321,89 @@ func (h *WorkspaceHandler) DrainQueueForWorkspace(ctx context.Context, workspace
MarkQueueItemCompleted(ctx, item.ID)
log.Printf("A2AQueue drain: dispatched %s to workspace %s (attempt=%d)",
item.ID, workspaceID, item.Attempts)
// Stitch the response back to the originating delegation row, if this
// queue item was a delegation. Without this, check_task_status would
// see status='queued' (set by the executeDelegation queued-branch) and
// the LLM would think the work was never done. We embed delegation_id
// in params.message.metadata at Delegate-handler time; pull it out
// here and UPDATE the delegate_result row so the original caller can
// observe the real reply.
if delegationID := extractDelegationIDFromBody(item.Body); delegationID != "" {
h.stitchDrainResponseToDelegation(ctx, callerID, item.WorkspaceID, delegationID, respBody)
}
}
// extractDelegationIDFromBody pulls params.message.metadata.delegation_id
// out of an A2A JSON-RPC body. Empty string when absent — drain treats
// that as "this queue item didn't originate from /workspaces/:id/delegate"
// and skips the stitch, so non-delegation queue uses (cross-workspace
// peer-direct A2A) aren't affected.
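// Illustrative body shape (mirrors the payload the Delegate handler builds):
// {"method":"message/send","params":{"message":{"metadata":{"delegation_id":"abc-123"},...}}}
// yields "abc-123"; a body without params.message.metadata.delegation_id yields "".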
func extractDelegationIDFromBody(body []byte) string {
var envelope struct {
Params struct {
Message struct {
Metadata struct {
DelegationID string `json:"delegation_id"`
} `json:"metadata"`
} `json:"message"`
} `json:"params"`
}
if err := json.Unmarshal(body, &envelope); err != nil {
return ""
}
return envelope.Params.Message.Metadata.DelegationID
}
// stitchDrainResponseToDelegation writes the drained response into the
// delegation's existing delegate_result row (created with status='queued'
// by executeDelegation when the proxy first returned queued). This is the
// other half of the loop that closes "queued → completed" so the LLM's
// check_task_status reflects ground truth.
//
// Errors are logged-only — drain is fire-and-forget from Heartbeat, and a
// stitch failure shouldn't block other queued items. The delegation will
// just remain stuck at 'queued' in this case, which is the pre-fix
// behaviour (no regression vs. shipping nothing).
func (h *WorkspaceHandler) stitchDrainResponseToDelegation(ctx context.Context, sourceID, targetID, delegationID string, respBody []byte) {
if sourceID == "" || delegationID == "" {
return
}
responseText := extractResponseText(respBody)
respJSON, _ := json.Marshal(map[string]interface{}{
"text": responseText,
"delegation_id": delegationID,
})
res, err := db.DB.ExecContext(ctx, `
UPDATE activity_logs
SET status = 'completed',
summary = $1,
response_body = $2::jsonb
WHERE workspace_id = $3
AND method = 'delegate_result'
AND target_id = $4
AND response_body->>'delegation_id' = $5
`, "Delegation completed ("+truncate(responseText, 80)+")", string(respJSON),
sourceID, targetID, delegationID)
if err != nil {
log.Printf("A2AQueue drain stitch: update failed for delegation %s: %v", delegationID, err)
return
}
if rows, _ := res.RowsAffected(); rows == 0 {
log.Printf("A2AQueue drain stitch: no delegate_result row for delegation %s (queued-row may not exist yet)", delegationID)
return
}
log.Printf("A2AQueue drain stitch: delegation %s queued → completed (%d chars)", delegationID, len(responseText))
// Broadcast DELEGATION_COMPLETE so the canvas chat feed flips the
// "⏸ queued" line to "✓ completed" in real time. Without this the
// transition only surfaces after the user reloads or polls activity.
if h.broadcaster != nil {
h.broadcaster.RecordAndBroadcast(ctx, "DELEGATION_COMPLETE", sourceID, map[string]interface{}{
"delegation_id": delegationID,
"target_id": targetID,
"response_preview": truncate(responseText, 200),
"via": "queue_drain",
})
}
}

View File

@ -80,6 +80,39 @@ func TestExtractIdempotencyKey_emptyOnMissing(t *testing.T) {
}
}
func TestExtractDelegationIDFromBody(t *testing.T) {
cases := []struct {
name string
body string
want string
}{
{
name: "delegation body — metadata.delegation_id present",
body: `{"method":"message/send","params":{"message":{"role":"user","messageId":"abc-123","metadata":{"delegation_id":"abc-123"},"parts":[{"type":"text","text":"hi"}]}}}`,
want: "abc-123",
},
{
name: "non-delegation body — no metadata (peer-direct A2A)",
body: `{"method":"message/send","params":{"message":{"role":"user","messageId":"m-1","parts":[{"type":"text","text":"hi"}]}}}`,
want: "",
},
{
name: "metadata present but no delegation_id key",
body: `{"params":{"message":{"metadata":{"trace_id":"t-1"}}}}`,
want: "",
},
{"malformed JSON", `not json`, ""},
{"empty body", ``, ""},
}
for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
if got := extractDelegationIDFromBody([]byte(tc.body)); got != tc.want {
t.Errorf("extractDelegationIDFromBody = %q, want %q", got, tc.want)
}
})
}
}
// ──────────────────────────────────────────────────────────────────────────────
// DrainQueueForWorkspace — nil-safe error extraction regression tests
//

View File

@ -286,6 +286,37 @@ func (h *ActivityHandler) Notify(c *gin.Context) {
"name": wsName,
})
// Persist to activity_logs so the chat history loader restores this
// message after a page reload. Pre-fix, send_message_to_user pushes
// were broadcast-only — survived the WebSocket session but vanished
// when the user refreshed because nothing wrote them to the DB.
//
// Shape chosen to match the existing loader query
// (`type=a2a_receive&source=canvas`):
// - activity_type='a2a_receive' so it joins the same query path
// - source_id=NULL so the canvas-source filter accepts it
// - method='notify' to distinguish from real A2A receives in audits
// - request_body=NULL so the loader doesn't append a duplicate
// "user message" bubble for it
// - response_body={"result": "<text>"} matches extractResponseText's
// simplest branch ({result: string} → take verbatim)
//
// Errors are logged-only — broadcast already succeeded, the user
// sees the message; persistence failure just means the message
// won't survive reload (pre-fix behavior). Don't fail the whole
// notify on a DB hiccup.
respJSON, _ := json.Marshal(map[string]interface{}{"result": body.Message})
preview := body.Message
if len(preview) > 80 {
preview = preview[:80] + "…"
}
if _, err := db.DB.ExecContext(c.Request.Context(), `
INSERT INTO activity_logs (workspace_id, activity_type, method, summary, response_body, status)
VALUES ($1, 'a2a_receive', 'notify', $2, $3::jsonb, 'ok')
`, workspaceID, "Agent message: "+preview, string(respJSON)); err != nil {
log.Printf("Notify: failed to persist message for %s: %v", workspaceID, err)
}
c.JSON(http.StatusOK, gin.H{"status": "sent"})
}
@ -373,7 +404,9 @@ func (h *ActivityHandler) Report(c *gin.Context) {
}
// LogActivity inserts an activity log and optionally broadcasts via WebSocket.
func LogActivity(ctx context.Context, broadcaster *events.Broadcaster, params ActivityParams) {
// Takes events.EventEmitter (#1814) so callers passing a stub broadcaster
// in tests no longer need to construct the full *events.Broadcaster.
func LogActivity(ctx context.Context, broadcaster events.EventEmitter, params ActivityParams) {
reqJSON, reqErr := json.Marshal(params.RequestBody)
if reqErr != nil {
log.Printf("LogActivity: failed to marshal request_body for %s: %v", params.WorkspaceID, reqErr)

View File

@ -217,6 +217,86 @@ func TestActivityReport_RejectsUnknownType(t *testing.T) {
}
}
func TestNotify_PersistsToActivityLogsForReloadRecovery(t *testing.T) {
// Regression guard for the "responses gone on reload" bug. send_message_to_user
// pushes (which route through Notify) used to be broadcast-only — they
// rendered in the canvas but vanished on page reload because nothing
// wrote them to activity_logs. The chat history loader queries
// `type=a2a_receive&source=canvas`, so the persisted row must:
// - Use activity_type='a2a_receive' (loader's filter)
// - Have source_id NULL (canvas-source filter)
// - Carry the message text in response_body so extractResponseText
// can reconstruct the agent reply on reload
mockDB, mock, _ := sqlmock.New()
defer mockDB.Close()
db.DB = mockDB
// Workspace existence check
mock.ExpectQuery(`SELECT name FROM workspaces`).
WithArgs("ws-notify").
WillReturnRows(sqlmock.NewRows([]string{"name"}).AddRow("DD"))
// Persistence INSERT — verify shape
mock.ExpectExec(`INSERT INTO activity_logs`).
WithArgs(
"ws-notify",
sqlmock.AnyArg(), // summary
sqlmock.AnyArg(), // response_body JSON
).
WillReturnResult(sqlmock.NewResult(1, 1))
broadcaster := newTestBroadcaster()
handler := NewActivityHandler(broadcaster)
gin.SetMode(gin.TestMode)
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
c.Params = gin.Params{{Key: "id", Value: "ws-notify"}}
body := `{"message":"agent reply that arrived after the sync POST timed out"}`
c.Request = httptest.NewRequest("POST", "/workspaces/ws-notify/notify", strings.NewReader(body))
c.Request.Header.Set("Content-Type", "application/json")
handler.Notify(c)
if w.Code != http.StatusOK {
t.Errorf("expected 200, got %d: %s", w.Code, w.Body.String())
}
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("DB expectations not met: %v", err)
}
}
func TestNotify_DBFailure_StillBroadcastsAnd200(t *testing.T) {
// Persistence is best-effort — a DB hiccup must NOT block the
// WebSocket push (which the user is already seeing in their open
// canvas). Pre-fix the WS push always succeeded; we don't want
// the new persistence step to regress that path.
mockDB, mock, _ := sqlmock.New()
defer mockDB.Close()
db.DB = mockDB
mock.ExpectQuery(`SELECT name FROM workspaces`).
WithArgs("ws-x").
WillReturnRows(sqlmock.NewRows([]string{"name"}).AddRow("DD"))
mock.ExpectExec(`INSERT INTO activity_logs`).
WillReturnError(fmt.Errorf("simulated db hiccup"))
broadcaster := newTestBroadcaster()
handler := NewActivityHandler(broadcaster)
gin.SetMode(gin.TestMode)
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
c.Params = gin.Params{{Key: "id", Value: "ws-x"}}
body := `{"message":"hi"}`
c.Request = httptest.NewRequest("POST", "/workspaces/ws-x/notify", strings.NewReader(body))
c.Request.Header.Set("Content-Type", "application/json")
handler.Notify(c)
if w.Code != http.StatusOK {
t.Errorf("DB failure must not break the response; got %d", w.Code)
}
}
// ==================== Direct unit tests for SessionSearch helpers ====================
// --- parseSessionSearchParams ---

View File

@ -0,0 +1,227 @@
package handlers
import (
"context"
"encoding/base64"
"encoding/json"
"fmt"
"io"
"log"
"net/http"
"os"
"strings"
"time"
"github.com/docker/docker/api/types/container"
"github.com/docker/docker/api/types/filters"
dockerimage "github.com/docker/docker/api/types/image"
dockerclient "github.com/docker/docker/client"
"github.com/gin-gonic/gin"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/provisioner"
)
// AdminWorkspaceImagesHandler serves POST /admin/workspace-images/refresh — the
// production-side end of the runtime CD chain. Operators (or post-publish
// automation) hit this to (1) pull the latest workspace template images from
// GHCR via the Docker SDK and (2) recreate any running ws-* containers so
// they adopt the new image. Without this, a freshly-published runtime sits
// in the registry but containers keep running the old image until the next
// manual restart.
//
// On a SaaS deployment the deploy pipeline already pulls on every release,
// so the pull step is a no-op there; the recreate step is still the way to
// make running workspaces adopt the new image without a full host restart.
//
// POST /admin/workspace-images/refresh
//
// ?runtime=claude-code (optional; default = all 8 templates)
// &recreate=true|false (default true; false = pull only)
//
// Returns JSON {pulled: [...], failed: [...], recreated: [...]}
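// Illustrative call (host and auth omitted; response shape mirrors the
// refreshResult struct below):
//
//	POST /admin/workspace-images/refresh?runtime=hermes&recreate=false
//	→ {"pulled":["hermes"],"failed":[],"recreated":[]}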
type AdminWorkspaceImagesHandler struct {
docker *dockerclient.Client
}
func NewAdminWorkspaceImagesHandler(docker *dockerclient.Client) *AdminWorkspaceImagesHandler {
return &AdminWorkspaceImagesHandler{docker: docker}
}
// allRuntimes is the canonical list mirroring docs/workspace-runtime-package.md.
// Update both when a new template is added.
var allRuntimes = []string{
"claude-code", "langgraph", "crewai", "autogen",
"deepagents", "hermes", "gemini-cli", "openclaw",
}
type refreshResult struct {
Pulled []string `json:"pulled"`
Failed []string `json:"failed"`
Recreated []string `json:"recreated"`
}
// ghcrAuthHeader returns the base64-encoded JSON auth payload Docker's
// ImagePull expects in PullOptions.RegistryAuth, or empty string when no
// GHCR_USER/GHCR_TOKEN env is set (lets public images pull through).
//
// The Docker SDK doesn't read ~/.docker/config.json — every authenticated
// pull needs an explicit RegistryAuth string. Format per the Docker
// engine API: {"username":"…","password":"…","serveraddress":"ghcr.io"}
// → base64url-encoded JSON, padding retained (the engine accepts either
// form).
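// Example (hypothetical credentials): GHCR_USER=alice, GHCR_TOKEN=ghp_xxx
// encode to base64url({"username":"alice","password":"ghp_xxx",
// "serveraddress":"ghcr.io"}), the same payload the unit tests decode
// and assert on.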
func ghcrAuthHeader() string {
user := strings.TrimSpace(os.Getenv("GHCR_USER"))
token := strings.TrimSpace(os.Getenv("GHCR_TOKEN"))
if user == "" || token == "" {
return ""
}
payload := map[string]string{
"username": user,
"password": token,
"serveraddress": "ghcr.io",
}
js, err := json.Marshal(payload)
if err != nil {
// Should be unreachable for a static map[string]string. Log so a
// future contributor adding a non-marshallable field notices.
log.Printf("workspace-images: failed to marshal GHCR auth: %v", err)
return ""
}
return base64.URLEncoding.EncodeToString(js)
}
func (h *AdminWorkspaceImagesHandler) Refresh(c *gin.Context) {
runtimes := allRuntimes
if r := c.Query("runtime"); r != "" {
// Accept a single runtime; reject anything not in the canonical list
// so a typo doesn't silently no-op.
found := false
for _, known := range allRuntimes {
if known == r {
found = true
break
}
}
if !found {
c.JSON(http.StatusBadRequest, gin.H{
"error": fmt.Sprintf("unknown runtime: %s", r),
"known_runtimes": allRuntimes,
})
return
}
runtimes = []string{r}
}
recreate := c.DefaultQuery("recreate", "true") == "true"
res := refreshResult{Pulled: []string{}, Failed: []string{}, Recreated: []string{}}
auth := ghcrAuthHeader()
// 1. Pull each template image via the Docker SDK. Soft-fail per-runtime
// so one missing image (e.g. unpublished template) doesn't abort
// the others. Each pull's progress stream is drained to completion
// — the engine treats early-close as "abandon", leaving partial
// layers around with no reference.
pullCtx, cancel := context.WithTimeout(c.Request.Context(), 5*time.Minute)
defer cancel()
for _, rt := range runtimes {
image := fmt.Sprintf("ghcr.io/molecule-ai/workspace-template-%s:latest", rt)
opts := dockerimage.PullOptions{Platform: provisioner.DefaultImagePlatform()}
if auth != "" {
opts.RegistryAuth = auth
}
rc, err := h.docker.ImagePull(pullCtx, image, opts)
if err != nil {
log.Printf("workspace-images/refresh: pull %s failed: %v", rt, err)
res.Failed = append(res.Failed, rt)
continue
}
// Drain to completion. We discard progress payload because no
// caller renders it; the platform log already records pulled/failed
// per runtime. If a future caller wants live progress, decode the
// JSON-line stream into events here.
if _, err := io.Copy(io.Discard, rc); err != nil {
rc.Close()
log.Printf("workspace-images/refresh: drain %s failed: %v", rt, err)
res.Failed = append(res.Failed, rt)
continue
}
rc.Close()
res.Pulled = append(res.Pulled, rt)
}
if !recreate {
c.JSON(http.StatusOK, res)
return
}
// 2. Find ws-* containers running an image we just pulled. Recreate
// them — kill+remove and let the platform's normal provisioning
// flow re-create on next canvas interaction.
listCtx, listCancel := context.WithTimeout(c.Request.Context(), 30*time.Second)
defer listCancel()
containers, err := h.docker.ContainerList(listCtx, container.ListOptions{
All: true,
Filters: filters.NewArgs(filters.Arg("name", "ws-")),
})
if err != nil {
log.Printf("workspace-images/refresh: container list failed: %v", err)
c.JSON(http.StatusInternalServerError, gin.H{"error": "container list failed", "partial_result": res})
return
}
pulledSet := map[string]struct{}{}
for _, rt := range res.Pulled {
pulledSet[rt] = struct{}{}
}
for _, ctr := range containers {
// ContainerList's ctr.Image may surface as the resolved digest
// (sha256:…) rather than the human-readable tag once a fresh pull
// has moved the tag out from under the container. Use ContainerInspect
// to get the original Config.Image (e.g. "ghcr.io/molecule-ai/workspace-
// template-claude-code:latest") so we can match against the
// pulled-runtime set. The cost is one extra round-trip per
// ws-* container — typically at most 8, so this is
// well below any UX threshold.
inspectCtx, inspectCancel := context.WithTimeout(c.Request.Context(), 10*time.Second)
full, err := h.docker.ContainerInspect(inspectCtx, ctr.ID)
inspectCancel()
if err != nil {
log.Printf("workspace-images/refresh: inspect %s failed: %v", ctr.ID[:12], err)
continue
}
imageRef := ""
if full.Config != nil {
imageRef = full.Config.Image
}
matched := ""
for rt := range pulledSet {
if strings.Contains(imageRef, "workspace-template-"+rt) {
matched = rt
break
}
}
if matched == "" {
continue
}
name := strings.TrimPrefix(ctr.Names[0], "/")
// Remove with force — the workspace will re-provision on the next
// canvas interaction. This drops in-flight conversations on the
// removed container; the "recreated" list in the response tells callers
// which workspaces were affected so they can schedule the refresh during
// a quiet window.
rmCtx, rmCancel := context.WithTimeout(c.Request.Context(), 30*time.Second)
err = h.docker.ContainerRemove(rmCtx, ctr.ID, container.RemoveOptions{Force: true})
rmCancel()
if err != nil {
log.Printf("workspace-images/refresh: remove %s failed: %v", name, err)
continue
}
res.Recreated = append(res.Recreated, name)
}
authStatus := "no GHCR auth (public images only)"
if auth != "" {
authStatus = "GHCR_USER/GHCR_TOKEN auth"
}
log.Printf("workspace-images/refresh: pulled=%d failed=%d recreated=%d (%s)",
len(res.Pulled), len(res.Failed), len(res.Recreated), authStatus)
c.JSON(http.StatusOK, res)
}

View File

@ -0,0 +1,73 @@
package handlers
import (
"encoding/base64"
"encoding/json"
"testing"
)
func TestGHCRAuthHeader_NoEnvReturnsEmpty(t *testing.T) {
t.Setenv("GHCR_USER", "")
t.Setenv("GHCR_TOKEN", "")
if got := ghcrAuthHeader(); got != "" {
t.Errorf("expected empty (no auth → public-only), got %q", got)
}
}
func TestGHCRAuthHeader_PartialEnvReturnsEmpty(t *testing.T) {
// Both must be set — defensive against half-configured env.
t.Setenv("GHCR_USER", "alice")
t.Setenv("GHCR_TOKEN", "")
if got := ghcrAuthHeader(); got != "" {
t.Errorf("user-only env should disable auth, got %q", got)
}
t.Setenv("GHCR_USER", "")
t.Setenv("GHCR_TOKEN", "fake-tok-xxx")
if got := ghcrAuthHeader(); got != "" {
t.Errorf("token-only env should disable auth, got %q", got)
}
}
func TestGHCRAuthHeader_EncodesDockerEnginePayload(t *testing.T) {
t.Setenv("GHCR_USER", "alice")
t.Setenv("GHCR_TOKEN", "fake-tok-value")
got := ghcrAuthHeader()
if got == "" {
t.Fatal("expected non-empty auth header")
}
raw, err := base64.URLEncoding.DecodeString(got)
if err != nil {
t.Fatalf("auth header is not valid base64-url: %v", err)
}
var payload map[string]string
if err := json.Unmarshal(raw, &payload); err != nil {
t.Fatalf("decoded auth is not valid JSON: %v (raw=%s)", err, raw)
}
if payload["username"] != "alice" {
t.Errorf("username: got %q, want alice", payload["username"])
}
if payload["password"] != "fake-tok-value" {
t.Errorf("password: got %q, want fake-tok-value", payload["password"])
}
if payload["serveraddress"] != "ghcr.io" {
t.Errorf("serveraddress: got %q, want ghcr.io", payload["serveraddress"])
}
}
func TestGHCRAuthHeader_TrimsWhitespace(t *testing.T) {
// .env lines often have trailing newlines or accidental spaces. Without
// trimming, a stray space would produce an auth payload the engine
// rejects with a confusing 401.
t.Setenv("GHCR_USER", " alice ")
t.Setenv("GHCR_TOKEN", "\tfake-tok-value\n")
got := ghcrAuthHeader()
raw, _ := base64.URLEncoding.DecodeString(got)
var payload map[string]string
_ = json.Unmarshal(raw, &payload)
if payload["username"] != "alice" {
t.Errorf("username not trimmed: got %q", payload["username"])
}
if payload["password"] != "fake-tok-value" {
t.Errorf("password not trimmed: got %q", payload["password"])
}
}

View File

@ -78,13 +78,21 @@ func (h *DelegationHandler) Delegate(c *gin.Context) {
// reason (logged); we still dispatch the A2A request and surface the
// warning in the response.
// Build A2A payload
// Build A2A payload. Embedding delegation_id in metadata gives the
// queue drain path a way to look up the originating delegation row
// when stitching the response back (issue: previously the drain
// dispatched successfully but discarded the response, so
// check_task_status returned status='queued' forever even after a
// real reply landed). messageId mirrors delegation_id so the
// platform's idempotency-key extraction also keys off the same id.
a2aBody, _ := json.Marshal(map[string]interface{}{
"method": "message/send",
"params": map[string]interface{}{
"message": map[string]interface{}{
"role": "user",
"parts": []map[string]interface{}{{"type": "text", "text": body.Task}},
"role": "user",
"messageId": delegationID,
"parts": []map[string]interface{}{{"type": "text", "text": body.Task}},
"metadata": map[string]interface{}{"delegation_id": delegationID},
},
},
})
@ -284,6 +292,40 @@ func (h *DelegationHandler) executeDelegation(sourceID, targetID, delegationID s
return
}
// 202 + {queued: true} means the target was busy and the proxy
// enqueued the request for the next drain tick — NOT a completion.
// Treat it as such: write a clean 'queued' activity row with no
// JSON-as-text leakage into the summary, broadcast a status update,
// and return. DrainQueueForWorkspace later stitches the real reply back
// into this row (flipping it to 'completed'); until then callers polling
// check_task_status see status='queued' and know to retry instead of
// believing the queued JSON is the agent's reply. Fixes the chat-leak
// where the LLM echoed "Delegation completed (workspace agent busy ...)" to
// the user.
if status == http.StatusAccepted && isQueuedProxyResponse(respBody) {
log.Printf("Delegation %s: target %s busy — queued for drain", delegationID, targetID)
h.updateDelegationStatus(sourceID, delegationID, "queued", "")
// Store delegation_id in response_body so DrainQueueForWorkspace's
// stitch step can find this row by JSON-path key after the queued
// dispatch eventually succeeds. Without the key, the drain finds
// the row by (workspace_id, target_id, method) but can't tell
// multiple-queued-delegations-to-same-target apart.
queuedJSON, _ := json.Marshal(map[string]interface{}{
"delegation_id": delegationID,
"queued": true,
})
if _, err := db.DB.ExecContext(ctx, `
INSERT INTO activity_logs (workspace_id, activity_type, method, source_id, target_id, summary, response_body, status)
VALUES ($1, 'delegation', 'delegate_result', $2, $3, $4, $5::jsonb, 'queued')
`, sourceID, sourceID, targetID, "Delegation queued — target at capacity", string(queuedJSON)); err != nil {
log.Printf("Delegation %s: failed to insert queued log: %v", delegationID, err)
}
h.broadcaster.RecordAndBroadcast(ctx, "DELEGATION_STATUS", sourceID, map[string]interface{}{
"delegation_id": delegationID, "target_id": targetID, "status": "queued",
})
return
}
// A2A returned 200 — target received and processed the task
// Status: dispatched → received → completed (we don't have a separate "received" signal from the target yet)
responseText := extractResponseText(respBody)
@ -517,6 +559,21 @@ func isTransientProxyError(err *proxyA2AError) bool {
return false
}
// isQueuedProxyResponse reports whether the proxy returned a body shaped like
// `{"queued": true, "queue_id": ..., "queue_depth": ..., "message": ...}` —
// the busy-target enqueue path in a2a_proxy_helpers.go. Caller checks this
// alongside HTTP 202 to distinguish a successful agent reply from a deferred
// dispatch; without the distinction we'd write the queued-message JSON into
// the delegation result row and the LLM would surface it as agent output.
func isQueuedProxyResponse(body []byte) bool {
var resp map[string]interface{}
if json.Unmarshal(body, &resp) != nil {
return false
}
queued, _ := resp["queued"].(bool)
return queued
}
func extractResponseText(body []byte) string {
var resp map[string]interface{}
if json.Unmarshal(body, &resp) != nil {

View File

@ -376,6 +376,39 @@ func TestIsTransientProxyError_RetriesOnRestartRaceStatuses(t *testing.T) {
}
}
func TestIsQueuedProxyResponse(t *testing.T) {
// Regression guard for the chat-leak bug: when the proxy returns
// 202 with a queued-shape body, executeDelegation must classify it
// as "queued" — not "completed". Mis-classifying it causes the
// queued JSON to land in activity_logs.summary, which the LLM then
// echoes verbatim into the agent chat as
// "Delegation completed: Delegation completed (workspace agent
// busy — request queued, will dispatch...)".
cases := []struct {
name string
body string
want bool
}{
{
name: "real proxy busy-enqueue body",
body: `{"queued":true,"queue_id":"d0993390-5f5a-4f5d-90a2-66639e53e3c9","queue_depth":1,"message":"workspace agent busy — request queued, will dispatch when capacity available"}`,
want: true,
},
{"queued false explicitly", `{"queued":false}`, false},
{"queued field absent (real A2A reply)", `{"jsonrpc":"2.0","id":"1","result":{"kind":"message","parts":[{"kind":"text","text":"hi"}]}}`, false},
{"non-bool queued value (defensive)", `{"queued":"true"}`, false},
{"malformed JSON", `not-json`, false},
{"empty body", ``, false},
}
for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
if got := isQueuedProxyResponse([]byte(tc.body)); got != tc.want {
t.Errorf("isQueuedProxyResponse(%q) = %v, want %v", tc.body, got, tc.want)
}
})
}
}
func TestDelegationRetryDelay_IsSaneWindow(t *testing.T) {
// Regression guard: the retry delay must be long enough for the
// reactive URL refresh in proxyA2ARequest to kick in (which involves

View File

@ -102,6 +102,19 @@ func discoverHostPeer(ctx context.Context, c *gin.Context, targetID string) {
return
}
// #1484 SSRF defense-in-depth: the URL came from the workspaces table
// without any per-write validation (workspace runtimes register their
// own URLs via /registry/register, and a misbehaving / compromised
// runtime could register a 169.254.169.254 metadata URL). Validate
// before handing it to the caller, who might dispatch HTTP against it.
// Currently gated by the bearer-required Discover handler, but this
// guard makes discoverHostPeer safe regardless of upstream auth shape.
if err := isSafeURL(url.String); err != nil {
log.Printf("Discovery: rejecting unsafe registered URL for %s (#1484): %v", resolvedID, err)
c.JSON(http.StatusServiceUnavailable, gin.H{"error": "workspace URL failed safety check", "status": status})
return
}
db.CacheURL(ctx, resolvedID, url.String)
c.JSON(http.StatusOK, gin.H{
"id": resolvedID,
@ -172,11 +185,31 @@ func writeExternalWorkspaceURL(ctx context.Context, c *gin.Context, callerID, ta
outURL = strings.Replace(outURL, "127.0.0.1", "host.docker.internal", 1)
outURL = strings.Replace(outURL, "localhost", "host.docker.internal", 1)
}
// #1484 SSRF defense-in-depth — same rationale as discoverHostPeer.
// We validate the post-rewrite URL because the rewrite changes which
// host the caller would dispatch against (host.docker.internal is
// only reachable inside a docker network; isSafeURL accepts it but
// blocks a metadata IP that survived the rewrite untouched).
if err := isSafeURL(outURL); err != nil {
log.Printf("Discovery: rejecting unsafe external workspace URL for %s (#1484): %v", targetID, err)
c.JSON(http.StatusServiceUnavailable, gin.H{"error": "workspace URL failed safety check"})
return true
}
c.JSON(http.StatusOK, gin.H{"id": targetID, "url": outURL, "name": wsName})
return true
}
// Peers handles GET /registry/:id/peers
//
// Optional ``?q=<substring>`` filters the result by case-insensitive
// substring match against ``name`` or ``role`` (#1038). Filtering is done
// in Go after the DB read — keeps the SQL identical to the no-filter path
// (no injection risk, no DB-driver collation surprises) at the cost of
// loading the unfiltered set first. Acceptable because the peer set is
// always bounded by the small fanout of a single workspace's parent +
// children + siblings (typically <50 rows).
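// Example (peer names taken from this handler's tests): GET
// /registry/ws-self/peers?q=design returns only peers whose name or role
// contains "design" case-insensitively, e.g. a "Beta Designer" peer.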
func (h *DiscoveryHandler) Peers(c *gin.Context) {
workspaceID := c.Param("id")
ctx := c.Request.Context()
@ -241,12 +274,34 @@ func (h *DiscoveryHandler) Peers(c *gin.Context) {
peers = append(peers, parent...)
}
peers = filterPeersByQuery(peers, c.Query("q"))
if peers == nil {
peers = make([]map[string]interface{}, 0)
}
c.JSON(http.StatusOK, peers)
}
// filterPeersByQuery returns peers whose name or role case-insensitively
// contains q. Whitespace-trimmed empty q is a no-op (returns input unchanged).
func filterPeersByQuery(peers []map[string]interface{}, q string) []map[string]interface{} {
q = strings.TrimSpace(q)
if q == "" {
return peers
}
needle := strings.ToLower(q)
out := make([]map[string]interface{}, 0, len(peers))
for _, p := range peers {
name := p["name"].(string)
role := p["role"].(string)
if strings.Contains(strings.ToLower(name), needle) ||
strings.Contains(strings.ToLower(role), needle) {
out = append(out, p)
}
}
return out
}
// queryPeerMaps returns clean JSON-serializable maps instead of Workspace structs.
func queryPeerMaps(query string, args ...interface{}) ([]map[string]interface{}, error) {
rows, err := db.DB.Query(query, args...)

View File

@ -7,6 +7,7 @@ import (
"encoding/json"
"net/http"
"net/http/httptest"
"strings"
"testing"
"github.com/DATA-DOG/go-sqlmock"
@ -258,6 +259,149 @@ func TestPeers_RootWorkspace_NoPeers(t *testing.T) {
}
}
// ==================== Peers — ?q= filter (#1038) ====================
// peersFilterFixture mocks the 4 SQL reads (parent_id lookup + siblings +
// children + parent) with a known 3-peer set so each q-filter test can
// focus on the post-fetch substring-match behaviour. Returns the handler
// and the live mock so callers can assert ExpectationsWereMet at the end.
func peersFilterFixture(t *testing.T) (*DiscoveryHandler, sqlmock.Sqlmock) {
t.Helper()
mock := setupTestDB(t)
setupTestRedis(t)
mock.ExpectQuery("SELECT parent_id FROM workspaces WHERE id =").
WithArgs("ws-self").
WillReturnRows(sqlmock.NewRows([]string{"parent_id"}).AddRow("ws-pm"))
cols := []string{"id", "name", "role", "tier", "status", "agent_card", "url", "parent_id", "active_tasks"}
mock.ExpectQuery("SELECT w.id, w.name.*WHERE w.parent_id = \\$1 AND w.id != \\$2").
WithArgs("ws-pm", "ws-self").
WillReturnRows(sqlmock.NewRows(cols).
AddRow("ws-alpha", "Alpha Researcher", "researcher", 1, "online", []byte("null"), "http://a", "ws-pm", 0).
AddRow("ws-beta", "Beta Designer", "designer", 1, "online", []byte("null"), "http://b", "ws-pm", 0))
mock.ExpectQuery("SELECT w.id, w.name.*WHERE w.parent_id = \\$1 AND w.status").
WithArgs("ws-self").
WillReturnRows(sqlmock.NewRows(cols))
mock.ExpectQuery("SELECT w.id, w.name.*WHERE w.id = \\$1 AND w.status").
WithArgs("ws-pm").
WillReturnRows(sqlmock.NewRows(cols).
AddRow("ws-pm", "PM Workspace", "manager", 2, "online", []byte("null"), "http://pm", nil, 1))
return NewDiscoveryHandler(), mock
}
// runPeersWithQuery invokes Peers and returns BOTH the parsed peers and
// the raw response body. The raw body is needed by TestPeers_Q_NoMatches
// to distinguish JSON `[]` (intended) from `null` (regression of the
// nil-guard at line 254-256) — once unmarshalled, both collapse to
// len==0 and re-marshal to `[]`, so checking only the parsed value is
// tautological.
func runPeersWithQuery(t *testing.T, handler *DiscoveryHandler, q string) ([]map[string]interface{}, []byte) {
t.Helper()
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
c.Params = gin.Params{{Key: "id", Value: "ws-self"}}
url := "/registry/ws-self/peers"
if q != "" {
url += "?q=" + q
}
c.Request = httptest.NewRequest("GET", url, nil)
handler.Peers(c)
if w.Code != http.StatusOK {
t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String())
}
body := w.Body.Bytes()
var peers []map[string]interface{}
if err := json.Unmarshal(body, &peers); err != nil {
t.Fatalf("parse response: %v", err)
}
return peers, body
}
// peerIDSet returns the set of peer ids — order-independent comparison
// avoids fragile peers[0]["id"] asserts that would break spuriously on a
// future sort/order change.
func peerIDSet(peers []map[string]interface{}) map[string]struct{} {
out := make(map[string]struct{}, len(peers))
for _, p := range peers {
out[p["id"].(string)] = struct{}{}
}
return out
}
// TestPeers_QFilter table-drives the ?q= filter — assertions are
// append-order independent (set membership) so a future change to the
// peers ordering can't cause spurious failures here. NoMatches has its
// own raw-body check (see TestPeers_Q_NoMatches_RawBodyIsArrayNotNull
// below) because the `[]` vs `null` distinction collapses after
// json.Unmarshal.
func TestPeers_QFilter(t *testing.T) {
cases := []struct {
name string
q string
wantIDs []string
}{
{"no-q returns all", "", []string{"ws-alpha", "ws-beta", "ws-pm"}},
{"name match", "alpha", []string{"ws-alpha"}},
{"name match case-insensitive", "ALPHA", []string{"ws-alpha"}},
{"role match", "design", []string{"ws-beta"}},
{"no matches", "nonexistent", nil},
{"whitespace-only is no-op", "%20%20", []string{"ws-alpha", "ws-beta", "ws-pm"}},
}
for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
handler, mock := peersFilterFixture(t)
peers, _ := runPeersWithQuery(t, handler, tc.q)
got := peerIDSet(peers)
want := make(map[string]struct{}, len(tc.wantIDs))
for _, id := range tc.wantIDs {
want[id] = struct{}{}
}
if len(got) != len(want) {
t.Fatalf("len: got %d %v, want %d %v", len(got), keysOf(got), len(want), tc.wantIDs)
}
for id := range want {
if _, ok := got[id]; !ok {
t.Errorf("missing id %q (got %v, want %v)", id, keysOf(got), tc.wantIDs)
}
}
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("unmet sqlmock expectations: %v", err)
}
})
}
}
// TestPeers_Q_NoMatches_RawBodyIsArrayNotNull verifies the `peers = make(...)`
// nil-guard at the end of Peers — when filtering reduces the slice to
// non-nil-but-empty AND the original was nil, JSON must be `[]` not `null`.
// This is the assertion the previous TestPeers_Q_NoMatches falsely claimed
// to make: re-encoding an unmarshalled []map collapses null and [] both
// to []. The fix here checks the recorder body bytes BEFORE unmarshal.
func TestPeers_Q_NoMatches_RawBodyIsArrayNotNull(t *testing.T) {
handler, mock := peersFilterFixture(t)
_, body := runPeersWithQuery(t, handler, "nonexistent")
got := strings.TrimSpace(string(body))
if got != "[]" {
t.Errorf("raw body: got %q, want []", got)
}
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("unmet sqlmock expectations: %v", err)
}
}
func keysOf(m map[string]struct{}) []string {
out := make([]string, 0, len(m))
for k := range m {
out = append(out, k)
}
return out
}
// ==================== CheckAccess ====================
func TestCheckAccess_BadJSON(t *testing.T) {
@ -527,6 +671,94 @@ func TestWriteExternalWorkspaceURL_RewritesLocalhostForDockerCaller(t *testing.T
}
}
// --- #1484 SSRF defense-in-depth regression tests ---
// TestDiscoverHostPeer_RejectsMetadataIPURL pins the #1484 guard:
// even though discoverHostPeer is currently gated by a bearer-required
// Discover handler, the function MUST refuse to hand back a URL that
// resolves into the cloud-metadata range. setupTestDB disables SSRF
// for normal tests, so we re-enable it here for the duration of the
// case and provide a literal IP so the check doesn't depend on DNS.
func TestDiscoverHostPeer_RejectsMetadataIPURL(t *testing.T) {
mock := setupTestDB(t)
setupTestRedis(t)
restoreSSRF := setSSRFCheckForTest(true)
t.Cleanup(restoreSSRF)
mock.ExpectQuery(`SELECT url, status, forwarded_to FROM workspaces WHERE id =`).
WithArgs("ws-bad").
WillReturnRows(sqlmock.NewRows([]string{"url", "status", "forwarded_to"}).
AddRow("http://169.254.169.254/latest/meta-data/", "online", nil))
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
c.Request = httptest.NewRequest("GET", "/x", nil)
discoverHostPeer(context.Background(), c, "ws-bad")
if w.Code != http.StatusServiceUnavailable {
t.Errorf("expected 503 for metadata-IP URL, got %d: %s", w.Code, w.Body.String())
}
if !strings.Contains(w.Body.String(), "safety check") {
t.Errorf("response should mention 'safety check', got %s", w.Body.String())
}
}
// TestDiscoverHostPeer_AcceptsPublicURL is the positive counterpart —
// a routable hostname (literal public-range IP, no DNS dependency)
// passes through the new guard and returns 200. Without it, a guard
// that rejected every URL would still pass the rejection test above.
func TestDiscoverHostPeer_AcceptsPublicURL(t *testing.T) {
mock := setupTestDB(t)
setupTestRedis(t)
restoreSSRF := setSSRFCheckForTest(true)
t.Cleanup(restoreSSRF)
mock.ExpectQuery(`SELECT url, status, forwarded_to FROM workspaces WHERE id =`).
WithArgs("ws-good").
WillReturnRows(sqlmock.NewRows([]string{"url", "status", "forwarded_to"}).
AddRow("http://8.8.8.8/a2a", "online", nil))
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
c.Request = httptest.NewRequest("GET", "/x", nil)
discoverHostPeer(context.Background(), c, "ws-good")
if w.Code != http.StatusOK {
t.Errorf("expected 200 for public-IP URL, got %d: %s", w.Code, w.Body.String())
}
}
// TestWriteExternalWorkspaceURL_RejectsMetadataIPURL is the parallel
// guard for the external-runtime path. Same #1484 rationale as the
// host-peer test above; covers writeExternalWorkspaceURL specifically.
func TestWriteExternalWorkspaceURL_RejectsMetadataIPURL(t *testing.T) {
mock := setupTestDB(t)
setupTestRedis(t)
restoreSSRF := setSSRFCheckForTest(true)
t.Cleanup(restoreSSRF)
mock.ExpectQuery(`SELECT COALESCE\(url,''\) FROM workspaces WHERE id =`).
WithArgs("ws-ext").
WillReturnRows(sqlmock.NewRows([]string{"url"}).
AddRow("http://169.254.169.254/computeMetadata/v1/"))
// callerRuntime lookup happens before isSafeURL — must mock it.
mock.ExpectQuery(`SELECT COALESCE\(runtime,'langgraph'\) FROM workspaces WHERE id =`).
WithArgs("ws-caller").
WillReturnRows(sqlmock.NewRows([]string{"runtime"}).AddRow("langgraph"))
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
c.Request = httptest.NewRequest("GET", "/x", nil)
handled := writeExternalWorkspaceURL(context.Background(), c, "ws-caller", "ws-ext", "Bad WS")
if !handled {
t.Fatal("expected handled=true (the function did write a 503)")
}
if w.Code != http.StatusServiceUnavailable {
t.Errorf("expected 503 for metadata-IP URL, got %d: %s", w.Code, w.Body.String())
}
}
// --- discoverHostPeer smoke (currently unreachable via Discover) ---
func TestDiscoverHostPeer_Smoke_CacheHit(t *testing.T) {

View File

@ -0,0 +1,102 @@
package handlers
import (
"log"
"os"
"path/filepath"
"sync"
"gopkg.in/yaml.v3"
)
// runtimeProvisionTimeoutsCache holds the per-runtime provision-timeout
// values declared in template config.yaml manifests (#2054 phase 2).
// Lazy-init so the first workspace API request after process start pays
// the read cost (a few KB of yaml across ~50 templates) and every
// subsequent one is a map lookup.
//
// Cache lifetime = process lifetime. Templates only change on container
// rebuild + workspace-server restart, which already invalidates the
// in-memory state. A future template-hot-reload feature would need to
// refresh this cache; today there is no such hook.
//
// NOT SAFE for package-level reuse — sync.Once cannot be reset, so a
// shared singleton would lock tests out of trying multiple template
// fixtures. Tests construct fresh struct values; production code holds
// one per WorkspaceHandler instance.
type runtimeProvisionTimeoutsCache struct {
once sync.Once
m map[string]int // runtime → seconds
}
func (c *runtimeProvisionTimeoutsCache) get(configsDir string, runtime string) int {
c.once.Do(func() {
c.m = loadRuntimeProvisionTimeouts(configsDir)
})
return c.m[runtime]
}
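// Usage sketch (the field names below are illustrative, not the actual
// WorkspaceHandler layout): hold one cache value per handler and consult it
// per provision request, treating 0 as "no template override":
//
//	secs := h.provisionTimeouts.get(h.configsDir, "hermes")
//	if secs == 0 {
//		secs = canvasRuntimeProfileDefault // hypothetical fallback constant
//	}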
// walkTemplateConfigs invokes fn(id, configBytes) for every immediate
// subdirectory of configsDir that has a readable config.yaml. Bad reads
// and empty dirs are silently skipped — the caller decides whether
// missing fields warrant the slot. A bad configsDir logs once and
// returns without invoking fn, matching the "start clean with no
// templates" semantics.
//
// Shared between the templates.List handler (which decodes the full
// templateSummary) and loadRuntimeProvisionTimeouts (which decodes the
// narrow runtime-timeout subset). Centralising the walk means a future
// template-discovery rule (subdir naming convention, README sentinel,
// etc.) lands in one place.
func walkTemplateConfigs(configsDir string, fn func(id string, configBytes []byte)) {
entries, err := os.ReadDir(configsDir)
if err != nil {
log.Printf("walkTemplateConfigs: read configsDir %s: %v", configsDir, err)
return
}
for _, e := range entries {
if !e.IsDir() {
continue
}
data, err := os.ReadFile(filepath.Join(configsDir, e.Name(), "config.yaml"))
if err != nil {
continue
}
fn(e.Name(), data)
}
}
// loadRuntimeProvisionTimeouts returns a map of runtime → seconds for
// templates that declared `runtime_config.provision_timeout_seconds`.
//
// Templates without the field aren't represented (lookup returns zero,
// which the caller treats as "fall through to canvas runtime profile").
//
// Multiple templates with the same runtime: take the MAX timeout — a
// slow template's threshold should win over a fast template's so users
// of either template see a true-positive timeout signal rather than a
// false alarm. Same-runtime divergence is rare in practice (typically
// one canonical template-{runtime} per runtime) but the rule is the
// safer default.
func loadRuntimeProvisionTimeouts(configsDir string) map[string]int {
out := map[string]int{}
walkTemplateConfigs(configsDir, func(_ string, data []byte) {
var raw struct {
Runtime string `yaml:"runtime"`
RuntimeConfig struct {
ProvisionTimeoutSeconds int `yaml:"provision_timeout_seconds"`
} `yaml:"runtime_config"`
}
if err := yaml.Unmarshal(data, &raw); err != nil {
return
}
secs := raw.RuntimeConfig.ProvisionTimeoutSeconds
if secs <= 0 || raw.Runtime == "" {
return
}
if existing, ok := out[raw.Runtime]; !ok || secs > existing {
out[raw.Runtime] = secs
}
})
return out
}

View File

@ -0,0 +1,128 @@
package handlers
import (
"os"
"path/filepath"
"testing"
)
// writeTemplate is a tiny test fixture: drop a config.yaml under
// tmp/<dir>/config.yaml with the given content. Mirrors the real
// configsDir layout (one subdir per template, each with its own
// config.yaml).
func writeTemplate(t *testing.T, dir, name, content string) {
t.Helper()
p := filepath.Join(dir, name)
if err := os.MkdirAll(p, 0o755); err != nil {
t.Fatalf("mkdir %s: %v", p, err)
}
if err := os.WriteFile(filepath.Join(p, "config.yaml"), []byte(content), 0o644); err != nil {
t.Fatalf("write config.yaml: %v", err)
}
}
func TestLoadRuntimeProvisionTimeouts_HappyPath(t *testing.T) {
dir := t.TempDir()
writeTemplate(t, dir, "template-hermes", `
name: Hermes
runtime: hermes
runtime_config:
provision_timeout_seconds: 720
`)
writeTemplate(t, dir, "template-claude-code", `
name: Claude
runtime: claude-code
runtime_config:
model: anthropic:claude-opus
`)
got := loadRuntimeProvisionTimeouts(dir)
if got["hermes"] != 720 {
t.Errorf("hermes: got %d, want 720", got["hermes"])
}
// claude-code didn't declare a timeout — must not appear in the map
// (zero-value lookup is the no-override signal).
if _, ok := got["claude-code"]; ok {
t.Errorf("claude-code: present without declaration: %d", got["claude-code"])
}
}
func TestLoadRuntimeProvisionTimeouts_MaxOnDuplicateRuntime(t *testing.T) {
dir := t.TempDir()
writeTemplate(t, dir, "template-hermes-fast", `
runtime: hermes
runtime_config:
provision_timeout_seconds: 300
`)
writeTemplate(t, dir, "template-hermes-slow", `
runtime: hermes
runtime_config:
provision_timeout_seconds: 900
`)
got := loadRuntimeProvisionTimeouts(dir)
// Max wins so the slowest template's threshold doesn't false-alarm
// when both templates use the same runtime.
if got["hermes"] != 900 {
t.Errorf("max-on-duplicate: got %d, want 900", got["hermes"])
}
}
func TestLoadRuntimeProvisionTimeouts_SkipsBadInputs(t *testing.T) {
dir := t.TempDir()
// Missing runtime field — has timeout but no key to map under.
writeTemplate(t, dir, "template-no-runtime", `
runtime_config:
provision_timeout_seconds: 600
`)
// Zero/negative timeout — same as no declaration.
writeTemplate(t, dir, "template-zero", `
runtime: zero-runtime
runtime_config:
provision_timeout_seconds: 0
`)
// Malformed yaml — must not crash.
writeTemplate(t, dir, "template-bad", "not: valid: yaml: at: all:")
// Loose file at the top level (not a dir) — must be ignored.
if err := os.WriteFile(filepath.Join(dir, "stray.txt"), []byte("ignore me"), 0o644); err != nil {
t.Fatal(err)
}
got := loadRuntimeProvisionTimeouts(dir)
if len(got) != 0 {
t.Errorf("expected empty map for skip cases, got %v", got)
}
}
func TestLoadRuntimeProvisionTimeouts_MissingDirReturnsEmpty(t *testing.T) {
got := loadRuntimeProvisionTimeouts("/nonexistent/path/should/not/exist/12345")
if len(got) != 0 {
t.Errorf("expected empty map on missing dir, got %v", got)
}
}
func TestRuntimeProvisionTimeoutsCache_LazyInitAndCached(t *testing.T) {
dir := t.TempDir()
writeTemplate(t, dir, "template-hermes", `
runtime: hermes
runtime_config:
provision_timeout_seconds: 720
`)
c := runtimeProvisionTimeoutsCache{}
// First call populates.
if got := c.get(dir, "hermes"); got != 720 {
t.Errorf("first call: got %d, want 720", got)
}
// Second call hits cache — even if the underlying file changed we
// still see the original value (sync.Once contract).
if err := os.WriteFile(filepath.Join(dir, "template-hermes", "config.yaml"),
[]byte("runtime: hermes\nruntime_config:\n provision_timeout_seconds: 60\n"), 0o644); err != nil {
t.Fatal(err)
}
if got := c.get(dir, "hermes"); got != 720 {
t.Errorf("cached call: got %d, want 720 (cache must not re-read)", got)
}
// Unknown runtime returns zero — caller's signal to fall through to
// the canvas runtime profile default.
if got := c.get(dir, "unknown"); got != 0 {
t.Errorf("unknown runtime: got %d, want 0", got)
}
}

View File

@ -127,7 +127,16 @@ var testAllowLoopback = false
// container deployments the relaxation is off and every private range
// stays blocked.
func isPrivateOrMetadataIP(ip net.IP) bool {
saas := saasMode()
// MOLECULE_ENV=development is the dev-host pattern: platform and
// workspace containers share a docker bridge network (172.18.0.0/16,
// RFC-1918). Treat that the same as SaaS for private-range relaxation
// — both share the "trusted intra-network routing" property. Without
// this, every workspace registration via docker-internal hostname
// resolves to 172.18.x.x and gets rejected as
// "workspace URL is not publicly routable", breaking the entire
// docker-compose dev loop. Always-blocked categories (metadata link-
// local, TEST-NET, CGNAT) remain blocked regardless.
saas := saasMode() || devModeAllowsLoopback()
// IPv4 path.
if ip4 := ip.To4(); ip4 != nil {

View File

@ -387,6 +387,37 @@ func TestIsSafeURL_SaaSMode_StillBlocksMetadataEtAl(t *testing.T) {
}
}
// TestIsSafeURL_DevMode_AllowsRFC1918 pins the dev-mode RFC-1918 + ULA
// relaxation that #2103 widened. The dev-host docker-compose pattern
// puts the platform + workspaces on the same docker bridge (172.18.0.0/16),
// so workspace registration via 172.18.x.x must NOT be rejected in dev.
// SaaS already allowed this; dev mode now matches via
// `saas := saasMode() || devModeAllowsLoopback()` in isPrivateOrMetadataIP.
//
// Without this test, a future refactor that quietly drops the
// `|| devModeAllowsLoopback()` from line 130 wouldn't trip any test —
// the existing `TestIsSafeURL_DevMode_StillBlocksOtherRanges` only
// pins the security floor (metadata / TEST-NET / CGNAT), not the
// behavior change.
func TestIsSafeURL_DevMode_AllowsRFC1918(t *testing.T) {
// Make sure saasMode() returns false so the test exercises the
// devModeAllowsLoopback() branch specifically — not a SaaS-mode pass.
t.Setenv("MOLECULE_DEPLOY_MODE", "self-hosted")
t.Setenv("MOLECULE_ORG_ID", "")
t.Setenv("MOLECULE_ENV", "development")
for _, url := range []string{
"http://10.1.2.3/agent",
"http://172.18.0.42:8000/a2a", // the docker-compose case from the issue
"http://192.168.1.100/agent",
"http://[fd00::1]/agent", // IPv6 ULA fd00::/8 also relaxed
} {
if err := isSafeURL(url); err != nil {
t.Errorf("isSafeURL(%q) in dev mode: got %v, want nil", url, err)
}
}
}
// TestIsSafeURL_StrictMode_BlocksRFC1918 is the strict-mode counterpart to
// TestIsSafeURL_SaaSMode_AllowsRFC1918. In self-hosted / single-container
// deployments there is no legitimate reason to reach RFC-1918 agents, so the

View File

@ -61,6 +61,13 @@ type templateSummary struct {
RequiredEnv []string `json:"required_env,omitempty"`
Skills []string `json:"skills"`
SkillCount int `json:"skill_count"`
// ProvisionTimeoutSeconds lets a slow runtime declare its expected
// cold-boot duration in its template manifest. Canvas's
// ProvisioningTimeout banner respects this per-workspace via the
// `provision_timeout_ms` field in the workspace API response (#2054).
// 0 = template hasn't declared one, falls through to canvas's
// runtime-profile default.
ProvisionTimeoutSeconds int `json:"provision_timeout_seconds,omitempty"`
}
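A hedged sketch of the fall-through rule the comment above describes. The real resolution happens on the canvas side, not in Go, so the function and parameter names here are illustrative:
// A declared per-template timeout wins; zero means "not declared", so
// the runtime-profile default applies instead.
func resolveProvisionTimeoutMs(declaredMs, runtimeProfileDefaultMs int) int {
	if declaredMs > 0 {
		return declaredMs
	}
	return runtimeProfileDefaultMs
}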
// resolveTemplateDir finds the template directory for a workspace on the host.
@ -80,24 +87,8 @@ func (h *TemplatesHandler) resolveTemplateDir(wsName string) string {
// List handles GET /templates
func (h *TemplatesHandler) List(c *gin.Context) {
entries, err := os.ReadDir(h.configsDir)
if err != nil {
c.JSON(http.StatusOK, []templateSummary{})
return
}
templates := make([]templateSummary, 0)
for _, entry := range entries {
if !entry.IsDir() {
continue
}
configPath := filepath.Join(h.configsDir, entry.Name(), "config.yaml")
data, err := os.ReadFile(configPath)
if err != nil {
continue
}
walkTemplateConfigs(h.configsDir, func(id string, data []byte) {
var raw struct {
Name string `yaml:"name"`
Description string `yaml:"description"`
@ -106,13 +97,14 @@ func (h *TemplatesHandler) List(c *gin.Context) {
Model string `yaml:"model"`
Skills []string `yaml:"skills"`
RuntimeConfig struct {
Model string `yaml:"model"`
Models []modelSpec `yaml:"models"`
RequiredEnv []string `yaml:"required_env"`
Model string `yaml:"model"`
Models []modelSpec `yaml:"models"`
RequiredEnv []string `yaml:"required_env"`
ProvisionTimeoutSeconds int `yaml:"provision_timeout_seconds"`
} `yaml:"runtime_config"`
}
if err := yaml.Unmarshal(data, &raw); err != nil {
continue
return
}
// Model comes from either top-level (legacy) or runtime_config.model (current).
@ -122,18 +114,19 @@ func (h *TemplatesHandler) List(c *gin.Context) {
}
templates = append(templates, templateSummary{
ID: entry.Name(),
Name: raw.Name,
Description: raw.Description,
Tier: raw.Tier,
Runtime: raw.Runtime,
Model: model,
Models: raw.RuntimeConfig.Models,
RequiredEnv: raw.RuntimeConfig.RequiredEnv,
Skills: raw.Skills,
SkillCount: len(raw.Skills),
ID: id,
Name: raw.Name,
Description: raw.Description,
Tier: raw.Tier,
Runtime: raw.Runtime,
Model: model,
Models: raw.RuntimeConfig.Models,
RequiredEnv: raw.RuntimeConfig.RequiredEnv,
Skills: raw.Skills,
SkillCount: len(raw.Skills),
ProvisionTimeoutSeconds: raw.RuntimeConfig.ProvisionTimeoutSeconds,
})
}
})
c.JSON(http.StatusOK, templates)
}

View File

@ -0,0 +1,331 @@
package handlers
// Sqlmock-backed coverage for tokens.go. Closes #1819.
//
// The existing tokens_test.go uses the real `db.DB` and t.Skip's when
// the test DB isn't reachable — which is the default in CI, so the
// file shows 0% coverage. This file substitutes the package-level
// `db.DB` with a sqlmock instance so every code path (List, Create,
// Revoke + their error branches) is exercised in `go test` without
// any external dependency.
//
// What's covered:
// List — happy path, empty rows, scan failure, query error
// Create — rate-limited, IssueToken DB error, happy path
// Revoke — happy path, not found, DB error
//
// What's NOT covered here (intentional):
// - Wsauth/middleware-level cross-tenant gating: those are exercised
// by middleware/wsauth_middleware_test.go. The handler-level code
// trusts WorkspaceAuth has already gated the request.
// - The full IssueToken path's correctness (random bytes,
// base64 encoding, prefix derivation): wsauth/tokens_test.go owns
// that. Here we only verify the handler hands off + reports
// errors correctly.
import (
"context"
"database/sql"
"database/sql/driver"
"encoding/json"
"errors"
"net/http"
"net/http/httptest"
"testing"
"time"
"github.com/DATA-DOG/go-sqlmock"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
"github.com/gin-gonic/gin"
)
func init() { gin.SetMode(gin.TestMode) }
// withMockDB swaps `db.DB` for a sqlmock and returns the mock plus a
// restore func. Tests use this in place of setupTokenTestDB which
// skips on a missing real DB.
func withMockDB(t *testing.T) (sqlmock.Sqlmock, func()) {
t.Helper()
sqlDB, mock, err := sqlmock.New()
if err != nil {
t.Fatalf("sqlmock: %v", err)
}
prev := db.DB
db.DB = sqlDB
cleanup := func() {
db.DB = prev
_ = sqlDB.Close()
}
return mock, cleanup
}
// makeReq builds a recorder + Gin context with the given URL params,
// drives the handler, and returns the recorder. Centralised so each
// scenario is one-line setup + assertion.
func makeReq(t *testing.T, h gin.HandlerFunc, method, url string, params gin.Params) *httptest.ResponseRecorder {
t.Helper()
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
c.Request = httptest.NewRequest(method, url, nil)
c.Params = params
h(c)
return w
}
// ---- List ------------------------------------------------------------
func TestTokenHandler_List_HappyPath(t *testing.T) {
mock, cleanup := withMockDB(t)
defer cleanup()
created := time.Date(2026, 4, 1, 12, 0, 0, 0, time.UTC)
last := created.Add(time.Hour)
mock.ExpectQuery(`SELECT id, prefix, created_at, last_used_at\s+FROM workspace_auth_tokens`).
WithArgs("ws-1", 50, 0).
WillReturnRows(sqlmock.NewRows([]string{"id", "prefix", "created_at", "last_used_at"}).
AddRow("tok-1", "abc12345", created, last).
AddRow("tok-2", "def67890", created, nil))
w := makeReq(t, NewTokenHandler().List, "GET",
"/workspaces/ws-1/tokens", gin.Params{{Key: "id", Value: "ws-1"}})
if w.Code != http.StatusOK {
t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String())
}
var body struct {
Tokens []tokenListItem `json:"tokens"`
Count int `json:"count"`
}
if err := json.Unmarshal(w.Body.Bytes(), &body); err != nil {
t.Fatalf("decode: %v", err)
}
if body.Count != 2 || len(body.Tokens) != 2 {
t.Fatalf("count=%d tokens=%d, want 2/2", body.Count, len(body.Tokens))
}
if body.Tokens[0].ID != "tok-1" || body.Tokens[1].ID != "tok-2" {
t.Errorf("wrong order: %+v", body.Tokens)
}
if body.Tokens[1].LastUsed != nil {
t.Errorf("token-2 should have nil last_used_at, got %v", body.Tokens[1].LastUsed)
}
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("unmet: %v", err)
}
}
func TestTokenHandler_List_EmptyResult(t *testing.T) {
mock, cleanup := withMockDB(t)
defer cleanup()
mock.ExpectQuery(`SELECT id, prefix, created_at, last_used_at`).
WillReturnRows(sqlmock.NewRows([]string{"id", "prefix", "created_at", "last_used_at"}))
w := makeReq(t, NewTokenHandler().List, "GET",
"/workspaces/ws-2/tokens", gin.Params{{Key: "id", Value: "ws-2"}})
if w.Code != http.StatusOK {
t.Fatalf("expected 200 on empty list, got %d", w.Code)
}
var body struct {
Tokens []tokenListItem `json:"tokens"`
Count int `json:"count"`
}
_ = json.Unmarshal(w.Body.Bytes(), &body)
if body.Count != 0 || body.Tokens == nil {
// Tokens MUST be `[]` not `null` so callers iterating with
// `.length` or `for...of` don't NPE on JS.
t.Errorf("empty: count=%d, tokens=%v (want 0 + non-nil)", body.Count, body.Tokens)
}
}
func TestTokenHandler_List_QueryError(t *testing.T) {
mock, cleanup := withMockDB(t)
defer cleanup()
mock.ExpectQuery(`SELECT id, prefix, created_at, last_used_at`).
WillReturnError(errors.New("connection refused"))
w := makeReq(t, NewTokenHandler().List, "GET",
"/workspaces/ws-3/tokens", gin.Params{{Key: "id", Value: "ws-3"}})
if w.Code != http.StatusInternalServerError {
t.Errorf("query error must surface as 500, got %d", w.Code)
}
}
func TestTokenHandler_List_RespectsLimit(t *testing.T) {
mock, cleanup := withMockDB(t)
defer cleanup()
mock.ExpectQuery(`SELECT id, prefix, created_at, last_used_at`).
WithArgs("ws-1", 10, 5).
WillReturnRows(sqlmock.NewRows([]string{"id", "prefix", "created_at", "last_used_at"}))
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
c.Request = httptest.NewRequest("GET", "/workspaces/ws-1/tokens?limit=10&offset=5", nil)
c.Params = gin.Params{{Key: "id", Value: "ws-1"}}
NewTokenHandler().List(c)
if w.Code != http.StatusOK {
t.Errorf("limit/offset query: %d", w.Code)
}
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("limit/offset args not bound correctly: %v", err)
}
}
func TestTokenHandler_List_ScanError(t *testing.T) {
mock, cleanup := withMockDB(t)
defer cleanup()
// Inject a bad row that fails to Scan: pass non-time value where
// created_at expects time.Time. sqlmock surfaces this as a Scan err.
mock.ExpectQuery(`SELECT id, prefix, created_at, last_used_at`).
WillReturnRows(sqlmock.NewRows([]string{"id", "prefix", "created_at", "last_used_at"}).
AddRow("tok-1", "abc", "not-a-timestamp", nil))
w := makeReq(t, NewTokenHandler().List, "GET",
"/workspaces/ws-1/tokens", gin.Params{{Key: "id", Value: "ws-1"}})
if w.Code != http.StatusInternalServerError {
t.Errorf("scan error must surface as 500, got %d: %s", w.Code, w.Body.String())
}
}
// ---- Create ----------------------------------------------------------
func TestTokenHandler_Create_RateLimited(t *testing.T) {
mock, cleanup := withMockDB(t)
defer cleanup()
// Count query returns 50 (== max) → 429.
mock.ExpectQuery(`SELECT COUNT\(\*\) FROM workspace_auth_tokens`).
WithArgs("ws-1").
WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(50))
w := makeReq(t, NewTokenHandler().Create, "POST",
"/workspaces/ws-1/tokens", gin.Params{{Key: "id", Value: "ws-1"}})
if w.Code != http.StatusTooManyRequests {
t.Errorf("max active tokens should 429, got %d", w.Code)
}
}
func TestTokenHandler_Create_IssueFails(t *testing.T) {
mock, cleanup := withMockDB(t)
defer cleanup()
// Count = 0 → fall through to IssueToken, which does an INSERT
// into workspace_auth_tokens. Mock the INSERT to fail; handler
// surfaces as 500.
mock.ExpectQuery(`SELECT COUNT\(\*\) FROM workspace_auth_tokens`).
WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(0))
mock.ExpectExec(`INSERT INTO workspace_auth_tokens`).
WillReturnError(errors.New("disk full"))
w := makeReq(t, NewTokenHandler().Create, "POST",
"/workspaces/ws-1/tokens", gin.Params{{Key: "id", Value: "ws-1"}})
if w.Code != http.StatusInternalServerError {
t.Errorf("IssueToken DB error must 500, got %d", w.Code)
}
}
func TestTokenHandler_Create_HappyPath(t *testing.T) {
mock, cleanup := withMockDB(t)
defer cleanup()
mock.ExpectQuery(`SELECT COUNT\(\*\) FROM workspace_auth_tokens`).
WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(0))
mock.ExpectExec(`INSERT INTO workspace_auth_tokens`).
WillReturnResult(sqlmock.NewResult(1, 1))
w := makeReq(t, NewTokenHandler().Create, "POST",
"/workspaces/ws-1/tokens", gin.Params{{Key: "id", Value: "ws-1"}})
if w.Code != http.StatusCreated {
t.Fatalf("expected 201, got %d: %s", w.Code, w.Body.String())
}
var body struct {
AuthToken string `json:"auth_token"`
WorkspaceID string `json:"workspace_id"`
}
if err := json.Unmarshal(w.Body.Bytes(), &body); err != nil {
t.Fatalf("decode: %v", err)
}
if body.AuthToken == "" {
t.Errorf("auth_token must be present and non-empty in response")
}
if body.WorkspaceID != "ws-1" {
t.Errorf("workspace_id mismatch: %q", body.WorkspaceID)
}
}
// ---- Revoke ----------------------------------------------------------
func TestTokenHandler_Revoke_HappyPath(t *testing.T) {
mock, cleanup := withMockDB(t)
defer cleanup()
mock.ExpectExec(`UPDATE workspace_auth_tokens\s+SET revoked_at = now\(\)`).
WithArgs("tok-1", "ws-1").
WillReturnResult(sqlmock.NewResult(0, 1))
w := makeReq(t, NewTokenHandler().Revoke, "DELETE",
"/workspaces/ws-1/tokens/tok-1", gin.Params{
{Key: "id", Value: "ws-1"},
{Key: "tokenId", Value: "tok-1"},
})
if w.Code != http.StatusOK {
t.Fatalf("expected 200 on revoke, got %d: %s", w.Code, w.Body.String())
}
}
func TestTokenHandler_Revoke_NotFound(t *testing.T) {
mock, cleanup := withMockDB(t)
defer cleanup()
// 0 rows affected → token not found OR already revoked.
mock.ExpectExec(`UPDATE workspace_auth_tokens`).
WithArgs("tok-ghost", "ws-1").
WillReturnResult(sqlmock.NewResult(0, 0))
w := makeReq(t, NewTokenHandler().Revoke, "DELETE",
"/workspaces/ws-1/tokens/tok-ghost", gin.Params{
{Key: "id", Value: "ws-1"},
{Key: "tokenId", Value: "tok-ghost"},
})
if w.Code != http.StatusNotFound {
t.Errorf("revoke missing token must 404, got %d", w.Code)
}
}
func TestTokenHandler_Revoke_DBError(t *testing.T) {
mock, cleanup := withMockDB(t)
defer cleanup()
mock.ExpectExec(`UPDATE workspace_auth_tokens`).
WillReturnError(errors.New("conn lost"))
w := makeReq(t, NewTokenHandler().Revoke, "DELETE",
"/workspaces/ws-1/tokens/tok-1", gin.Params{
{Key: "id", Value: "ws-1"},
{Key: "tokenId", Value: "tok-1"},
})
if w.Code != http.StatusInternalServerError {
t.Errorf("DB error must 500, got %d", w.Code)
}
}
// Compile-time keep-alives: the imports list pulls in the sql /
// driver packages and context so a future scenario that needs them
// doesn't have to re-add the imports, and the blank assignments below
// keep the compiler from rejecting them as unused. Documented here so
// the apparent "unused import" dance isn't surprising.
var (
_ context.Context = context.Background()
_ driver.Value = (sql.RawBytes)(nil)
)

View File

@ -27,7 +27,13 @@ import (
)
type WorkspaceHandler struct {
broadcaster *events.Broadcaster
// broadcaster narrowed from `*events.Broadcaster` to the
// events.EventEmitter interface (#1814) so tests can substitute a
// capture-only stub without standing up the real Redis + WS-hub
// topology. Production callers still pass *events.Broadcaster, which
// satisfies the interface — see the compile-time assertion in
// internal/events/broadcaster.go.
broadcaster events.EventEmitter
provisioner *provisioner.Provisioner
cpProv *provisioner.CPProvisioner
platformURL string
@ -41,9 +47,13 @@ type WorkspaceHandler struct {
// calls made by HibernateWorkspace without requiring a running Docker daemon.
// Always nil in production; the real provisioner path is used when nil.
stopFnOverride func(ctx context.Context, workspaceID string)
// provisionTimeouts caches per-runtime provision-timeout values from
// template manifests (#2054 phase 2). Lazy-init on first scan; see
// runtime_provision_timeouts.go for the loader contract.
provisionTimeouts runtimeProvisionTimeoutsCache
}
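The EventEmitter definition itself isn't part of this diff. Inferred from the two methods the test stub implements below, it would look roughly like this sketch; parameter names are assumptions, and the authoritative interface plus the compile-time assertion live in internal/events/broadcaster.go:
// Sketch, as it might appear inside package events (not the actual source):
type EventEmitter interface {
	RecordAndBroadcast(ctx context.Context, workspaceID, event string, data interface{}) error
	BroadcastOnly(workspaceID, event string, data interface{})
}

// Compile-time proof that the production broadcaster still satisfies it:
var _ EventEmitter = (*Broadcaster)(nil)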
func NewWorkspaceHandler(b *events.Broadcaster, p *provisioner.Provisioner, platformURL, configsDir string) *WorkspaceHandler {
func NewWorkspaceHandler(b events.EventEmitter, p *provisioner.Provisioner, platformURL, configsDir string) *WorkspaceHandler {
return &WorkspaceHandler{
broadcaster: b,
provisioner: p,
@ -410,6 +420,17 @@ func (h *WorkspaceHandler) Create(c *gin.Context) {
})
}
// addProvisionTimeoutMs decorates a workspace response map with the
// per-runtime provision-timeout override (#2054 phase 2) when one is
// declared in the runtime's template manifest. No-op when the runtime
// has no declared timeout — the canvas-side resolver falls through to
// its runtime-profile default.
func (h *WorkspaceHandler) addProvisionTimeoutMs(ws map[string]interface{}, runtime string) {
if secs := h.provisionTimeouts.get(h.configsDir, runtime); secs > 0 {
ws["provision_timeout_ms"] = secs * 1000
}
}
// scanWorkspaceRow is a helper to scan workspace+layout rows into a clean JSON map.
func scanWorkspaceRow(rows interface {
Scan(dest ...interface{}) error
@ -508,6 +529,13 @@ func (h *WorkspaceHandler) List(c *gin.Context) {
log.Printf("List scan error: %v", err)
continue
}
// #2054 phase 2: surface per-runtime provision-timeout for
// canvas's ProvisioningTimeout banner. Decorating per-row
// (vs map-once-and-reuse) keeps the helper self-contained;
// the cache hit is sub-microsecond.
if rt, _ := ws["runtime"].(string); rt != "" {
h.addProvisionTimeoutMs(ws, rt)
}
workspaces = append(workspaces, ws)
}
if err := rows.Err(); err != nil {
@ -575,5 +603,11 @@ func (h *WorkspaceHandler) Get(c *gin.Context) {
ws["last_outbound_at"] = nil
}
// #2054 phase 2: per-runtime provision-timeout for canvas's
// ProvisioningTimeout banner.
if rt, _ := ws["runtime"].(string); rt != "" {
h.addProvisionTimeoutMs(ws, rt)
}
c.JSON(http.StatusOK, ws)
}

View File

@ -9,7 +9,6 @@ import (
"testing"
"github.com/DATA-DOG/go-sqlmock"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/events"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/models"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/plugins"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/provisioner"
@ -1044,13 +1043,20 @@ var errInternalDB = fmt.Errorf("pq: connection refused")
var errInternalOS = fmt.Errorf("operation failed: no such file or directory")
// captureBroadcaster is a test broadcaster that captures the last data
// payload passed to RecordAndBroadcast so tests can inspect it.
// payload passed to RecordAndBroadcast so tests can inspect it. Now
// satisfies events.EventEmitter (#1814) directly — RecordAndBroadcast
// captures, BroadcastOnly is a no-op since none of the
// WorkspaceHandler paths under test call it.
type captureBroadcaster struct {
events.Broadcaster // embed to satisfy the interface — only RecordAndBroadcast is overridden
lastData map[string]interface{}
lastErr error
}
// BroadcastOnly is required to satisfy events.EventEmitter. None of the
// captureBroadcaster's exercising tests should land here — if a future
// test does, it'll need to add capture state for that channel.
func (c *captureBroadcaster) BroadcastOnly(_ string, _ string, _ interface{}) {}
func (c *captureBroadcaster) RecordAndBroadcast(_ context.Context, _, _ string, data interface{}) error {
if m, ok := data.(map[string]interface{}); ok {
// Shallow-copy so the caller can't mutate our capture.
@ -1107,14 +1113,14 @@ func containsUnsafeString(v interface{}) bool {
// never leaks internal error details in WORKSPACE_PROVISION_FAILED broadcasts.
// Regression test for issue #1206.
func TestProvisionWorkspace_NoInternalErrorsInBroadcast(t *testing.T) {
t.Skip("TODO: captureBroadcaster type mismatch with WorkspaceHandler.broadcaster (*events.Broadcaster). Needs broadcaster interface refactor — currently blocking package compile on main (2026-04-21).")
t.Skip("TODO: write the test body. The interface blocker (#1814) is fixed — captureBroadcaster now satisfies events.EventEmitter and can be passed to NewWorkspaceHandler. The remaining work is constructing the provisionWorkspace failure path + asserting captured payload doesn't contain unsafeErrorStrings.")
}
// TestProvisionWorkspaceCP_NoInternalErrorsInBroadcast asserts that
// provisionWorkspaceCP never leaks err.Error() in WORKSPACE_PROVISION_FAILED
// broadcasts. Regression test for issue #1206.
func TestProvisionWorkspaceCP_NoInternalErrorsInBroadcast(t *testing.T) {
t.Skip("TODO: captureBroadcaster type mismatch with WorkspaceHandler.broadcaster (*events.Broadcaster). Needs broadcaster interface refactor — currently blocking package compile on main (2026-04-21).")
t.Skip("TODO: write the test body. Same status as TestProvisionWorkspace_NoInternalErrorsInBroadcast — interface blocker fixed (#1814), need to construct the provisionWorkspaceCP failure path + assert payload sanitization.")
}
// mockEnvMutator is a provisionhook.Registry stub that always returns a fixed error.

View File

@ -0,0 +1,296 @@
package middleware
// Coverage tests for the CanvasOrBearer middleware and IsSameOriginCanvas
// per issue #1818. The existing wsauth_middleware_test.go covered the
// fail-open and Origin paths but missed the bearer-validate, admin-secret,
// same-origin-canvas, and IsSameOriginCanvas wrapper branches — leaving
// CanvasOrBearer at 50% and IsSameOriginCanvas at 0% line coverage.
//
// These tests target the gaps without re-asserting behavior already
// pinned by the existing suite.
import (
"crypto/sha256"
"net/http"
"net/http/httptest"
"testing"
"github.com/DATA-DOG/go-sqlmock"
"github.com/gin-gonic/gin"
)
// ── CanvasOrBearer: bearer-token branches (#1818) ───────────────────────────
// TestCanvasOrBearer_ValidBearer_Passes exercises path 1 (the success
// branch of the bearer-token validation block). Without this test the
// only "OK" path covered was the Origin allowlist match — bearer
// success was never asserted.
func TestCanvasOrBearer_ValidBearer_Passes(t *testing.T) {
mockDB, mock, err := sqlmock.New()
if err != nil {
t.Fatalf("sqlmock: %v", err)
}
defer mockDB.Close()
bearer := "valid-bearer-for-canvas-or-bearer"
hash := sha256.Sum256([]byte(bearer))
mock.ExpectQuery(hasAnyLiveTokenGlobalQuery).
WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(1))
mock.ExpectQuery(validateAnyTokenSelectQuery).
WithArgs(hash[:]).
WillReturnRows(sqlmock.NewRows([]string{"id"}).AddRow("tok-1"))
mock.ExpectExec(validateTokenUpdateQuery).
WithArgs("tok-1").
WillReturnResult(sqlmock.NewResult(0, 1))
r := gin.New()
r.PUT("/canvas/viewport", CanvasOrBearer(mockDB), func(c *gin.Context) {
c.JSON(http.StatusOK, gin.H{"ok": true})
})
w := httptest.NewRecorder()
req, _ := http.NewRequest(http.MethodPut, "/canvas/viewport", nil)
req.Header.Set("Authorization", "Bearer "+bearer)
r.ServeHTTP(w, req)
if w.Code != http.StatusOK {
t.Errorf("valid bearer: got %d, want 200 (%s)", w.Code, w.Body.String())
}
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("unmet sqlmock: %v", err)
}
}
// TestCanvasOrBearer_InvalidBearer_Returns401 covers the rejection
// branch when a bearer is supplied but ValidateAnyToken fails. This
// is the auth-escape case the issue called out: an attacker with a
// revoked/expired token + matching Origin previously could bypass
// auth, so the code MUST reject on bad bearer instead of falling
// through to Origin.
func TestCanvasOrBearer_InvalidBearer_Returns401(t *testing.T) {
mockDB, mock, err := sqlmock.New()
if err != nil {
t.Fatalf("sqlmock: %v", err)
}
defer mockDB.Close()
bad := "expired-or-revoked-token"
hash := sha256.Sum256([]byte(bad))
mock.ExpectQuery(hasAnyLiveTokenGlobalQuery).
WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(1))
mock.ExpectQuery(validateAnyTokenSelectQuery).
WithArgs(hash[:]).
WillReturnRows(sqlmock.NewRows([]string{"id"})) // empty → ErrNoRows
// Origin would otherwise grant access — assert it doesn't here.
t.Setenv("CORS_ORIGINS", "https://acme.moleculesai.app")
handlerCalled := false
r := gin.New()
r.PUT("/canvas/viewport", CanvasOrBearer(mockDB), func(c *gin.Context) {
handlerCalled = true
c.JSON(http.StatusOK, gin.H{"ok": true})
})
w := httptest.NewRecorder()
req, _ := http.NewRequest(http.MethodPut, "/canvas/viewport", nil)
req.Header.Set("Authorization", "Bearer "+bad)
req.Header.Set("Origin", "https://acme.moleculesai.app")
r.ServeHTTP(w, req)
if w.Code != http.StatusUnauthorized {
t.Errorf("invalid bearer with matching origin: got %d, want 401", w.Code)
}
if handlerCalled {
t.Error("invalid bearer leaked to handler — Origin fallback bypassed bearer validation")
}
}
// TestCanvasOrBearer_AdminTokenEnv_Passes exercises the ADMIN_TOKEN
// constant-time-compare branch. The env-secret short-circuit is what
// keeps the canvas dashboard usable when no DB-backed token rows are
// provisioned yet (Hyperion bootstrap path).
func TestCanvasOrBearer_AdminTokenEnv_Passes(t *testing.T) {
mockDB, mock, err := sqlmock.New()
if err != nil {
t.Fatalf("sqlmock: %v", err)
}
defer mockDB.Close()
mock.ExpectQuery(hasAnyLiveTokenGlobalQuery).
WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(1))
// No ValidateAnyToken expectation — env match short-circuits before that.
t.Setenv("ADMIN_TOKEN", "platform-admin-secret")
r := gin.New()
r.PUT("/canvas/viewport", CanvasOrBearer(mockDB), func(c *gin.Context) {
c.JSON(http.StatusOK, gin.H{"ok": true})
})
w := httptest.NewRecorder()
req, _ := http.NewRequest(http.MethodPut, "/canvas/viewport", nil)
req.Header.Set("Authorization", "Bearer platform-admin-secret")
r.ServeHTTP(w, req)
if w.Code != http.StatusOK {
t.Errorf("admin env match: got %d, want 200 (%s)", w.Code, w.Body.String())
}
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("unmet sqlmock: %v", err)
}
}
// TestCanvasOrBearer_DBError_FailOpen pins the documented behavior on a
// HasAnyLiveTokenGlobal failure. The middleware logs and falls open so a
// flaky DB doesn't lock canvas users out of cosmetic routes. That choice
// is spelled out in the middleware's comment block; this test is the
// reminder that trips if anyone changes the semantic.
func TestCanvasOrBearer_DBError_FailOpen(t *testing.T) {
mockDB, mock, err := sqlmock.New()
if err != nil {
t.Fatalf("sqlmock: %v", err)
}
defer mockDB.Close()
mock.ExpectQuery(hasAnyLiveTokenGlobalQuery).
WillReturnError(http.ErrAbortHandler) // any non-nil error suffices
r := gin.New()
r.PUT("/canvas/viewport", CanvasOrBearer(mockDB), func(c *gin.Context) {
c.JSON(http.StatusOK, gin.H{"ok": true})
})
w := httptest.NewRecorder()
req, _ := http.NewRequest(http.MethodPut, "/canvas/viewport", nil)
r.ServeHTTP(w, req)
if w.Code != http.StatusOK {
t.Errorf("DB error fail-open: got %d, want 200 (%s)", w.Code, w.Body.String())
}
}
// TestCanvasOrBearer_SameOriginCanvas_Passes exercises path 3: when no
// CORS Origin matches but the Referer/Host pair indicates the canvas is
// served from the same combined-tenant image. CANVAS_PROXY_URL gates
// this branch — without flipping canvasProxyActive the path is dead.
func TestCanvasOrBearer_SameOriginCanvas_Passes(t *testing.T) {
prev := canvasProxyActive
canvasProxyActive = true
defer func() { canvasProxyActive = prev }()
mockDB, mock, err := sqlmock.New()
if err != nil {
t.Fatalf("sqlmock: %v", err)
}
defer mockDB.Close()
mock.ExpectQuery(hasAnyLiveTokenGlobalQuery).
WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(1))
r := gin.New()
r.PUT("/canvas/viewport", CanvasOrBearer(mockDB), func(c *gin.Context) {
c.JSON(http.StatusOK, gin.H{"ok": true})
})
w := httptest.NewRecorder()
req, _ := http.NewRequest(http.MethodPut, "/canvas/viewport", nil)
req.Host = "tenant.moleculesai.app"
req.Header.Set("Referer", "https://tenant.moleculesai.app/dashboard")
r.ServeHTTP(w, req)
if w.Code != http.StatusOK {
t.Errorf("same-origin canvas: got %d, want 200 (%s)", w.Code, w.Body.String())
}
}
// ── IsSameOriginCanvas: exported wrapper + branch coverage (#1818) ──────────
// TestIsSameOriginCanvas_ExportedWrapper_DelegatesToInternal — IsSame...
// is at 0% per the audit because nothing in the test suite calls the
// exported variant. It's a one-line delegation but the auth-boundary
// surface must stay testable from outside the package.
func TestIsSameOriginCanvas_ExportedWrapper_DelegatesToInternal(t *testing.T) {
prev := canvasProxyActive
canvasProxyActive = true
defer func() { canvasProxyActive = prev }()
c, _ := gin.CreateTestContext(httptest.NewRecorder())
c.Request = httptest.NewRequest(http.MethodGet, "/x", nil)
c.Request.Host = "tenant.moleculesai.app"
c.Request.Header.Set("Referer", "https://tenant.moleculesai.app/x")
if !IsSameOriginCanvas(c) {
t.Error("exported IsSameOriginCanvas should accept matching Referer")
}
}
// TestIsSameOriginCanvas_DisabledByEnv — when CANVAS_PROXY_URL was unset
// at boot, canvasProxyActive is false and the wrapper must return false
// even on a perfect Referer match. Self-hosted / dev installs rely on
// this short-circuit.
func TestIsSameOriginCanvas_DisabledByEnv(t *testing.T) {
prev := canvasProxyActive
canvasProxyActive = false
defer func() { canvasProxyActive = prev }()
c, _ := gin.CreateTestContext(httptest.NewRecorder())
c.Request = httptest.NewRequest(http.MethodGet, "/x", nil)
c.Request.Host = "tenant.moleculesai.app"
c.Request.Header.Set("Referer", "https://tenant.moleculesai.app/x")
if IsSameOriginCanvas(c) {
t.Error("CANVAS_PROXY_URL unset → IsSameOriginCanvas must return false")
}
}
// TestIsSameOriginCanvas_BranchCoverage exercises every Referer/Origin
// combination isSameOriginCanvas accepts or rejects. Table-driven so
// adding a new edge case is one row.
func TestIsSameOriginCanvas_BranchCoverage(t *testing.T) {
prev := canvasProxyActive
canvasProxyActive = true
defer func() { canvasProxyActive = prev }()
cases := []struct {
name string
host string
referer string
origin string
want bool
}{
// Referer accepts:
{"https referer matches host with path", "h.example.com", "https://h.example.com/dash", "", true},
{"http referer matches host with path", "h.example.com", "http://h.example.com/dash", "", true},
{"https referer matches host root no path", "h.example.com", "https://h.example.com", "", true},
{"http referer matches host root no path", "h.example.com", "http://h.example.com", "", true},
// Origin fallback (no Referer, used by WS upgrade):
{"https origin only (no referer)", "h.example.com", "", "https://h.example.com", true},
{"http origin only (no referer)", "h.example.com", "", "http://h.example.com", true},
// Reject paths — the security-critical ones:
{"empty host short-circuits", "", "https://h.example.com/", "", false},
{"referer host suffix attack — h.example.com.evil.com", "h.example.com", "https://h.example.com.evil.com/", "", false},
{"referer different host", "h.example.com", "https://other.example.com/", "", false},
{"origin different host", "h.example.com", "", "https://other.example.com", false},
{"no referer no origin", "h.example.com", "", "", false},
}
for _, c := range cases {
t.Run(c.name, func(t *testing.T) {
ctx, _ := gin.CreateTestContext(httptest.NewRecorder())
ctx.Request = httptest.NewRequest(http.MethodGet, "/x", nil)
ctx.Request.Host = c.host
if c.referer != "" {
ctx.Request.Header.Set("Referer", c.referer)
}
if c.origin != "" {
ctx.Request.Header.Set("Origin", c.origin)
}
if got := isSameOriginCanvas(ctx); got != c.want {
t.Errorf("isSameOriginCanvas(host=%q, referer=%q, origin=%q) = %v, want %v",
c.host, c.referer, c.origin, got, c.want)
}
})
}
}
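For readers who want the accept/reject rule in one place, here is a sketch that satisfies every row of the table above. It is a restatement of the pinned behavior, not the package's actual implementation, and it assumes the strings package:
// Accept a Referer only when it is exactly scheme://host or begins with
// scheme://host/ (the slash boundary defeats the h.example.com.evil.com
// suffix attack). With no Referer, fall back to an exact-match Origin,
// which is what WebSocket upgrades send.
func sameOriginSketch(host, referer, origin string) bool {
	if host == "" {
		return false
	}
	for _, scheme := range []string{"https://", "http://"} {
		base := scheme + host
		if referer == base || strings.HasPrefix(referer, base+"/") {
			return true
		}
		if referer == "" && origin == base {
			return true
		}
	}
	return false
}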

View File

@ -38,6 +38,7 @@ package provisioner
import (
"context"
"errors"
"testing"
)
@ -138,23 +139,90 @@ func runBackendContract(t *testing.T, backend Backend) {
}
// TestDockerBackend_Contract feeds the Docker backend through the
// shared runner. Skipped pending hardening: the scaffold exposed a
// real bug — neither backend's Stop/IsRunning handles a nil underlying
// client gracefully (both panic). Filing that as a separate issue;
// once both backends return (*, error) instead of panicking, flip this
// to t.Run and the contract scenarios exercise the fix.
// shared runner. Was Skip'd pending hardening of Provisioner.{Stop,
// IsRunning} against nil Docker client — that hardening landed in
// fix/provisioner-nil-guards-1813, so the scenarios run now. A
// zero-valued *Provisioner returns ErrNoBackend from each method
// instead of panicking, satisfying the no-panic contract.
func TestDockerBackend_Contract(t *testing.T) {
t.Skip("scaffolding only — unblock by hardening Provisioner.{Stop,IsRunning} against nil Docker client; see docs/architecture/backends.md drift risk #6")
var p Provisioner
runBackendContract(t, &p)
}
// TestCPProvisionerBackend_Contract — same story as the Docker variant.
// CPProvisioner panics on nil httpClient today; once that's hardened,
// remove the Skip and this runner exercises both backends through a
// single contract.
// Hardening landed in fix/provisioner-nil-guards-1813; zero-valued
// *CPProvisioner returns ErrNoBackend from Stop/IsRunning instead of
// panicking on nil httpClient.
func TestCPProvisionerBackend_Contract(t *testing.T) {
t.Skip("scaffolding only — unblock by hardening CPProvisioner.{Stop,IsRunning} against nil httpClient; see docs/architecture/backends.md drift risk #6")
var p CPProvisioner
runBackendContract(t, &p)
}
// TestZeroValuedBackends_NoPanic pins the contract that motivated #1813:
// a Backend with no underlying client wired up must return cleanly,
// never panic. The orphan sweeper and shutdown paths both call these
// methods speculatively where the receiver could be unset.
//
// The exact error shape varies between backends:
// - Docker Provisioner has no DB-lookup layer; zero-valued always
// returns ErrNoBackend.
// - CPProvisioner threads through a package-level resolveInstanceID
// lookup; when the DB has no row for the workspace (or db.DB
// itself is nil), instance_id comes back empty and the method
// short-circuits to (false, nil). Only when there's a real
// instance_id to query does the missing-httpClient case surface
// as ErrNoBackend.
//
// Both shapes are acceptable — the contract is "no panic, error is
// nil or ErrNoBackend." Anything else is a bug.
func TestZeroValuedBackends_NoPanic(t *testing.T) {
acceptableErr := func(err error) bool {
return err == nil || errors.Is(err, ErrNoBackend)
}
t.Run("Provisioner.Stop", func(t *testing.T) {
var p Provisioner
if err := p.Stop(context.Background(), "any"); !errors.Is(err, ErrNoBackend) {
t.Errorf("zero-valued Provisioner.Stop: got %v, want ErrNoBackend", err)
}
})
t.Run("Provisioner.IsRunning", func(t *testing.T) {
var p Provisioner
running, err := p.IsRunning(context.Background(), "any")
if !errors.Is(err, ErrNoBackend) {
t.Errorf("zero-valued Provisioner.IsRunning: got err=%v, want ErrNoBackend", err)
}
if running {
t.Errorf("zero-valued Provisioner.IsRunning: got running=true, want false")
}
})
t.Run("CPProvisioner.Stop", func(t *testing.T) {
var p CPProvisioner
if err := p.Stop(context.Background(), "any"); !acceptableErr(err) {
t.Errorf("zero-valued CPProvisioner.Stop: got %v, want nil or ErrNoBackend", err)
}
})
t.Run("CPProvisioner.IsRunning", func(t *testing.T) {
var p CPProvisioner
_, err := p.IsRunning(context.Background(), "any")
if !acceptableErr(err) {
t.Errorf("zero-valued CPProvisioner.IsRunning: got err=%v, want nil or ErrNoBackend", err)
}
})
// Nil receivers must also be safe. The orphan sweeper code path can
// hit Stop on a *Provisioner that's nil (not just zero-valued) when
// the wiring path hasn't initialized at all. For nil receivers we
// always expect ErrNoBackend (the nil-check at the top fires before
// any DB lookup could absorb the case).
t.Run("nil_Provisioner.Stop", func(t *testing.T) {
var p *Provisioner
if err := p.Stop(context.Background(), "any"); !errors.Is(err, ErrNoBackend) {
t.Errorf("nil *Provisioner.Stop: got %v, want ErrNoBackend", err)
}
})
t.Run("nil_CPProvisioner.Stop", func(t *testing.T) {
var p *CPProvisioner
if err := p.Stop(context.Background(), "any"); !errors.Is(err, ErrNoBackend) {
t.Errorf("nil *CPProvisioner.Stop: got %v, want ErrNoBackend", err)
}
})
}

View File

@ -194,6 +194,9 @@ func (p *CPProvisioner) Start(ctx context.Context, cfg WorkspaceConfig) (string,
// blocking the next provision with InvalidGroup.Duplicate — a full
// "Save & Restart" crash on SaaS.
func (p *CPProvisioner) Stop(ctx context.Context, workspaceID string) error {
if p == nil {
return ErrNoBackend
}
instanceID, err := resolveInstanceID(ctx, workspaceID)
if err != nil {
return fmt.Errorf("cp provisioner: stop: resolve instance_id: %w", err)
@ -201,9 +204,17 @@ func (p *CPProvisioner) Stop(ctx context.Context, workspaceID string) error {
if instanceID == "" {
// No instance was ever provisioned (or already deprovisioned and
// the column was cleared). Nothing to terminate — idempotent.
// Reached even when httpClient is nil since the empty-instance
// path doesn't need HTTP — symmetric with IsRunning.
log.Printf("CP provisioner: Stop for %s — no instance_id on file, nothing to do", workspaceID)
return nil
}
if p.httpClient == nil {
// HTTP wiring missing but we have an instance_id to terminate —
// can't make the DELETE call. Report ErrNoBackend so the
// orphan sweeper / shutdown path can branch.
return ErrNoBackend
}
url := fmt.Sprintf("%s/cp/workspaces/%s?instance_id=%s", p.baseURL, workspaceID, instanceID)
req, _ := http.NewRequestWithContext(ctx, "DELETE", url, nil)
p.provisionAuthHeaders(req)
@ -269,6 +280,9 @@ var resolveInstanceID = func(ctx context.Context, workspaceID string) (string, e
// Both callers are happy with (true, err); callers that need the
// previous (false, err) shape must inspect err themselves.
func (p *CPProvisioner) IsRunning(ctx context.Context, workspaceID string) (bool, error) {
if p == nil {
return false, ErrNoBackend
}
instanceID, err := resolveInstanceID(ctx, workspaceID)
if err != nil {
// Treat DB errors the same as transport errors — (true, err) keeps
@ -277,9 +291,20 @@ func (p *CPProvisioner) IsRunning(ctx context.Context, workspaceID string) (bool
}
if instanceID == "" {
// No instance recorded. Report "not running" cleanly (no error)
// so restart cascades can trigger a fresh provision.
// so restart cascades can trigger a fresh provision. This path
// is reached even on a zero-valued provisioner (no httpClient
// wired) — that's intentional; the resolveInstanceID lookup
// goes through the package-level db var, not p.httpClient, so
// a no-instance workspace gets a clean answer regardless of
// HTTP wiring state.
return false, nil
}
if p.httpClient == nil {
// HTTP wiring missing but we have an instance_id to query —
// can't proceed without a client. Report ErrNoBackend so the
// caller can branch (a2a_proxy keeps alive, healthsweep skips).
return false, ErrNoBackend
}
url := fmt.Sprintf("%s/cp/workspaces/%s/status?instance_id=%s", p.baseURL, workspaceID, instanceID)
req, _ := http.NewRequestWithContext(ctx, "GET", url, nil)
p.provisionAuthHeaders(req)

View File

@ -5,6 +5,7 @@ import (
"archive/tar"
"bytes"
"context"
"errors"
"fmt"
"io"
"log"
@ -24,6 +25,15 @@ import (
ocispec "github.com/opencontainers/image-spec/specs-go/v1"
)
// ErrNoBackend is returned by lifecycle methods (Stop, IsRunning) when
// the receiver is zero-valued — i.e. no Docker daemon connection has
// been wired up. The orphan sweeper and the contract-test scaffolding
// both speculatively call these methods on a possibly-nil receiver;
// returning a typed error rather than panicking lets callers reason
// about the case explicitly. See docs/architecture/backends.md
// drift-risk #6.
var ErrNoBackend = errors.New("provisioner: no backend configured (zero-valued receiver)")
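A sketch of the caller-side branching this error exists to enable; the sweep flow here is illustrative and not the real orphan-sweeper code:
// Skip cleanly when no backend is wired up instead of letting a nil
// receiver panic take down the whole sweep loop.
func sweepOneSketch(ctx context.Context, p *Provisioner, workspaceID string) {
	running, err := p.IsRunning(ctx, workspaceID)
	switch {
	case errors.Is(err, ErrNoBackend):
		return // nothing to inspect in this deployment
	case err != nil:
		log.Printf("sweep %s: inspect: %v", workspaceID, err)
		return
	case !running:
		_ = p.Stop(ctx, workspaceID)
	}
}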
// RuntimeImages maps runtime names to their Docker image refs on GHCR.
// Each standalone template repo publishes its image via the reusable
// publish-template-image workflow in molecule-ci on every main merge.
@ -823,6 +833,9 @@ func (p *Provisioner) RemoveVolume(ctx context.Context, workspaceID string) erro
// respawn the container before ContainerRemove runs, leaving a zombie
// that re-registers via heartbeat after deletion.
func (p *Provisioner) Stop(ctx context.Context, workspaceID string) error {
if p == nil || p.cli == nil {
return ErrNoBackend
}
name := ContainerName(workspaceID)
// Force-remove kills and removes in one atomic operation, bypassing
@ -856,6 +869,9 @@ func (p *Provisioner) Stop(ctx context.Context, workspaceID string) error {
// so a real crash still surfaces via exec heartbeat or TTL, both of which
// have narrower false-positive windows than daemon-inspect RPC.
func (p *Provisioner) IsRunning(ctx context.Context, workspaceID string) (bool, error) {
if p == nil || p.cli == nil {
return false, ErrNoBackend
}
name := ContainerName(workspaceID)
info, err := p.cli.ContainerInspect(ctx, name)
if err != nil {
@ -1066,6 +1082,13 @@ func pullImageAndDrain(ctx context.Context, cli dockerImageClient, ref, platform
//
// Tracked in issue #1875; remove this fallback once the template repos
// publish multi-arch manifests.
// DefaultImagePlatform is the exported alias used by the admin
// workspace-images handler so its ImagePull picks the same platform as
// the provisioner's. Avoids duplicating the Apple-Silicon-needs-amd64
// logic and keeps both call sites in sync if Docker manifest support
// changes (e.g., when the templates start shipping multi-arch).
func DefaultImagePlatform() string { return defaultImagePlatform() }
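A hedged sketch of the admin-handler call this alias exists for. The exact Docker SDK option type depends on the vendored client version (types.ImagePullOptions in older SDKs, image.PullOptions in newer ones), so treat the call shape as an assumption:
// Pull with the same platform the provisioner would choose, then drain
// the stream so the pull completes before container recreation starts.
func refreshImageSketch(ctx context.Context, cli *client.Client, ref string) error {
	rc, err := cli.ImagePull(ctx, ref, types.ImagePullOptions{
		Platform: DefaultImagePlatform(),
	})
	if err != nil {
		return err
	}
	defer rc.Close()
	_, err = io.Copy(io.Discard, rc)
	return err
}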
func defaultImagePlatform() string {
if v, ok := os.LookupEnv("MOLECULE_IMAGE_PLATFORM"); ok {
return v

View File

@ -402,6 +402,17 @@ func Setup(hub *ws.Hub, broadcaster *events.Broadcaster, prov *provisioner.Provi
r.POST("/admin/a2a-queue/drop-stale", middleware.AdminAuth(db.DB), qH.DropStale)
}
// Admin — workspace template image refresh. Pulls latest images from GHCR
// and recreates running ws-* containers so they adopt the new image.
// Final step of the runtime CD chain — see docs/workspace-runtime-package.md.
// Operators (or post-publish automation) hit this after a runtime release.
// Reuses the provisioner's Docker client; no-op when prov is nil
// (test / non-Docker deploy).
if prov != nil {
imgH := handlers.NewAdminWorkspaceImagesHandler(prov.DockerClient())
r.POST("/admin/workspace-images/refresh", middleware.AdminAuth(db.DB), imgH.Refresh)
}
// Admin — test token minting (issue #6). Hidden in production via TestTokensEnabled().
// NOT behind AdminAuth — this is the bootstrap endpoint E2E tests and
// fresh installs use to obtain their first admin bearer. Adding AdminAuth

13
workspace/.coveragerc Normal file
View File

@ -0,0 +1,13 @@
# coverage.py config — consumed by `pytest --cov` via the pytest-cov
# plugin. Lives here (not in pytest.ini) because coverage.py only reads
# .coveragerc / setup.cfg / tox.ini / pyproject.toml — the [coverage:*]
# sections in pytest.ini are silently ignored. See issue #1817.
[run]
omit =
*/tests/*
*/__init__.py
plugins_registry/*
[report]
# Skip files at 100% in the term-missing output to keep CI logs readable.
skip_covered = True

View File

@ -422,7 +422,14 @@ class LangGraphA2AExecutor(AgentExecutor):
try:
msg.metadata = {"tool_trace": tool_trace}
except (AttributeError, TypeError):
pass
# In test paths where `new_agent_text_message()` is a
# MagicMock, the return value can be a plain string or
# a mock whose .metadata is a read-only property, so the
# assignment raises even though the attribute appears to
# exist. Suppression is intentional — production Message
# objects always accept the assignment. See #1787 +
# commit dcbcf19 for the original test-mock motivation.
logger.debug("metadata attach skipped (non-Message return from new_agent_text_message)")
await event_queue.enqueue_event(msg)
_result = final_text

View File

@ -3,4 +3,24 @@ testpaths = tests
python_files = test_*.py
python_functions = test_*
asyncio_mode = auto
addopts = -q
# Coverage config moved here from .github/workflows/ci.yml so local
# `pytest` matches CI without operator-typed flags. cov-fail-under
# pins the floor at 86% — 5pp below the 91.11% measured on staging
# (run 24957664272, 2026-04-26). Floor exists so a regression that
# drops coverage doesn't sneak past CI; tightening above 86% should
# follow real measurement, not aspiration. See issue #1817.
#
# Why 86 not 92: the earlier 97% measurement was without the
# .coveragerc omit list. Once `*/__init__.py`, `*/tests/*`, and
# `plugins_registry/*` are excluded (the issue's prescribed omit
# set, more meaningful since those files don't carry behavior),
# the actual measurement of behavior-bearing code is 91.11% — and
# 86% sits at the issue's prescribed `current - 5pp` margin.
addopts =
-q
--cov=.
--cov-report=term-missing
--cov-fail-under=86
# Coverage omit / report config lives in workspace/.coveragerc — coverage.py
# only reads .coveragerc / setup.cfg / tox.ini / pyproject.toml, NOT
# pytest.ini, so [coverage:*] sections here would be silently ignored.

View File

@ -110,18 +110,45 @@ _read_cache() {
}
# _write_cache — atomically persist token + expiry.
#
# Hardened per #1552:
# - umask 077 around the writes so .tmp files are 600 from creation,
# closing the TOCTOU window where a concurrent reader could read
# the token while it was still mode 644 (between the create-with-
# default-umask and the later chmod 600).
# - Don't swallow chmod errors with `|| true`. A chmod failure leaves
# tokens potentially world-readable; surface it as a WARN line so
# ops can grep `[molecule-git-token-helper] WARN` and see real
# permission failures instead of silent 644 files.
_write_cache() {
local token="$1"
mkdir -p "${CACHE_DIR}"
chmod 700 "${CACHE_DIR}" 2>/dev/null || true
if ! chmod 700 "${CACHE_DIR}" 2>/dev/null; then
echo "[molecule-git-token-helper] WARN: failed to chmod 700 ${CACHE_DIR} — cache dir may be world-readable" >&2
fi
now=$(_now_epoch)
expiry=$((now + TOKEN_CACHE_TTL_SEC))
# Restrictive umask so the .tmp files are 600 from creation. Restored
# before return so callers' umask isn't perturbed.
local prev_umask
prev_umask=$(umask)
umask 077
# Write atomically via tmp + mv to avoid partial reads.
printf '%s' "${token}" > "${CACHE_TOKEN_FILE}.tmp"
printf '%s' "${expiry}" > "${CACHE_EXPIRY_FILE}.tmp"
mv -f "${CACHE_TOKEN_FILE}.tmp" "${CACHE_TOKEN_FILE}"
mv -f "${CACHE_EXPIRY_FILE}.tmp" "${CACHE_EXPIRY_FILE}"
chmod 600 "${CACHE_TOKEN_FILE}" "${CACHE_EXPIRY_FILE}" 2>/dev/null || true
umask "${prev_umask}"
# Belt-and-suspenders chmod — umask 077 should make the files 600
# already, but a chmod that fails on the post-rename file is itself
# a real signal worth surfacing.
if ! chmod 600 "${CACHE_TOKEN_FILE}" "${CACHE_EXPIRY_FILE}" 2>/dev/null; then
echo "[molecule-git-token-helper] WARN: chmod 600 failed on cache files — token may be world-readable" >&2
fi
}
# _fetch_token_from_api — hit the platform endpoint.
@ -230,10 +257,21 @@ case "${ACTION}" in
echo "[molecule-git-token-helper] _refresh_gh: gh auth login failed (non-fatal)" >&2
}
# Also update GH_TOKEN file for scripts that source it.
# Same #1552 hardening as _write_cache — umask 077 around the
# write so the .tmp file is 600 from creation, and surface a
# WARN on chmod failure instead of swallowing it.
gh_token_file="${HOME}/.gh_token"
# `local` is illegal here (top-level case branch, not a
# function); use a uniquely-named global and unset it
# afterwards instead.
_gh_prev_umask=$(umask)
umask 077
printf '%s' "${api_token}" > "${gh_token_file}.tmp"
mv -f "${gh_token_file}.tmp" "${gh_token_file}"
chmod 600 "${gh_token_file}" 2>/dev/null || true
umask "${_gh_prev_umask}"
unset _gh_prev_umask
if ! chmod 600 "${gh_token_file}" 2>/dev/null; then
echo "[molecule-git-token-helper] WARN: chmod 600 failed on ${gh_token_file} — token may be world-readable" >&2
fi
echo "[molecule-git-token-helper] _refresh_gh: token refreshed successfully" >&2
;;
_invalidate_cache)

View File

@ -2,10 +2,12 @@
import os
import pytest
import yaml
from config import (
A2AConfig,
ComplianceConfig,
DelegationConfig,
SandboxConfig,
WorkspaceConfig,
@ -244,3 +246,46 @@ def test_shared_context_from_yaml(tmp_path):
cfg = load_config(str(tmp_path))
assert cfg.shared_context == ["guidelines.md", "architecture.md"]
# ===== Compliance default lock (#2059) =====
#
# PR #2056 flipped ComplianceConfig.mode default from "" to "owasp_agentic"
# so every shipped template gets prompt-injection detection + PII redaction
# by default. These tests pin the new default at all four entry points so
# a silent revert (or a refactor that reintroduces the old no-op default)
# fails fast instead of shipping a workspace with compliance silently off.
def test_compliance_dataclass_default():
"""ComplianceConfig() — no args — must default to owasp_agentic + detect."""
cfg = ComplianceConfig()
assert cfg.mode == "owasp_agentic"
assert cfg.prompt_injection == "detect"
@pytest.mark.parametrize(
"yaml_payload, expected_mode",
[
# No `compliance:` key at all — full default path.
({}, "owasp_agentic"),
# Explicit empty block — exercises load_config's
# `.get("mode", "owasp_agentic")` default-fill at config.py:377.
# Common shape during template editing.
({"compliance": {}}, "owasp_agentic"),
# Documented opt-out: explicit `mode: ""` disables compliance.
({"compliance": {"mode": ""}}, ""),
],
ids=["yaml_omits_block", "yaml_block_empty", "yaml_explicit_optout"],
)
def test_compliance_default_via_load_config(tmp_path, yaml_payload, expected_mode):
"""load_config honors the owasp_agentic default at every yaml shape and
still respects explicit opt-out."""
config_yaml = tmp_path / "config.yaml"
config_yaml.write_text(yaml.dump(yaml_payload))
cfg = load_config(str(tmp_path))
assert cfg.compliance.mode == expected_mode
# prompt_injection was never overridden in any payload — must stay at
# the dataclass default regardless of the mode value.
assert cfg.compliance.prompt_injection == "detect"