From ba78a5c00d5beb119cd40156d7d7cad95c1f74bf Mon Sep 17 00:00:00 2001 From: rabbitblood Date: Sun, 26 Apr 2026 00:22:30 -0700 Subject: [PATCH 01/34] test(ops): unit tests for sweep-cf-orphans decide() (#2027) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes #2027. The CF orphan sweep deletes DNS records — a misclassification could nuke a live workspace's tunnel. The decision function had MAX_DELETE_PCT percentage gating but no automated test of category → action mapping. Approach: extract the decide() function to scripts/ops/sweep_cf_decide.py as a verbatim copy bracketed by `# CANONICAL DECIDE BEGIN/END` markers. The shell script keeps its inline heredoc (so the operational path is untouched) but bracketed by the same markers. A parity test (TestParityWithBashScript) reads both files and asserts the bracketed blocks match line-for-line — drift fails CI loudly. Coverage (25 tests, 1 file, stdlib unittest only): - Rule 1 platform-core: apex, _vercel, _domainkey, www/api/app/doc/send/status/staging-api - Rule 3 ws-*: live (matches EC2 prefix) on prod + staging; orphan on prod + staging - Rule 4 e2e-*: live + orphan on staging; orphan on prod - Rule 2 generic tenant: live prod + staging; unknown subdomain kept-for-safety - Rule 5 fallthrough: external domain + unrelated apex - Rule priority: api.moleculesai.app stays platform-core (not tenant); _vercel stays verification - Safety gate: under/at/over default 50% threshold; zero-total no-divide; custom threshold - Empty live-sets: documents that decide() alone classifies as orphan, gate is the defense CI: new .github/workflows/test-ops-scripts.yml runs `python -m unittest discover` against scripts/ops/ on every PR/push that touches the directory. Lightweight — no requirements file, stdlib only. Local: `cd scripts/ops && python -m unittest test_sweep_cf_decide -v` → 25 tests, all OK. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/test-ops-scripts.yml | 30 +++ scripts/ops/sweep-cf-orphans.sh | 14 +- scripts/ops/sweep_cf_decide.py | 90 +++++++++ scripts/ops/test_sweep_cf_decide.py | 268 +++++++++++++++++++++++++ 4 files changed, 397 insertions(+), 5 deletions(-) create mode 100644 .github/workflows/test-ops-scripts.yml create mode 100644 scripts/ops/sweep_cf_decide.py create mode 100644 scripts/ops/test_sweep_cf_decide.py diff --git a/.github/workflows/test-ops-scripts.yml b/.github/workflows/test-ops-scripts.yml new file mode 100644 index 00000000..c377d9e9 --- /dev/null +++ b/.github/workflows/test-ops-scripts.yml @@ -0,0 +1,30 @@ +name: Ops Scripts Tests + +# Runs the unittest suite for scripts/ops/ on every PR + push that touches +# the directory. Kept separate from the main CI so a script-only change +# doesn't trigger the heavier Go/Canvas/Python pipelines. 
+ +on: + push: + branches: [main, staging] + paths: + - 'scripts/ops/**' + - '.github/workflows/test-ops-scripts.yml' + pull_request: + branches: [main, staging] + paths: + - 'scripts/ops/**' + - '.github/workflows/test-ops-scripts.yml' + +jobs: + test: + name: Ops scripts (unittest) + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: '3.11' + - name: Run unittest + working-directory: scripts/ops + run: python -m unittest discover -p 'test_*.py' -v diff --git a/scripts/ops/sweep-cf-orphans.sh b/scripts/ops/sweep-cf-orphans.sh index 5e757b79..cdd58f21 100755 --- a/scripts/ops/sweep-cf-orphans.sh +++ b/scripts/ops/sweep-cf-orphans.sh @@ -120,15 +120,18 @@ export PROD_SLUGS STAGING_SLUGS EC2_NAMES TOTAL_CF DECISIONS=$(echo "$CF_JSON" | python3 -c ' import json, os, re, sys d = json.load(sys.stdin) -prod = set(os.environ["PROD_SLUGS"].split()) -staging = set(os.environ["STAGING_SLUGS"].split()) -all_slugs = prod | staging +prod_slugs = set(os.environ["PROD_SLUGS"].split()) +staging_slugs = set(os.environ["STAGING_SLUGS"].split()) ec2_names = set(n for n in os.environ["EC2_NAMES"].split() if n) -def decide(r): +# CANONICAL DECIDE BEGIN +# Edits inside this block must mirror scripts/ops/sweep_cf_decide.py — the +# parity test in test_sweep_cf_decide.py asserts they match byte-for-byte. +def decide(r, prod_slugs, staging_slugs, ec2_names): n = r["name"] rid = r["id"] typ = r["type"] + all_slugs = prod_slugs | staging_slugs # Rule 1: platform core — leave alone if n == "moleculesai.app": @@ -170,9 +173,10 @@ def decide(r): return ("keep", "unknown-subdomain-kept-for-safety", rid, n, typ) return ("keep", "not-a-pattern-we-sweep", rid, n, typ) +# CANONICAL DECIDE END for r in d["result"]: - action, reason, rid, name, typ = decide(r) + action, reason, rid, name, typ = decide(r, prod_slugs, staging_slugs, ec2_names) print(json.dumps({"action": action, "reason": reason, "id": rid, "name": name, "type": typ})) ') diff --git a/scripts/ops/sweep_cf_decide.py b/scripts/ops/sweep_cf_decide.py new file mode 100644 index 00000000..2e36d7cb --- /dev/null +++ b/scripts/ops/sweep_cf_decide.py @@ -0,0 +1,90 @@ +"""Decision logic extracted from sweep-cf-orphans.sh for unit testing (#2027). + +The bash script embeds the same logic inline as a python heredoc — this +module is a verbatim copy used by test_sweep_cf_decide.py. The parity +test (TestParityWithBashScript) reads the bash script and asserts the +canonical block in this file is present byte-for-byte, so the two +cannot drift apart silently. + +If you change the rules: edit BOTH this file AND the inline block in +``scripts/ops/sweep-cf-orphans.sh`` (the canonical block runs from +``# CANONICAL DECIDE BEGIN`` to ``# CANONICAL DECIDE END`` markers in +both files; the parity check compares those slices). + +Inputs to ``decide(record, prod_slugs, staging_slugs, ec2_names)``: + record Cloudflare DNS record dict {name, id, type} + prod_slugs set of CP-prod org slugs (live tenants) + staging_slugs set of CP-staging org slugs + ec2_names set of live EC2 Name tags (e.g. ``ws-d3605ef2-f7d``) + +Returns ``(action, reason, id, name, type)`` matching the bash heredoc. 
+""" +from __future__ import annotations + +import re +from typing import Iterable + + +# CANONICAL DECIDE BEGIN +def decide(r, prod_slugs, staging_slugs, ec2_names): + n = r["name"] + rid = r["id"] + typ = r["type"] + all_slugs = prod_slugs | staging_slugs + + # Rule 1: platform core — leave alone + if n == "moleculesai.app": + return ("keep", "apex", rid, n, typ) + if n.startswith("_") or n.endswith("._domainkey.moleculesai.app"): + return ("keep", "verification/key", rid, n, typ) + if n in {"api.moleculesai.app","app.moleculesai.app","doc.moleculesai.app", + "send.moleculesai.app","status.moleculesai.app","www.moleculesai.app", + "staging-api.moleculesai.app"}: + return ("keep", "platform-core", rid, n, typ) + + # Rule 3: ws--.(staging.)moleculesai.app + m = re.match(r"^(ws-[a-f0-9]{8}-[a-f0-9]+)(?:\.staging)?\.moleculesai\.app$", n) + if m: + prefix = m.group(1) + # Live EC2 names are like "ws-d3605ef2-f7d" — same shape as DNS subdomain. + for ename in ec2_names: + if ename.startswith(prefix): + return ("keep", "live-ec2", rid, n, typ) + return ("delete", "orphan-ws", rid, n, typ) + + # Rule 4: e2e-* tenants (includes canary, canvas variants) + m = re.match(r"^(e2e-[^.]+)(?:\.staging)?\.moleculesai\.app$", n) + if m: + slug = m.group(1) + if slug in all_slugs: + return ("keep", "live-e2e-tenant", rid, n, typ) + return ("delete", "orphan-e2e-tenant", rid, n, typ) + + # Rule 2: any other tenant subdomain (slug.moleculesai.app or slug.staging.moleculesai.app) + m = re.match(r"^([a-z0-9][a-z0-9-]*)(?:\.staging)?\.moleculesai\.app$", n) + if m: + slug = m.group(1) + if slug in all_slugs: + return ("keep", "live-tenant", rid, n, typ) + # Only flag as orphan if name looks like a tenant (not a one-off like "hermes-final-*") + # To avoid false-positive nukes on ad-hoc records, we KEEP anything that + # does not match a known pattern. Orphan only for explicit tenant-shaped names. + return ("keep", "unknown-subdomain-kept-for-safety", rid, n, typ) + + return ("keep", "not-a-pattern-we-sweep", rid, n, typ) +# CANONICAL DECIDE END + + +def safety_gate(total: int, delete_count: int, max_delete_pct: int = 50) -> bool: + """Return True iff the sweep is safe to execute. + + Mirrors the shell-side gate in sweep-cf-orphans.sh: if the deletion + fraction exceeds ``max_delete_pct`` the sweep refuses to run. The + bash script computes the integer percentage as ``DELETE_COUNT*100/TOTAL`` + — keeping the same arithmetic here so a future "raise to 75%" tweak + needs to be made in only one place semantically. + """ + if total <= 0: + return True # nothing to delete; gate is trivially satisfied + pct = delete_count * 100 // total + return pct <= max_delete_pct diff --git a/scripts/ops/test_sweep_cf_decide.py b/scripts/ops/test_sweep_cf_decide.py new file mode 100644 index 00000000..240c88b3 --- /dev/null +++ b/scripts/ops/test_sweep_cf_decide.py @@ -0,0 +1,268 @@ +"""Tests for the sweep-cf-orphans.sh decision function (#2027). + +Run locally: ``python3 -m unittest scripts/ops/test_sweep_cf_decide.py -v`` + +Why this exists: the inline Python heredoc in sweep-cf-orphans.sh decides +which Cloudflare DNS records to delete. A misclassification could nuke a +live workspace's DNS record. These tests cover the rule priority order + +the safety gate, plus a parity check that asserts the inline block in the +shell script matches the importable module byte-for-byte (so the two +cannot drift silently). 
+""" +from __future__ import annotations + +import os +import unittest + +import sweep_cf_decide as M + + +# --- Fixtures --------------------------------------------------------------- + +PROD = {"acme", "globex", "initech"} +STAGING = {"e2e-test-runner", "soak", "playground"} +LIVE_EC2 = {"ws-d3605ef2-f7d", "ws-aaaaaaaa-bbb", "ws-cafef00d-dec"} + + +def rec(name: str, rid: str = "rid-x", typ: str = "A") -> dict: + return {"name": name, "id": rid, "type": typ} + + +def call(record: dict) -> tuple: + return M.decide(record, PROD, STAGING, LIVE_EC2) + + +# --- Rule 1: platform core -------------------------------------------------- + + +class TestPlatformCore(unittest.TestCase): + """Apex, www, api, app, _verification keys must NEVER be touched.""" + + def test_apex_kept(self): + action, reason, *_ = call(rec("moleculesai.app")) + self.assertEqual((action, reason), ("keep", "apex")) + + def test_underscore_records_kept(self): + # _vercel, _domainkey, _railway-verify, etc. + for n in ("_vercel.moleculesai.app", "_railway-verify.moleculesai.app"): + action, reason, *_ = call(rec(n)) + self.assertEqual((action, reason), ("keep", "verification/key"), n) + + def test_dkim_kept(self): + action, reason, *_ = call(rec("send._domainkey.moleculesai.app")) + self.assertEqual((action, reason), ("keep", "verification/key")) + + def test_platform_subdomains_kept(self): + for n in ( + "api.moleculesai.app", + "app.moleculesai.app", + "doc.moleculesai.app", + "send.moleculesai.app", + "status.moleculesai.app", + "www.moleculesai.app", + "staging-api.moleculesai.app", + ): + action, reason, *_ = call(rec(n)) + self.assertEqual((action, reason), ("keep", "platform-core"), n) + + +# --- Rule 3: ws-- ----------------------------------------------- + + +class TestWsRule(unittest.TestCase): + """ws-* DNS records keep iff a live EC2 with the same prefix exists.""" + + def test_live_ws_kept(self): + action, reason, *_ = call(rec("ws-d3605ef2-f7d.moleculesai.app")) + self.assertEqual((action, reason), ("keep", "live-ec2")) + + def test_live_ws_kept_on_staging(self): + action, reason, *_ = call(rec("ws-aaaaaaaa-bbb.staging.moleculesai.app")) + self.assertEqual((action, reason), ("keep", "live-ec2")) + + def test_dead_ws_deleted(self): + action, reason, *_ = call(rec("ws-deadbeef-fff.moleculesai.app")) + self.assertEqual((action, reason), ("delete", "orphan-ws")) + + def test_dead_ws_on_staging_deleted(self): + action, reason, *_ = call(rec("ws-deadbeef-fff.staging.moleculesai.app")) + self.assertEqual((action, reason), ("delete", "orphan-ws")) + + +# --- Rule 4: e2e-* tenants -------------------------------------------------- + + +class TestE2ERule(unittest.TestCase): + def test_live_e2e_kept(self): + action, reason, *_ = call(rec("e2e-test-runner.staging.moleculesai.app")) + self.assertEqual((action, reason), ("keep", "live-e2e-tenant")) + + def test_dead_e2e_deleted(self): + action, reason, *_ = call(rec("e2e-ghost-1234.staging.moleculesai.app")) + self.assertEqual((action, reason), ("delete", "orphan-e2e-tenant")) + + def test_dead_e2e_on_prod_deleted(self): + # e2e-* on prod (no .staging) is also tenant-shaped — deletion path. 
+ action, reason, *_ = call(rec("e2e-ghost.moleculesai.app")) + self.assertEqual((action, reason), ("delete", "orphan-e2e-tenant")) + + +# --- Rule 2: generic tenant subdomain --------------------------------------- + + +class TestTenantSubdomainRule(unittest.TestCase): + def test_live_prod_tenant_kept(self): + action, reason, *_ = call(rec("acme.moleculesai.app")) + self.assertEqual((action, reason), ("keep", "live-tenant")) + + def test_live_staging_tenant_kept(self): + action, reason, *_ = call(rec("soak.staging.moleculesai.app")) + self.assertEqual((action, reason), ("keep", "live-tenant")) + + def test_unknown_subdomain_kept_for_safety(self): + # The script intentionally KEEPS unknown patterns to avoid blast. + action, reason, *_ = call(rec("hermes-final-2.moleculesai.app")) + self.assertEqual((action, reason), ("keep", "unknown-subdomain-kept-for-safety")) + + +# --- Rule 5 / fallthrough --------------------------------------------------- + + +class TestNotASweepPattern(unittest.TestCase): + def test_external_domain_kept(self): + # Domain-spoofing attempt — must NOT match any of the moleculesai.app rules. + action, reason, *_ = call(rec("api.openai.com.evil.internal")) + self.assertEqual((action, reason), ("keep", "not-a-pattern-we-sweep")) + + def test_unrelated_apex_kept(self): + action, reason, *_ = call(rec("example.com")) + self.assertEqual((action, reason), ("keep", "not-a-pattern-we-sweep")) + + +# --- Rule priority ---------------------------------------------------------- + + +class TestRulePriority(unittest.TestCase): + """Rule 1 (platform-core) wins over later rules even if the name shape + overlaps — e.g. ``api.moleculesai.app`` matches Rule 2's tenant pattern + but must be classified as platform-core.""" + + def test_api_subdomain_classified_as_platform_not_tenant(self): + action, reason, *_ = call(rec("api.moleculesai.app")) + self.assertEqual(reason, "platform-core") + + def test_underscore_record_classified_before_tenant(self): + action, reason, *_ = call(rec("_vercel.moleculesai.app")) + self.assertEqual(reason, "verification/key") + + +# --- Safety gate ------------------------------------------------------------ + + +class TestSafetyGate(unittest.TestCase): + """The bash gate refuses to delete >MAX_DELETE_PCT (default 50%).""" + + def test_under_threshold_passes(self): + self.assertTrue(M.safety_gate(total=100, delete_count=49)) + self.assertTrue(M.safety_gate(total=100, delete_count=50)) + + def test_over_threshold_fails(self): + self.assertFalse(M.safety_gate(total=100, delete_count=51)) + self.assertFalse(M.safety_gate(total=10, delete_count=10)) + + def test_zero_total_passes_trivially(self): + # No records → nothing to delete → gate trivially OK (no div-by-zero). + self.assertTrue(M.safety_gate(total=0, delete_count=0)) + + def test_custom_threshold(self): + self.assertTrue(M.safety_gate(total=100, delete_count=70, max_delete_pct=75)) + self.assertFalse(M.safety_gate(total=100, delete_count=76, max_delete_pct=75)) + + +# --- Empty live-sets behavior (incident-prevention) ------------------------- + + +class TestEmptyLiveSets(unittest.TestCase): + """If the CP admin API returns no orgs (auth broken, network blip), + every tenant-shaped record looks orphan. The decide function alone + has no defense — that's the safety_gate's job. 
This test pins the + expected behavior so the safety-gate contract is documented.""" + + def test_dead_e2e_orphans_when_live_set_empty(self): + empty = set() + action, reason, *_ = M.decide( + rec("e2e-test-runner.staging.moleculesai.app"), + empty, empty, set(), + ) + # decide() classifies as orphan — gate is the line of defense. + self.assertEqual((action, reason), ("delete", "orphan-e2e-tenant")) + + def test_live_ws_still_kept_when_ec2_set_empty(self): + # Symmetric: ws-* without matching EC2 = orphan. + action, reason, *_ = M.decide( + rec("ws-cafef00d-dec.moleculesai.app"), + PROD, STAGING, set(), + ) + self.assertEqual((action, reason), ("delete", "orphan-ws")) + + +# --- Parity check ----------------------------------------------------------- + + +class TestParityWithBashScript(unittest.TestCase): + """The decision logic exists in two places: the canonical block in + sweep_cf_decide.py and the inline heredoc in sweep-cf-orphans.sh. + This test asserts the two byte-for-byte match between the + ``# CANONICAL DECIDE BEGIN`` / ``# CANONICAL DECIDE END`` markers, + so an edit to one without the other fails CI loudly.""" + + @staticmethod + def _slice_canonical(text: str) -> str: + """Return the canonical block, line-anchored (mentions of the marker + words inside docstrings are ignored — only an exact-match line + ``# CANONICAL DECIDE BEGIN`` opens the slice).""" + lines = text.splitlines() + begin_idx = end_idx = None + for i, line in enumerate(lines): + stripped = line.strip() + if begin_idx is None and stripped == "# CANONICAL DECIDE BEGIN": + begin_idx = i + elif begin_idx is not None and stripped == "# CANONICAL DECIDE END": + end_idx = i + break + if begin_idx is None or end_idx is None: + raise AssertionError( + "missing CANONICAL DECIDE BEGIN/END markers — " + "first 30 lines were:\n" + "\n".join(lines[:30]) + ) + block = lines[begin_idx + 1:end_idx] + # Strip leading whitespace per-line so the .sh heredoc (no indent) + # and the .py module (also no indent at function scope) compare equal + # even if a future move into a class adds indent on one side. + return "\n".join(line.strip() for line in block if line.strip()) + + def test_blocks_match(self): + here = os.path.dirname(__file__) + with open(os.path.join(here, "sweep_cf_decide.py"), "r", encoding="utf-8") as f: + py_block = self._slice_canonical(f.read()) + with open(os.path.join(here, "sweep-cf-orphans.sh"), "r", encoding="utf-8") as f: + # Strip the bash-only marker comment line (the .py file doesn't + # carry the "Edits inside this block must mirror …" reminder). + sh_text = f.read().replace( + "# Edits inside this block must mirror scripts/ops/sweep_cf_decide.py — the\n", + "", + ).replace( + "# parity test in test_sweep_cf_decide.py asserts they match byte-for-byte.\n", + "", + ) + sh_block = self._slice_canonical(sh_text) + self.assertEqual( + py_block, + sh_block, + "CANONICAL DECIDE block has drifted between sweep_cf_decide.py " + "and sweep-cf-orphans.sh — re-sync them.", + ) + + +if __name__ == "__main__": + unittest.main() From 6494e9192b350f6c022f96fb816ae6de9fa0cc43 Mon Sep 17 00:00:00 2001 From: rabbitblood Date: Sun, 26 Apr 2026 00:28:15 -0700 Subject: [PATCH 02/34] refactor(ops): apply simplify findings on #2027 PR MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Code-quality + efficiency review of PR #2079: - Hoist all_slugs = prod_slugs | staging_slugs out of decide() into the caller (was rebuilt on every record — 1k records × ~50-slug union per call). 
decide() signature now (r, all_slugs, ec2_names). - Compile regexes at module scope (_WS_RE, _E2E_RE, _TENANT_RE) + hoist platform-core literal set (_PLATFORM_CORE_NAMES). Same change mirrored in the bash heredoc. - Drop decorative # Rule N: comments (numbering was out of order, 3 before 2 — actively confusing). - Move the "edits must mirror" reminder OUTSIDE the CANONICAL DECIDE block in the .sh file, eliminating the .replace() comment-skip hack in TestParityWithBashScript. - Drop per-line .strip() in _slice_canonical (would mask a real indentation bug; both blocks already at column 0). - subTest() in TestPlatformCore loops so a single failure no longer short-circuits the rest of the items. - merge_group + concurrency on test-ops-scripts.yml (parity with ci.yml gate behaviour). - Fix don't apostrophe in inline comment that closed the python heredoc's single-quote and broke bash -n. All 25 tests still pass. bash -n clean. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/test-ops-scripts.yml | 6 ++ scripts/ops/sweep-cf-orphans.sh | 41 ++++++----- scripts/ops/sweep_cf_decide.py | 56 ++++++++------- scripts/ops/test_sweep_cf_decide.py | 99 ++++++++------------------ 4 files changed, 86 insertions(+), 116 deletions(-) diff --git a/.github/workflows/test-ops-scripts.yml b/.github/workflows/test-ops-scripts.yml index c377d9e9..9a3a5fa3 100644 --- a/.github/workflows/test-ops-scripts.yml +++ b/.github/workflows/test-ops-scripts.yml @@ -15,6 +15,12 @@ on: paths: - 'scripts/ops/**' - '.github/workflows/test-ops-scripts.yml' + merge_group: + types: [checks_requested] + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true jobs: test: diff --git a/scripts/ops/sweep-cf-orphans.sh b/scripts/ops/sweep-cf-orphans.sh index cdd58f21..569bcbcf 100755 --- a/scripts/ops/sweep-cf-orphans.sh +++ b/scripts/ops/sweep-cf-orphans.sh @@ -117,66 +117,69 @@ log " CF records: $TOTAL_CF" # 5. Anything else → keep (we only sweep patterns we understand). export PROD_SLUGS STAGING_SLUGS EC2_NAMES TOTAL_CF +# Edits inside the CANONICAL DECIDE block below must mirror +# scripts/ops/sweep_cf_decide.py — the parity test in +# test_sweep_cf_decide.py asserts they match byte-for-byte. DECISIONS=$(echo "$CF_JSON" | python3 -c ' import json, os, re, sys d = json.load(sys.stdin) prod_slugs = set(os.environ["PROD_SLUGS"].split()) staging_slugs = set(os.environ["STAGING_SLUGS"].split()) +all_slugs = prod_slugs | staging_slugs ec2_names = set(n for n in os.environ["EC2_NAMES"].split() if n) +_PLATFORM_CORE_NAMES = { + "api.moleculesai.app", "app.moleculesai.app", "doc.moleculesai.app", + "send.moleculesai.app", "status.moleculesai.app", "www.moleculesai.app", + "staging-api.moleculesai.app", +} +_WS_RE = re.compile(r"^(ws-[a-f0-9]{8}-[a-f0-9]+)(?:\.staging)?\.moleculesai\.app$") +_E2E_RE = re.compile(r"^(e2e-[^.]+)(?:\.staging)?\.moleculesai\.app$") +_TENANT_RE = re.compile(r"^([a-z0-9][a-z0-9-]*)(?:\.staging)?\.moleculesai\.app$") + # CANONICAL DECIDE BEGIN -# Edits inside this block must mirror scripts/ops/sweep_cf_decide.py — the -# parity test in test_sweep_cf_decide.py asserts they match byte-for-byte. 
-def decide(r, prod_slugs, staging_slugs, ec2_names): +def decide(r, all_slugs, ec2_names): n = r["name"] rid = r["id"] typ = r["type"] - all_slugs = prod_slugs | staging_slugs - # Rule 1: platform core — leave alone if n == "moleculesai.app": return ("keep", "apex", rid, n, typ) if n.startswith("_") or n.endswith("._domainkey.moleculesai.app"): return ("keep", "verification/key", rid, n, typ) - if n in {"api.moleculesai.app","app.moleculesai.app","doc.moleculesai.app", - "send.moleculesai.app","status.moleculesai.app","www.moleculesai.app", - "staging-api.moleculesai.app"}: + if n in _PLATFORM_CORE_NAMES: return ("keep", "platform-core", rid, n, typ) - # Rule 3: ws--.(staging.)moleculesai.app - m = re.match(r"^(ws-[a-f0-9]{8}-[a-f0-9]+)(?:\.staging)?\.moleculesai\.app$", n) + m = _WS_RE.match(n) if m: prefix = m.group(1) - # Live EC2 names are like "ws-d3605ef2-f7d" — same shape as DNS subdomain. + # Live EC2 names share the ws-- shape with the DNS subdomain. for ename in ec2_names: if ename.startswith(prefix): return ("keep", "live-ec2", rid, n, typ) return ("delete", "orphan-ws", rid, n, typ) - # Rule 4: e2e-* tenants (includes canary, canvas variants) - m = re.match(r"^(e2e-[^.]+)(?:\.staging)?\.moleculesai\.app$", n) + m = _E2E_RE.match(n) if m: slug = m.group(1) if slug in all_slugs: return ("keep", "live-e2e-tenant", rid, n, typ) return ("delete", "orphan-e2e-tenant", rid, n, typ) - # Rule 2: any other tenant subdomain (slug.moleculesai.app or slug.staging.moleculesai.app) - m = re.match(r"^([a-z0-9][a-z0-9-]*)(?:\.staging)?\.moleculesai\.app$", n) + m = _TENANT_RE.match(n) if m: slug = m.group(1) if slug in all_slugs: return ("keep", "live-tenant", rid, n, typ) - # Only flag as orphan if name looks like a tenant (not a one-off like "hermes-final-*") - # To avoid false-positive nukes on ad-hoc records, we KEEP anything that - # does not match a known pattern. Orphan only for explicit tenant-shaped names. + # KEEP unknown tenant-shaped names — avoid false-positive nukes on + # ad-hoc records (e.g. hermes-final-*) that do not match a known slug. return ("keep", "unknown-subdomain-kept-for-safety", rid, n, typ) return ("keep", "not-a-pattern-we-sweep", rid, n, typ) # CANONICAL DECIDE END for r in d["result"]: - action, reason, rid, name, typ = decide(r, prod_slugs, staging_slugs, ec2_names) + action, reason, rid, name, typ = decide(r, all_slugs, ec2_names) print(json.dumps({"action": action, "reason": reason, "id": rid, "name": name, "type": typ})) ') diff --git a/scripts/ops/sweep_cf_decide.py b/scripts/ops/sweep_cf_decide.py index 2e36d7cb..23cfb2fd 100644 --- a/scripts/ops/sweep_cf_decide.py +++ b/scripts/ops/sweep_cf_decide.py @@ -11,10 +11,11 @@ If you change the rules: edit BOTH this file AND the inline block in ``# CANONICAL DECIDE BEGIN`` to ``# CANONICAL DECIDE END`` markers in both files; the parity check compares those slices). -Inputs to ``decide(record, prod_slugs, staging_slugs, ec2_names)``: +Inputs to ``decide(record, all_slugs, ec2_names)``: record Cloudflare DNS record dict {name, id, type} - prod_slugs set of CP-prod org slugs (live tenants) - staging_slugs set of CP-staging org slugs + all_slugs set of CP org slugs (prod ∪ staging) — caller computes the + union once instead of per-record (decide is hot-path: 100s + to 1000s of records per sweep) ec2_names set of live EC2 Name tags (e.g. ``ws-d3605ef2-f7d``) Returns ``(action, reason, id, name, type)`` matching the bash heredoc. 
@@ -22,53 +23,57 @@ Returns ``(action, reason, id, name, type)`` matching the bash heredoc. from __future__ import annotations import re -from typing import Iterable + + +# Pre-compile per-record regexes once at module load — saves the per-call +# pattern-cache lookup across 1000s of CF records per sweep. Mirrored at +# the same scope in sweep-cf-orphans.sh's heredoc. +_PLATFORM_CORE_NAMES = { + "api.moleculesai.app", "app.moleculesai.app", "doc.moleculesai.app", + "send.moleculesai.app", "status.moleculesai.app", "www.moleculesai.app", + "staging-api.moleculesai.app", +} +_WS_RE = re.compile(r"^(ws-[a-f0-9]{8}-[a-f0-9]+)(?:\.staging)?\.moleculesai\.app$") +_E2E_RE = re.compile(r"^(e2e-[^.]+)(?:\.staging)?\.moleculesai\.app$") +_TENANT_RE = re.compile(r"^([a-z0-9][a-z0-9-]*)(?:\.staging)?\.moleculesai\.app$") # CANONICAL DECIDE BEGIN -def decide(r, prod_slugs, staging_slugs, ec2_names): +def decide(r, all_slugs, ec2_names): n = r["name"] rid = r["id"] typ = r["type"] - all_slugs = prod_slugs | staging_slugs - # Rule 1: platform core — leave alone if n == "moleculesai.app": return ("keep", "apex", rid, n, typ) if n.startswith("_") or n.endswith("._domainkey.moleculesai.app"): return ("keep", "verification/key", rid, n, typ) - if n in {"api.moleculesai.app","app.moleculesai.app","doc.moleculesai.app", - "send.moleculesai.app","status.moleculesai.app","www.moleculesai.app", - "staging-api.moleculesai.app"}: + if n in _PLATFORM_CORE_NAMES: return ("keep", "platform-core", rid, n, typ) - # Rule 3: ws--.(staging.)moleculesai.app - m = re.match(r"^(ws-[a-f0-9]{8}-[a-f0-9]+)(?:\.staging)?\.moleculesai\.app$", n) + m = _WS_RE.match(n) if m: prefix = m.group(1) - # Live EC2 names are like "ws-d3605ef2-f7d" — same shape as DNS subdomain. + # Live EC2 names share the ws-- shape with the DNS subdomain. for ename in ec2_names: if ename.startswith(prefix): return ("keep", "live-ec2", rid, n, typ) return ("delete", "orphan-ws", rid, n, typ) - # Rule 4: e2e-* tenants (includes canary, canvas variants) - m = re.match(r"^(e2e-[^.]+)(?:\.staging)?\.moleculesai\.app$", n) + m = _E2E_RE.match(n) if m: slug = m.group(1) if slug in all_slugs: return ("keep", "live-e2e-tenant", rid, n, typ) return ("delete", "orphan-e2e-tenant", rid, n, typ) - # Rule 2: any other tenant subdomain (slug.moleculesai.app or slug.staging.moleculesai.app) - m = re.match(r"^([a-z0-9][a-z0-9-]*)(?:\.staging)?\.moleculesai\.app$", n) + m = _TENANT_RE.match(n) if m: slug = m.group(1) if slug in all_slugs: return ("keep", "live-tenant", rid, n, typ) - # Only flag as orphan if name looks like a tenant (not a one-off like "hermes-final-*") - # To avoid false-positive nukes on ad-hoc records, we KEEP anything that - # does not match a known pattern. Orphan only for explicit tenant-shaped names. + # KEEP unknown tenant-shaped names — avoid false-positive nukes on + # ad-hoc records (e.g. hermes-final-*) that do not match a known slug. return ("keep", "unknown-subdomain-kept-for-safety", rid, n, typ) return ("keep", "not-a-pattern-we-sweep", rid, n, typ) @@ -78,13 +83,12 @@ def decide(r, prod_slugs, staging_slugs, ec2_names): def safety_gate(total: int, delete_count: int, max_delete_pct: int = 50) -> bool: """Return True iff the sweep is safe to execute. - Mirrors the shell-side gate in sweep-cf-orphans.sh: if the deletion - fraction exceeds ``max_delete_pct`` the sweep refuses to run. 
The - bash script computes the integer percentage as ``DELETE_COUNT*100/TOTAL`` - — keeping the same arithmetic here so a future "raise to 75%" tweak - needs to be made in only one place semantically. + Mirrors the shell-side gate: if the deletion fraction exceeds + ``max_delete_pct`` the sweep refuses to run. Same integer arithmetic + as the bash script (``DELETE_COUNT*100/TOTAL``) so a future threshold + tweak only needs to land in one semantic place. """ if total <= 0: - return True # nothing to delete; gate is trivially satisfied + return True pct = delete_count * 100 // total return pct <= max_delete_pct diff --git a/scripts/ops/test_sweep_cf_decide.py b/scripts/ops/test_sweep_cf_decide.py index 240c88b3..930ba3c0 100644 --- a/scripts/ops/test_sweep_cf_decide.py +++ b/scripts/ops/test_sweep_cf_decide.py @@ -17,10 +17,8 @@ import unittest import sweep_cf_decide as M -# --- Fixtures --------------------------------------------------------------- - -PROD = {"acme", "globex", "initech"} -STAGING = {"e2e-test-runner", "soak", "playground"} +# Caller responsibility (per the new decide signature): compute the union once. +ALL_SLUGS = {"acme", "globex", "initech", "e2e-test-runner", "soak", "playground"} LIVE_EC2 = {"ws-d3605ef2-f7d", "ws-aaaaaaaa-bbb", "ws-cafef00d-dec"} @@ -29,10 +27,7 @@ def rec(name: str, rid: str = "rid-x", typ: str = "A") -> dict: def call(record: dict) -> tuple: - return M.decide(record, PROD, STAGING, LIVE_EC2) - - -# --- Rule 1: platform core -------------------------------------------------- + return M.decide(record, ALL_SLUGS, LIVE_EC2) class TestPlatformCore(unittest.TestCase): @@ -43,10 +38,10 @@ class TestPlatformCore(unittest.TestCase): self.assertEqual((action, reason), ("keep", "apex")) def test_underscore_records_kept(self): - # _vercel, _domainkey, _railway-verify, etc. for n in ("_vercel.moleculesai.app", "_railway-verify.moleculesai.app"): - action, reason, *_ = call(rec(n)) - self.assertEqual((action, reason), ("keep", "verification/key"), n) + with self.subTest(name=n): + action, reason, *_ = call(rec(n)) + self.assertEqual((action, reason), ("keep", "verification/key")) def test_dkim_kept(self): action, reason, *_ = call(rec("send._domainkey.moleculesai.app")) @@ -62,11 +57,9 @@ class TestPlatformCore(unittest.TestCase): "www.moleculesai.app", "staging-api.moleculesai.app", ): - action, reason, *_ = call(rec(n)) - self.assertEqual((action, reason), ("keep", "platform-core"), n) - - -# --- Rule 3: ws-- ----------------------------------------------- + with self.subTest(name=n): + action, reason, *_ = call(rec(n)) + self.assertEqual((action, reason), ("keep", "platform-core")) class TestWsRule(unittest.TestCase): @@ -89,9 +82,6 @@ class TestWsRule(unittest.TestCase): self.assertEqual((action, reason), ("delete", "orphan-ws")) -# --- Rule 4: e2e-* tenants -------------------------------------------------- - - class TestE2ERule(unittest.TestCase): def test_live_e2e_kept(self): action, reason, *_ = call(rec("e2e-test-runner.staging.moleculesai.app")) @@ -102,14 +92,10 @@ class TestE2ERule(unittest.TestCase): self.assertEqual((action, reason), ("delete", "orphan-e2e-tenant")) def test_dead_e2e_on_prod_deleted(self): - # e2e-* on prod (no .staging) is also tenant-shaped — deletion path. 
action, reason, *_ = call(rec("e2e-ghost.moleculesai.app")) self.assertEqual((action, reason), ("delete", "orphan-e2e-tenant")) -# --- Rule 2: generic tenant subdomain --------------------------------------- - - class TestTenantSubdomainRule(unittest.TestCase): def test_live_prod_tenant_kept(self): action, reason, *_ = call(rec("acme.moleculesai.app")) @@ -120,14 +106,10 @@ class TestTenantSubdomainRule(unittest.TestCase): self.assertEqual((action, reason), ("keep", "live-tenant")) def test_unknown_subdomain_kept_for_safety(self): - # The script intentionally KEEPS unknown patterns to avoid blast. action, reason, *_ = call(rec("hermes-final-2.moleculesai.app")) self.assertEqual((action, reason), ("keep", "unknown-subdomain-kept-for-safety")) -# --- Rule 5 / fallthrough --------------------------------------------------- - - class TestNotASweepPattern(unittest.TestCase): def test_external_domain_kept(self): # Domain-spoofing attempt — must NOT match any of the moleculesai.app rules. @@ -139,13 +121,10 @@ class TestNotASweepPattern(unittest.TestCase): self.assertEqual((action, reason), ("keep", "not-a-pattern-we-sweep")) -# --- Rule priority ---------------------------------------------------------- - - class TestRulePriority(unittest.TestCase): - """Rule 1 (platform-core) wins over later rules even if the name shape - overlaps — e.g. ``api.moleculesai.app`` matches Rule 2's tenant pattern - but must be classified as platform-core.""" + """Platform-core check must precede the tenant-subdomain regex — + e.g. ``api.moleculesai.app`` matches the tenant pattern but must + classify as platform-core.""" def test_api_subdomain_classified_as_platform_not_tenant(self): action, reason, *_ = call(rec("api.moleculesai.app")) @@ -156,9 +135,6 @@ class TestRulePriority(unittest.TestCase): self.assertEqual(reason, "verification/key") -# --- Safety gate ------------------------------------------------------------ - - class TestSafetyGate(unittest.TestCase): """The bash gate refuses to delete >MAX_DELETE_PCT (default 50%).""" @@ -179,48 +155,42 @@ class TestSafetyGate(unittest.TestCase): self.assertFalse(M.safety_gate(total=100, delete_count=76, max_delete_pct=75)) -# --- Empty live-sets behavior (incident-prevention) ------------------------- - - class TestEmptyLiveSets(unittest.TestCase): """If the CP admin API returns no orgs (auth broken, network blip), - every tenant-shaped record looks orphan. The decide function alone - has no defense — that's the safety_gate's job. This test pins the - expected behavior so the safety-gate contract is documented.""" + every tenant-shaped record looks orphan. decide() alone has no + defense — that's safety_gate's job. This test pins the contract so + a future "make decide() defensive" change doesn't silently bypass + the gate.""" def test_dead_e2e_orphans_when_live_set_empty(self): - empty = set() action, reason, *_ = M.decide( rec("e2e-test-runner.staging.moleculesai.app"), - empty, empty, set(), + set(), set(), ) - # decide() classifies as orphan — gate is the line of defense. self.assertEqual((action, reason), ("delete", "orphan-e2e-tenant")) def test_live_ws_still_kept_when_ec2_set_empty(self): - # Symmetric: ws-* without matching EC2 = orphan. 
action, reason, *_ = M.decide( rec("ws-cafef00d-dec.moleculesai.app"), - PROD, STAGING, set(), + ALL_SLUGS, set(), ) self.assertEqual((action, reason), ("delete", "orphan-ws")) -# --- Parity check ----------------------------------------------------------- - - class TestParityWithBashScript(unittest.TestCase): """The decision logic exists in two places: the canonical block in sweep_cf_decide.py and the inline heredoc in sweep-cf-orphans.sh. - This test asserts the two byte-for-byte match between the + This test asserts the two match between the ``# CANONICAL DECIDE BEGIN`` / ``# CANONICAL DECIDE END`` markers, - so an edit to one without the other fails CI loudly.""" + so an edit to one without the other fails CI loudly. The mirror- + reminder comment lives OUTSIDE the markers in the .sh file so we + don't need to special-case it here.""" @staticmethod - def _slice_canonical(text: str) -> str: - """Return the canonical block, line-anchored (mentions of the marker - words inside docstrings are ignored — only an exact-match line - ``# CANONICAL DECIDE BEGIN`` opens the slice).""" + def _slice_canonical(text: str) -> list[str]: + """Return the lines between the canonical markers, exclusive. + Markers are matched line-anchored (a stripped-line literal match) + so the docstring's prose mention is ignored.""" lines = text.splitlines() begin_idx = end_idx = None for i, line in enumerate(lines): @@ -235,27 +205,14 @@ class TestParityWithBashScript(unittest.TestCase): "missing CANONICAL DECIDE BEGIN/END markers — " "first 30 lines were:\n" + "\n".join(lines[:30]) ) - block = lines[begin_idx + 1:end_idx] - # Strip leading whitespace per-line so the .sh heredoc (no indent) - # and the .py module (also no indent at function scope) compare equal - # even if a future move into a class adds indent on one side. - return "\n".join(line.strip() for line in block if line.strip()) + return lines[begin_idx + 1:end_idx] def test_blocks_match(self): here = os.path.dirname(__file__) with open(os.path.join(here, "sweep_cf_decide.py"), "r", encoding="utf-8") as f: py_block = self._slice_canonical(f.read()) with open(os.path.join(here, "sweep-cf-orphans.sh"), "r", encoding="utf-8") as f: - # Strip the bash-only marker comment line (the .py file doesn't - # carry the "Edits inside this block must mirror …" reminder). - sh_text = f.read().replace( - "# Edits inside this block must mirror scripts/ops/sweep_cf_decide.py — the\n", - "", - ).replace( - "# parity test in test_sweep_cf_decide.py asserts they match byte-for-byte.\n", - "", - ) - sh_block = self._slice_canonical(sh_text) + sh_block = self._slice_canonical(f.read()) self.assertEqual( py_block, sh_block, From 5fe6397765515dd1d1f3911b98bde2b9662e788b Mon Sep 17 00:00:00 2001 From: rabbitblood Date: Sun, 26 Apr 2026 00:57:44 -0700 Subject: [PATCH 03/34] fix(discovery): apply ?q= filter to Peers list (#1038) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Peers handler at workspace-server/internal/handlers/discovery.go ignored the ?q= query param entirely — every caller got the full peer list regardless of what they searched for. The handler exposes peer identities + URLs, so leaking the unfiltered set on a "filtered" endpoint is an info-disclosure bug (CWE-862). Fix: read c.Query("q") and post-filter the in-memory peers slice by case-insensitive substring match against name OR role. 
Filtering is done in Go after the existing 3 SQL reads — keeps the SQL bytes identical to the no-filter path (no injection vector, no DB-driver collation surprises) at a small cost. The peer set is bounded by a single workspace's parent + children + siblings (typically <50 rows), so the in-memory pass is negligible. Empty / whitespace-only q is a no-op — preserves the no-filter allocation profile. Tests (6 new in discovery_test.go): - TestPeers_NoQ_ReturnsAll — regression baseline (3 peers, no filter) - TestPeers_Q_FiltersByName — q=alpha → ws-alpha only - TestPeers_Q_CaseInsensitive — q=ALPHA → ws-alpha (locks in ToLower) - TestPeers_Q_FiltersByRole — q=design → ws-beta (role-side match) - TestPeers_Q_NoMatches — empty array, JSON [] not null - TestPeers_Q_WhitespaceOnly — q=' ' treated as no-filter Helpers peersFilterFixture + runPeersWithQuery + peerNames keep each test scoped to the q-behaviour, not re-declaring SQL expectations. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../internal/handlers/discovery.go | 35 +++++ .../internal/handlers/discovery_test.go | 144 ++++++++++++++++++ 2 files changed, 179 insertions(+) diff --git a/workspace-server/internal/handlers/discovery.go b/workspace-server/internal/handlers/discovery.go index d8edf814..fbc2cc2a 100644 --- a/workspace-server/internal/handlers/discovery.go +++ b/workspace-server/internal/handlers/discovery.go @@ -177,6 +177,14 @@ func writeExternalWorkspaceURL(ctx context.Context, c *gin.Context, callerID, ta } // Peers handles GET /registry/:id/peers +// +// Optional ``?q=`` filters the result by case-insensitive +// substring match against ``name`` or ``role`` (#1038). Filtering is done +// in Go after the DB read — keeps the SQL identical to the no-filter path +// (no injection risk, no DB-driver collation surprises) at the cost of +// loading the unfiltered set first. Acceptable because the peer set is +// always bounded by the small fanout of a single workspace's parent + +// children + siblings (typically <50 rows). func (h *DiscoveryHandler) Peers(c *gin.Context) { workspaceID := c.Param("id") ctx := c.Request.Context() @@ -241,12 +249,39 @@ func (h *DiscoveryHandler) Peers(c *gin.Context) { peers = append(peers, parent...) } + peers = filterPeersByQuery(peers, c.Query("q")) + if peers == nil { peers = make([]map[string]interface{}, 0) } c.JSON(http.StatusOK, peers) } +// filterPeersByQuery returns the subset of peers whose name or role +// case-insensitively contains q. An empty/whitespace-only q is a no-op +// (returns the input slice unchanged) so the no-filter path stays free +// of allocations. Map values come from queryPeerMaps which always +// stringifies name + role, so the type asserts here cannot fail in +// practice — the comma-ok form belt-and-braces against future shape +// changes upstream. +func filterPeersByQuery(peers []map[string]interface{}, q string) []map[string]interface{} { + q = strings.TrimSpace(q) + if q == "" { + return peers + } + needle := strings.ToLower(q) + out := make([]map[string]interface{}, 0, len(peers)) + for _, p := range peers { + name, _ := p["name"].(string) + role, _ := p["role"].(string) + if strings.Contains(strings.ToLower(name), needle) || + strings.Contains(strings.ToLower(role), needle) { + out = append(out, p) + } + } + return out +} + // queryPeerMaps returns clean JSON-serializable maps instead of Workspace structs. func queryPeerMaps(query string, args ...interface{}) ([]map[string]interface{}, error) { rows, err := db.DB.Query(query, args...) 
diff --git a/workspace-server/internal/handlers/discovery_test.go b/workspace-server/internal/handlers/discovery_test.go index 7c90005e..a027723f 100644 --- a/workspace-server/internal/handlers/discovery_test.go +++ b/workspace-server/internal/handlers/discovery_test.go @@ -258,6 +258,150 @@ func TestPeers_RootWorkspace_NoPeers(t *testing.T) { } } +// ==================== Peers — ?q= filter (#1038) ==================== + +// peersFilterFixture mocks all 3 SQL reads with a known 3-peer set so each +// q-filter test can focus on the post-fetch substring-match behaviour +// without re-stating the full query expectations. +func peersFilterFixture(t *testing.T) (*DiscoveryHandler, sqlmock.Sqlmock) { + t.Helper() + mock := setupTestDB(t) + setupTestRedis(t) + + mock.ExpectQuery("SELECT parent_id FROM workspaces WHERE id ="). + WithArgs("ws-self"). + WillReturnRows(sqlmock.NewRows([]string{"parent_id"}).AddRow("ws-pm")) + + cols := []string{"id", "name", "role", "tier", "status", "agent_card", "url", "parent_id", "active_tasks"} + + mock.ExpectQuery("SELECT w.id, w.name.*WHERE w.parent_id = \\$1 AND w.id != \\$2"). + WithArgs("ws-pm", "ws-self"). + WillReturnRows(sqlmock.NewRows(cols). + AddRow("ws-alpha", "Alpha Researcher", "researcher", 1, "online", []byte("null"), "http://a", "ws-pm", 0). + AddRow("ws-beta", "Beta Designer", "designer", 1, "online", []byte("null"), "http://b", "ws-pm", 0)) + + mock.ExpectQuery("SELECT w.id, w.name.*WHERE w.parent_id = \\$1 AND w.status"). + WithArgs("ws-self"). + WillReturnRows(sqlmock.NewRows(cols)) + + mock.ExpectQuery("SELECT w.id, w.name.*WHERE w.id = \\$1 AND w.status"). + WithArgs("ws-pm"). + WillReturnRows(sqlmock.NewRows(cols). + AddRow("ws-pm", "PM Workspace", "manager", 2, "online", []byte("null"), "http://pm", nil, 1)) + + return NewDiscoveryHandler(), mock +} + +func runPeersWithQuery(t *testing.T, handler *DiscoveryHandler, q string) []map[string]interface{} { + t.Helper() + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Params = gin.Params{{Key: "id", Value: "ws-self"}} + url := "/registry/ws-self/peers" + if q != "" { + url += "?q=" + q + } + c.Request = httptest.NewRequest("GET", url, nil) + handler.Peers(c) + if w.Code != http.StatusOK { + t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String()) + } + var peers []map[string]interface{} + if err := json.Unmarshal(w.Body.Bytes(), &peers); err != nil { + t.Fatalf("parse response: %v", err) + } + return peers +} + +// TestPeers_NoQ_ReturnsAll locks in the regression baseline: without ?q, +// the handler returns the full 3-peer fixture (alpha + beta + pm). +func TestPeers_NoQ_ReturnsAll(t *testing.T) { + handler, mock := peersFilterFixture(t) + peers := runPeersWithQuery(t, handler, "") + if len(peers) != 3 { + t.Errorf("no-q: expected 3 peers, got %d (%v)", len(peers), peerNames(peers)) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet sqlmock expectations: %v", err) + } +} + +// TestPeers_Q_FiltersByName covers the primary CWE-862 fix path. +func TestPeers_Q_FiltersByName(t *testing.T) { + handler, mock := peersFilterFixture(t) + peers := runPeersWithQuery(t, handler, "alpha") + if len(peers) != 1 || peers[0]["id"] != "ws-alpha" { + t.Errorf("q=alpha: expected just ws-alpha, got %v", peerNames(peers)) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet sqlmock expectations: %v", err) + } +} + +// TestPeers_Q_CaseInsensitive guards against a future strings.Contains +// without ToLower regression. 
+func TestPeers_Q_CaseInsensitive(t *testing.T) { + handler, mock := peersFilterFixture(t) + peers := runPeersWithQuery(t, handler, "ALPHA") + if len(peers) != 1 || peers[0]["id"] != "ws-alpha" { + t.Errorf("q=ALPHA: expected ws-alpha, got %v", peerNames(peers)) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet sqlmock expectations: %v", err) + } +} + +// TestPeers_Q_FiltersByRole — role-based match (designer is the role of +// ws-beta; the substring "design" must surface beta and only beta). +func TestPeers_Q_FiltersByRole(t *testing.T) { + handler, mock := peersFilterFixture(t) + peers := runPeersWithQuery(t, handler, "design") + if len(peers) != 1 || peers[0]["id"] != "ws-beta" { + t.Errorf("q=design: expected ws-beta, got %v", peerNames(peers)) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet sqlmock expectations: %v", err) + } +} + +// TestPeers_Q_NoMatches returns an empty array, not null. +func TestPeers_Q_NoMatches(t *testing.T) { + handler, mock := peersFilterFixture(t) + peers := runPeersWithQuery(t, handler, "nonexistent") + if len(peers) != 0 { + t.Errorf("q=nonexistent: expected 0 peers, got %d (%v)", len(peers), peerNames(peers)) + } + // JSON should serialize as `[]` not `null` — re-encode and inspect. + if got, _ := json.Marshal(peers); string(got) != "[]" { + t.Errorf("expected JSON [], got %s", string(got)) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet sqlmock expectations: %v", err) + } +} + +// TestPeers_Q_WhitespaceOnly is treated as an empty filter. +func TestPeers_Q_WhitespaceOnly(t *testing.T) { + handler, mock := peersFilterFixture(t) + peers := runPeersWithQuery(t, handler, "%20%20") + if len(peers) != 3 { + t.Errorf("q=whitespace: expected unfiltered 3 peers, got %d", len(peers)) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet sqlmock expectations: %v", err) + } +} + +func peerNames(peers []map[string]interface{}) []string { + out := make([]string, 0, len(peers)) + for _, p := range peers { + if id, ok := p["id"].(string); ok { + out = append(out, id) + } + } + return out +} + // ==================== CheckAccess ==================== func TestCheckAccess_BadJSON(t *testing.T) { From 641b1391e2a2c4782b528880408a3f393d421f0c Mon Sep 17 00:00:00 2001 From: rabbitblood Date: Sun, 26 Apr 2026 01:02:19 -0700 Subject: [PATCH 04/34] refactor(discovery): apply simplify findings on #1038 PR MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Code-quality + efficiency review of PR #2081: - Drop comma-ok on map type-asserts in filterPeersByQuery — queryPeerMaps writes name/role unconditionally as string, so the silent-empty-string fallback was cargo-culted defense that would HIDE a real upstream shape change in tests rather than surface it. Plain p["name"].(string) panics on violation, caught by tests. - Trim filterPeersByQuery doc from 5 lines to 1 — function is 15 lines and self-evident. - Refactor 6 separate Test functions into one table-driven TestPeers_QFilter with 6 sub-tests. Net ~80 lines saved + naming becomes readable subtest names instead of TestPeers_Q_Foo_Bar. - Set-based peer-id comparison (peerIDSet) replaces fragile peers[0]["id"] == "ws-alpha" asserts that would silently mask a future sort/order regression on the production code. - Fix the broken TestPeers_Q_NoMatches assertion: re-encoding an unmarshalled []map collapses both null and [] to [], so the previous json.Marshal(peers) == "[]" check was tautological. 
Move the [] vs null distinction to a dedicated test (TestPeers_Q_NoMatches_RawBodyIsArrayNotNull) that inspects the recorder body BEFORE unmarshal. runPeersWithQuery now returns both parsed peers and raw body so the nil-guard test can use the bytes directly. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../internal/handlers/discovery.go | 13 +- .../internal/handlers/discovery_test.go | 180 +++++++++--------- 2 files changed, 94 insertions(+), 99 deletions(-) diff --git a/workspace-server/internal/handlers/discovery.go b/workspace-server/internal/handlers/discovery.go index fbc2cc2a..6422e002 100644 --- a/workspace-server/internal/handlers/discovery.go +++ b/workspace-server/internal/handlers/discovery.go @@ -257,13 +257,8 @@ func (h *DiscoveryHandler) Peers(c *gin.Context) { c.JSON(http.StatusOK, peers) } -// filterPeersByQuery returns the subset of peers whose name or role -// case-insensitively contains q. An empty/whitespace-only q is a no-op -// (returns the input slice unchanged) so the no-filter path stays free -// of allocations. Map values come from queryPeerMaps which always -// stringifies name + role, so the type asserts here cannot fail in -// practice — the comma-ok form belt-and-braces against future shape -// changes upstream. +// filterPeersByQuery returns peers whose name or role case-insensitively +// contains q. Whitespace-trimmed empty q is a no-op (returns input unchanged). func filterPeersByQuery(peers []map[string]interface{}, q string) []map[string]interface{} { q = strings.TrimSpace(q) if q == "" { @@ -272,8 +267,8 @@ func filterPeersByQuery(peers []map[string]interface{}, q string) []map[string]i needle := strings.ToLower(q) out := make([]map[string]interface{}, 0, len(peers)) for _, p := range peers { - name, _ := p["name"].(string) - role, _ := p["role"].(string) + name := p["name"].(string) + role := p["role"].(string) if strings.Contains(strings.ToLower(name), needle) || strings.Contains(strings.ToLower(role), needle) { out = append(out, p) diff --git a/workspace-server/internal/handlers/discovery_test.go b/workspace-server/internal/handlers/discovery_test.go index a027723f..e2368aac 100644 --- a/workspace-server/internal/handlers/discovery_test.go +++ b/workspace-server/internal/handlers/discovery_test.go @@ -7,6 +7,7 @@ import ( "encoding/json" "net/http" "net/http/httptest" + "strings" "testing" "github.com/DATA-DOG/go-sqlmock" @@ -260,9 +261,10 @@ func TestPeers_RootWorkspace_NoPeers(t *testing.T) { // ==================== Peers — ?q= filter (#1038) ==================== -// peersFilterFixture mocks all 3 SQL reads with a known 3-peer set so each -// q-filter test can focus on the post-fetch substring-match behaviour -// without re-stating the full query expectations. +// peersFilterFixture mocks the 4 SQL reads (parent_id lookup + siblings + +// children + parent) with a known 3-peer set so each q-filter test can +// focus on the post-fetch substring-match behaviour. Returns the handler +// and the live mock so callers can assert ExpectationsWereMet at the end. func peersFilterFixture(t *testing.T) (*DiscoveryHandler, sqlmock.Sqlmock) { t.Helper() mock := setupTestDB(t) @@ -292,7 +294,13 @@ func peersFilterFixture(t *testing.T) (*DiscoveryHandler, sqlmock.Sqlmock) { return NewDiscoveryHandler(), mock } -func runPeersWithQuery(t *testing.T, handler *DiscoveryHandler, q string) []map[string]interface{} { +// runPeersWithQuery invokes Peers and returns BOTH the parsed peers and +// the raw response body. 
The raw body is needed by TestPeers_Q_NoMatches +// to distinguish JSON `[]` (intended) from `null` (regression of the +// nil-guard at line 254-256) — once unmarshalled, both collapse to +// len==0 and re-marshal to `[]`, so checking only the parsed value is +// tautological. +func runPeersWithQuery(t *testing.T, handler *DiscoveryHandler, q string) ([]map[string]interface{}, []byte) { t.Helper() w := httptest.NewRecorder() c, _ := gin.CreateTestContext(w) @@ -306,98 +314,90 @@ func runPeersWithQuery(t *testing.T, handler *DiscoveryHandler, q string) []map[ if w.Code != http.StatusOK { t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String()) } + body := w.Body.Bytes() var peers []map[string]interface{} - if err := json.Unmarshal(w.Body.Bytes(), &peers); err != nil { + if err := json.Unmarshal(body, &peers); err != nil { t.Fatalf("parse response: %v", err) } - return peers + return peers, body } -// TestPeers_NoQ_ReturnsAll locks in the regression baseline: without ?q, -// the handler returns the full 3-peer fixture (alpha + beta + pm). -func TestPeers_NoQ_ReturnsAll(t *testing.T) { - handler, mock := peersFilterFixture(t) - peers := runPeersWithQuery(t, handler, "") - if len(peers) != 3 { - t.Errorf("no-q: expected 3 peers, got %d (%v)", len(peers), peerNames(peers)) - } - if err := mock.ExpectationsWereMet(); err != nil { - t.Errorf("unmet sqlmock expectations: %v", err) - } -} - -// TestPeers_Q_FiltersByName covers the primary CWE-862 fix path. -func TestPeers_Q_FiltersByName(t *testing.T) { - handler, mock := peersFilterFixture(t) - peers := runPeersWithQuery(t, handler, "alpha") - if len(peers) != 1 || peers[0]["id"] != "ws-alpha" { - t.Errorf("q=alpha: expected just ws-alpha, got %v", peerNames(peers)) - } - if err := mock.ExpectationsWereMet(); err != nil { - t.Errorf("unmet sqlmock expectations: %v", err) - } -} - -// TestPeers_Q_CaseInsensitive guards against a future strings.Contains -// without ToLower regression. -func TestPeers_Q_CaseInsensitive(t *testing.T) { - handler, mock := peersFilterFixture(t) - peers := runPeersWithQuery(t, handler, "ALPHA") - if len(peers) != 1 || peers[0]["id"] != "ws-alpha" { - t.Errorf("q=ALPHA: expected ws-alpha, got %v", peerNames(peers)) - } - if err := mock.ExpectationsWereMet(); err != nil { - t.Errorf("unmet sqlmock expectations: %v", err) - } -} - -// TestPeers_Q_FiltersByRole — role-based match (designer is the role of -// ws-beta; the substring "design" must surface beta and only beta). -func TestPeers_Q_FiltersByRole(t *testing.T) { - handler, mock := peersFilterFixture(t) - peers := runPeersWithQuery(t, handler, "design") - if len(peers) != 1 || peers[0]["id"] != "ws-beta" { - t.Errorf("q=design: expected ws-beta, got %v", peerNames(peers)) - } - if err := mock.ExpectationsWereMet(); err != nil { - t.Errorf("unmet sqlmock expectations: %v", err) - } -} - -// TestPeers_Q_NoMatches returns an empty array, not null. -func TestPeers_Q_NoMatches(t *testing.T) { - handler, mock := peersFilterFixture(t) - peers := runPeersWithQuery(t, handler, "nonexistent") - if len(peers) != 0 { - t.Errorf("q=nonexistent: expected 0 peers, got %d (%v)", len(peers), peerNames(peers)) - } - // JSON should serialize as `[]` not `null` — re-encode and inspect. - if got, _ := json.Marshal(peers); string(got) != "[]" { - t.Errorf("expected JSON [], got %s", string(got)) - } - if err := mock.ExpectationsWereMet(); err != nil { - t.Errorf("unmet sqlmock expectations: %v", err) - } -} - -// TestPeers_Q_WhitespaceOnly is treated as an empty filter. 
-func TestPeers_Q_WhitespaceOnly(t *testing.T) { - handler, mock := peersFilterFixture(t) - peers := runPeersWithQuery(t, handler, "%20%20") - if len(peers) != 3 { - t.Errorf("q=whitespace: expected unfiltered 3 peers, got %d", len(peers)) - } - if err := mock.ExpectationsWereMet(); err != nil { - t.Errorf("unmet sqlmock expectations: %v", err) - } -} - -func peerNames(peers []map[string]interface{}) []string { - out := make([]string, 0, len(peers)) +// peerIDSet returns the set of peer ids — order-independent comparison +// avoids fragile peers[0]["id"] asserts that would silently mask a future +// sort/order change. +func peerIDSet(peers []map[string]interface{}) map[string]struct{} { + out := make(map[string]struct{}, len(peers)) for _, p := range peers { - if id, ok := p["id"].(string); ok { - out = append(out, id) - } + out[p["id"].(string)] = struct{}{} + } + return out +} + +// TestPeers_QFilter covers the rule classifier — append-order +// independent (uses set membership) so a future sort regression on the +// production code can't slip through. NoMatches has its own raw-body +// check (see TestPeers_Q_NoMatches_RawBodyIsArrayNotNull below) because +// the `[]` vs `null` distinction collapses after json.Unmarshal. +func TestPeers_QFilter(t *testing.T) { + cases := []struct { + name string + q string + wantIDs []string + }{ + {"no-q returns all", "", []string{"ws-alpha", "ws-beta", "ws-pm"}}, + {"name match", "alpha", []string{"ws-alpha"}}, + {"name match case-insensitive", "ALPHA", []string{"ws-alpha"}}, + {"role match", "design", []string{"ws-beta"}}, + {"no matches", "nonexistent", nil}, + {"whitespace-only is no-op", "%20%20", []string{"ws-alpha", "ws-beta", "ws-pm"}}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + handler, mock := peersFilterFixture(t) + peers, _ := runPeersWithQuery(t, handler, tc.q) + + got := peerIDSet(peers) + want := make(map[string]struct{}, len(tc.wantIDs)) + for _, id := range tc.wantIDs { + want[id] = struct{}{} + } + if len(got) != len(want) { + t.Fatalf("len: got %d %v, want %d %v", len(got), keysOf(got), len(want), tc.wantIDs) + } + for id := range want { + if _, ok := got[id]; !ok { + t.Errorf("missing id %q (got %v, want %v)", id, keysOf(got), tc.wantIDs) + } + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet sqlmock expectations: %v", err) + } + }) + } +} + +// TestPeers_Q_NoMatches_RawBodyIsArrayNotNull verifies the `peers = make(...)` +// nil-guard at the end of Peers — when filtering reduces the slice to +// non-nil-but-empty AND the original was nil, JSON must be `[]` not `null`. +// This is the assertion the previous TestPeers_Q_NoMatches falsely claimed +// to make: re-encoding an unmarshalled []map collapses null and [] both +// to []. The fix here checks the recorder body bytes BEFORE unmarshal. 
+func TestPeers_Q_NoMatches_RawBodyIsArrayNotNull(t *testing.T) { + handler, mock := peersFilterFixture(t) + _, body := runPeersWithQuery(t, handler, "nonexistent") + got := strings.TrimSpace(string(body)) + if got != "[]" { + t.Errorf("raw body: got %q, want []", got) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet sqlmock expectations: %v", err) + } +} + +func keysOf(m map[string]struct{}) []string { + out := make([]string, 0, len(m)) + for k := range m { + out = append(out, k) } return out } From 9b42a5e31134d95465c2b63f5081bd4f63bb9249 Mon Sep 17 00:00:00 2001 From: rabbitblood Date: Sun, 26 Apr 2026 01:30:36 -0700 Subject: [PATCH 05/34] test(ci): runtime + a2a-sdk pin compatibility gate (controlplane#253) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes Molecule-AI/molecule-controlplane#253. Prevents recurrence of the 5-hour staging outage from 2026-04-24: molecule-ai-workspace-runtime 0.1.13 declared `a2a-sdk<1.0` in its metadata but actually imported `a2a.server.routes` (1.0+ only). pip resolved successfully; every tenant workspace crashed at import. The canary tenant ultimately caught it but only after 5 hours of degraded staging. PR #249 fixed the version pin manually; nothing automated catches the same class of bug for the next release. This workflow: - Installs molecule-ai-workspace-runtime fresh from PyPI in a Python 3.11 venv (mirrors EC2 user-data install pattern) - Layers in workspace/requirements.txt (the runtime image's actual dep set, including the a2a-sdk[http-server]>=1.0,<2.0 pin) - Runs `from molecule_runtime.main import main_sync` — same import the runtime entrypoint does - Fails CI if pip resolution silently produced a combo that the runtime can't actually import Triggers: - PR + push to main/staging touching workspace/requirements.txt or this workflow (catches local pin changes) - Daily 13:00 UTC schedule (catches upstream PyPI publishes that break the pin combo without any change in our repo) - workflow_dispatch (manual) Concurrency cancels in-progress runs on the same ref. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/runtime-pin-compat.yml | 59 ++++++++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 .github/workflows/runtime-pin-compat.yml diff --git a/.github/workflows/runtime-pin-compat.yml b/.github/workflows/runtime-pin-compat.yml new file mode 100644 index 00000000..042620e8 --- /dev/null +++ b/.github/workflows/runtime-pin-compat.yml @@ -0,0 +1,59 @@ +name: Runtime Pin Compatibility + +# CI gate that prevents the 5-hour staging outage from 2026-04-24 from +# recurring (controlplane#253). The original failure mode: +# 1. molecule-ai-workspace-runtime 0.1.13 declared `a2a-sdk<1.0` in its +# requires_dist metadata (incorrect — it actually imports +# a2a.server.routes which only exists in a2a-sdk 1.0+) +# 2. `pip install molecule-ai-workspace-runtime` resolved cleanly +# 3. `from molecule_runtime.main import main_sync` raised ImportError +# 4. Every tenant workspace crashed; the canary tenant caught it but +# only after 5 hours of degraded staging +# +# This workflow installs the runtime in a fresh Python venv from PyPI +# and tries the same import the EC2 user-data does. If pip resolution +# silently produces a broken combo, this gate fails before the tenant +# image gets published. 
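+#
+# Local reproduction is roughly the same steps the job below runs (sketch
+# only — paths are the ones already used in this repo):
+#   python -m venv /tmp/venv
+#   /tmp/venv/bin/pip install molecule-ai-workspace-runtime
+#   /tmp/venv/bin/pip install -r workspace/requirements.txt
+#   /tmp/venv/bin/python -c "from molecule_runtime.main import main_sync"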
+ +on: + push: + branches: [main, staging] + paths: + - 'workspace/requirements.txt' + - '.github/workflows/runtime-pin-compat.yml' + pull_request: + branches: [main, staging] + paths: + - 'workspace/requirements.txt' + - '.github/workflows/runtime-pin-compat.yml' + # Daily catch for upstream PyPI publishes that break the pin combo + # without any change in our repo (e.g. someone re-yanks an a2a-sdk + # release or molecule-ai-workspace-runtime publishes a bad bump). + schedule: + - cron: '0 13 * * *' # 06:00 PT + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + default-install: + name: Default install + import smoke + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: '3.11' + - name: Install runtime + workspace requirements + run: | + python -m venv /tmp/venv + /tmp/venv/bin/pip install --upgrade pip + /tmp/venv/bin/pip install molecule-ai-workspace-runtime + /tmp/venv/bin/pip install -r workspace/requirements.txt + /tmp/venv/bin/pip show molecule-ai-workspace-runtime a2a-sdk \ + | grep -E '^(Name|Version):' + - name: Smoke import — fail if metadata declares deps that don't satisfy real imports + run: | + /tmp/venv/bin/python -c "from molecule_runtime.main import main_sync; print('runtime imports OK')" From b817251c85fa8e79c301e25aa9f5751c1b6642ca Mon Sep 17 00:00:00 2001 From: rabbitblood Date: Sun, 26 Apr 2026 01:32:56 -0700 Subject: [PATCH 06/34] refactor(ci): apply simplify findings on #2083 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Review of the runtime-pin-compat workflow: - Add merge_group trigger so when this becomes a required check the queue green-checks it (mirrors ci.yml convention). - Cache pip on workspace/requirements.txt — actions/setup-python@v5 with cache: pip + cache-dependency-path. Saves ~30s per fire. - Document the load-bearing install order: runtime FIRST so pip honors the runtime's declared a2a-sdk constraint (the surface that broke 2026-04-24); workspace/requirements.txt SECOND so a2a-sdk is upgraded to the runtime image's pinned version. Import smoke validates the upgraded combination. Skipped: branch-protection wiring (separate ops decision, not in scope here); ci.yml integration (the standalone schedule trigger is the load-bearing reason to keep this workflow separate). Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/runtime-pin-compat.yml | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/.github/workflows/runtime-pin-compat.yml b/.github/workflows/runtime-pin-compat.yml index 042620e8..a1aeb7cd 100644 --- a/.github/workflows/runtime-pin-compat.yml +++ b/.github/workflows/runtime-pin-compat.yml @@ -32,6 +32,10 @@ on: schedule: - cron: '0 13 * * *' # 06:00 PT workflow_dispatch: + # Required-check support: when this becomes a branch-protection gate, + # merge_group runs let the queue green-check this in addition to PRs. 
+ merge_group: + types: [checks_requested] concurrency: group: ${{ github.workflow }}-${{ github.ref }} @@ -46,7 +50,16 @@ jobs: - uses: actions/setup-python@v5 with: python-version: '3.11' + cache: pip + cache-dependency-path: workspace/requirements.txt - name: Install runtime + workspace requirements + # Install order is load-bearing: install the runtime FIRST so pip + # honors whatever a2a-sdk constraint the runtime metadata declares + # (this is the surface that broke in 2026-04-24 — runtime declared + # `a2a-sdk<1.0` but actually needed >=1.0). The follow-up install + # of workspace/requirements.txt then upgrades a2a-sdk to the + # constraint our runtime image actually pins. The import smoke + # below verifies the upgraded combination is consistent. run: | python -m venv /tmp/venv /tmp/venv/bin/pip install --upgrade pip From 5ce7af2d2cbf1161212ff6da82f136dfe3efebf5 Mon Sep 17 00:00:00 2001 From: rabbitblood Date: Sun, 26 Apr 2026 01:59:56 -0700 Subject: [PATCH 07/34] fix(ci): set WORKSPACE_ID for the runtime-pin smoke import MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit platform_auth.py validates WORKSPACE_ID at module load — EC2 user-data sets it from cloud-init, but the CI smoke-test was missing it and failed with 'WORKSPACE_ID is empty'. Set a placeholder UUID so the import gate exercises only the dep-resolution path. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/runtime-pin-compat.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/runtime-pin-compat.yml b/.github/workflows/runtime-pin-compat.yml index a1aeb7cd..976c3194 100644 --- a/.github/workflows/runtime-pin-compat.yml +++ b/.github/workflows/runtime-pin-compat.yml @@ -68,5 +68,10 @@ jobs: /tmp/venv/bin/pip show molecule-ai-workspace-runtime a2a-sdk \ | grep -E '^(Name|Version):' - name: Smoke import — fail if metadata declares deps that don't satisfy real imports + # WORKSPACE_ID is validated at import time by platform_auth.py — EC2 + # user-data sets it from the cloud-init template; set a placeholder + # here so the import smoke doesn't trip on the env-var guard. + env: + WORKSPACE_ID: 00000000-0000-0000-0000-000000000001 run: | /tmp/venv/bin/python -c "from molecule_runtime.main import main_sync; print('runtime imports OK')" From 577294b8f4593abfddba2239c097e7830b15fd8e Mon Sep 17 00:00:00 2001 From: rabbitblood Date: Sun, 26 Apr 2026 02:01:57 -0700 Subject: [PATCH 08/34] test(config): lock ComplianceConfig default to owasp_agentic (#2059) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR #2056 flipped ComplianceConfig.mode default from "" to "owasp_agentic" so every shipped template gets prompt-injection detection + PII redaction by default. The flip is correct + already shipping, but no test asserts the new default — a silent revert (or a refactor that reintroduces the old "" default) would pass workspace/tests/ and ship a workspace with compliance silently off. 
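For reference, the two yaml shapes in play (illustrative sketch — the keys
are exactly what the tests below feed to load_config):

    compliance: {}     # empty block, or key omitted entirely → owasp_agentic default

    compliance:
      mode: ""         # documented opt-out → compliance disabled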
Add 4 regression tests: - test_compliance_dataclass_default — ComplianceConfig() with no args returns mode='owasp_agentic' + prompt_injection='detect' - test_compliance_default_when_yaml_omits_block — load_config on a yaml without `compliance:` key still produces owasp_agentic - test_compliance_default_when_yaml_block_is_empty — load_config on `compliance: {}` (a common shape during template editing) still produces owasp_agentic; covers the load_config() `.get("mode", "owasp_agentic")` default-fill path - test_compliance_explicit_optout_still_works — `mode: ""` in yaml must disable compliance (the documented opt-out path) 23/23 tests pass locally (4 new + 19 existing). Co-Authored-By: Claude Opus 4.7 (1M context) --- workspace/tests/test_config.py | 52 ++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/workspace/tests/test_config.py b/workspace/tests/test_config.py index 786bc8b7..a24eea06 100644 --- a/workspace/tests/test_config.py +++ b/workspace/tests/test_config.py @@ -6,6 +6,7 @@ import yaml from config import ( A2AConfig, + ComplianceConfig, DelegationConfig, SandboxConfig, WorkspaceConfig, @@ -244,3 +245,54 @@ def test_shared_context_from_yaml(tmp_path): cfg = load_config(str(tmp_path)) assert cfg.shared_context == ["guidelines.md", "architecture.md"] + + +# ===== Compliance default lock (#2059) ===== +# +# PR #2056 flipped ComplianceConfig.mode default from "" to "owasp_agentic" +# so every shipped template gets prompt-injection detection + PII redaction +# by default. These tests pin the new default at all four entry points so +# a silent revert (or a refactor that reintroduces the old no-op default) +# fails fast instead of shipping a workspace with compliance silently off. + + +def test_compliance_dataclass_default(): + """ComplianceConfig() — no args — must default to owasp_agentic + detect.""" + cfg = ComplianceConfig() + assert cfg.mode == "owasp_agentic" + assert cfg.prompt_injection == "detect" + + +def test_compliance_default_when_yaml_omits_block(tmp_path): + """A config.yaml with no `compliance:` key still gets owasp_agentic.""" + config_yaml = tmp_path / "config.yaml" + config_yaml.write_text(yaml.dump({})) + + cfg = load_config(str(tmp_path)) + assert cfg.compliance.mode == "owasp_agentic" + assert cfg.compliance.prompt_injection == "detect" + + +def test_compliance_default_when_yaml_block_is_empty(tmp_path): + """An explicitly empty `compliance: {}` block still gets owasp_agentic. + + This is the load_config default-fill path — `.get("mode", "owasp_agentic")` + must keep delivering the new default even when the operator dropped a + bare `compliance:` block in their yaml (a common shape during template + editing). 
+ """ + config_yaml = tmp_path / "config.yaml" + config_yaml.write_text(yaml.dump({"compliance": {}})) + + cfg = load_config(str(tmp_path)) + assert cfg.compliance.mode == "owasp_agentic" + assert cfg.compliance.prompt_injection == "detect" + + +def test_compliance_explicit_optout_still_works(tmp_path): + """Explicit `mode: ""` in yaml must disable compliance — opt-out path.""" + config_yaml = tmp_path / "config.yaml" + config_yaml.write_text(yaml.dump({"compliance": {"mode": ""}})) + + cfg = load_config(str(tmp_path)) + assert cfg.compliance.mode == "" From 4a4a7408040bf55b610d545c8498c69cd0f78d80 Mon Sep 17 00:00:00 2001 From: rabbitblood Date: Sun, 26 Apr 2026 02:03:59 -0700 Subject: [PATCH 09/34] refactor(test_config): parametrize the 3 yaml-default cases (simplify on #2085) Collapses test_compliance_default_when_yaml_omits_block, _when_yaml_block_is_empty, _explicit_optout_still_works into one parametrized test_compliance_default_via_load_config with three ids (yaml_omits_block, yaml_block_empty, yaml_explicit_optout). The dataclass-default test stays separate (no tmp_path needed). Coverage and assertions identical; net -19 lines, same 4 logical cases. prompt_injection check moves out of per-case to a single tail-assert since no payload overrode it. Co-Authored-By: Claude Opus 4.7 (1M context) --- workspace/tests/test_config.py | 51 +++++++++++++++------------------- 1 file changed, 22 insertions(+), 29 deletions(-) diff --git a/workspace/tests/test_config.py b/workspace/tests/test_config.py index a24eea06..2a805b3f 100644 --- a/workspace/tests/test_config.py +++ b/workspace/tests/test_config.py @@ -2,6 +2,7 @@ import os +import pytest import yaml from config import ( @@ -263,36 +264,28 @@ def test_compliance_dataclass_default(): assert cfg.prompt_injection == "detect" -def test_compliance_default_when_yaml_omits_block(tmp_path): - """A config.yaml with no `compliance:` key still gets owasp_agentic.""" +@pytest.mark.parametrize( + "yaml_payload, expected_mode", + [ + # No `compliance:` key at all — full default path. + ({}, "owasp_agentic"), + # Explicit empty block — exercises load_config's + # `.get("mode", "owasp_agentic")` default-fill at config.py:377. + # Common shape during template editing. + ({"compliance": {}}, "owasp_agentic"), + # Documented opt-out: explicit `mode: ""` disables compliance. + ({"compliance": {"mode": ""}}, ""), + ], + ids=["yaml_omits_block", "yaml_block_empty", "yaml_explicit_optout"], +) +def test_compliance_default_via_load_config(tmp_path, yaml_payload, expected_mode): + """load_config honors the owasp_agentic default at every yaml shape and + still respects explicit opt-out.""" config_yaml = tmp_path / "config.yaml" - config_yaml.write_text(yaml.dump({})) + config_yaml.write_text(yaml.dump(yaml_payload)) cfg = load_config(str(tmp_path)) - assert cfg.compliance.mode == "owasp_agentic" + assert cfg.compliance.mode == expected_mode + # prompt_injection was never overridden in any payload — must stay at + # the dataclass default regardless of the mode value. assert cfg.compliance.prompt_injection == "detect" - - -def test_compliance_default_when_yaml_block_is_empty(tmp_path): - """An explicitly empty `compliance: {}` block still gets owasp_agentic. - - This is the load_config default-fill path — `.get("mode", "owasp_agentic")` - must keep delivering the new default even when the operator dropped a - bare `compliance:` block in their yaml (a common shape during template - editing). 
- """ - config_yaml = tmp_path / "config.yaml" - config_yaml.write_text(yaml.dump({"compliance": {}})) - - cfg = load_config(str(tmp_path)) - assert cfg.compliance.mode == "owasp_agentic" - assert cfg.compliance.prompt_injection == "detect" - - -def test_compliance_explicit_optout_still_works(tmp_path): - """Explicit `mode: ""` in yaml must disable compliance — opt-out path.""" - config_yaml = tmp_path / "config.yaml" - config_yaml.write_text(yaml.dump({"compliance": {"mode": ""}})) - - cfg = load_config(str(tmp_path)) - assert cfg.compliance.mode == "" From 48b494def32f418d7feff7499988c6bbf1d0c4cc Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Sun, 26 Apr 2026 02:17:51 -0700 Subject: [PATCH 10/34] fix(provisioner): nil guards on Stop/IsRunning, unblock contract tests (closes #1813) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both backends panicked when called on a zero-valued or nil receiver: Provisioner.{Stop,IsRunning} dereferenced p.cli; CPProvisioner.{Stop, IsRunning} dereferenced p.httpClient. The orphan sweeper and shutdown paths can call these speculatively where the receiver isn't fully wired — the panic crashed the goroutine instead of the caller seeing a clean error. Three changes: 1. Add ErrNoBackend (typed sentinel) and nil-guard the four methods. - Provisioner.{Stop,IsRunning}: guard p == nil || p.cli == nil at the top. - CPProvisioner.Stop: guard p == nil up top, then httpClient nil AFTER resolveInstanceID + empty-instance check (the empty instance_id path doesn't need HTTP and stays a no-op success even on zero-valued receivers — preserved historical contract from TestIsRunning_EmptyInstanceIDReturnsFalse). - CPProvisioner.IsRunning: same shape — empty instance_id stays (false, nil); httpClient-nil with non-empty instance_id returns ErrNoBackend. 2. Flip the t.Skip on TestDockerBackend_Contract + TestCPProvisionerBackend_Contract — both contract tests run now that the panics are gone. Skipped scenarios were the regression guard for this fix. 3. Add TestZeroValuedBackends_NoPanic — explicit assertion that zero-valued and nil receivers return cleanly (no panic). Docker backend always returns ErrNoBackend on zero-valued; CPProvisioner may return (false, nil) when the DB-lookup layer absorbs the case (no instance to query → no HTTP needed). Both are acceptable per the issue's contract — the gate is no-panic. Tests: - 6 sub-cases across the new TestZeroValuedBackends_NoPanic - TestDockerBackend_Contract + TestCPProvisionerBackend_Contract now run their 2 scenarios (4 sub-cases each) - All existing provisioner tests still green - go build ./... + go vet ./... + go test ./... clean Closes drift-risk #6 in docs/architecture/backends.md. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../provisioner/backend_contract_test.go | 88 ++++++++++++++++--- .../internal/provisioner/cp_provisioner.go | 27 +++++- .../internal/provisioner/provisioner.go | 16 ++++ 3 files changed, 120 insertions(+), 11 deletions(-) diff --git a/workspace-server/internal/provisioner/backend_contract_test.go b/workspace-server/internal/provisioner/backend_contract_test.go index 0c31daf0..a5948f54 100644 --- a/workspace-server/internal/provisioner/backend_contract_test.go +++ b/workspace-server/internal/provisioner/backend_contract_test.go @@ -38,6 +38,7 @@ package provisioner import ( "context" + "errors" "testing" ) @@ -138,23 +139,90 @@ func runBackendContract(t *testing.T, backend Backend) { } // TestDockerBackend_Contract feeds the Docker backend through the -// shared runner. Skipped pending hardening: the scaffold exposed a -// real bug — neither backend's Stop/IsRunning handles a nil underlying -// client gracefully (both panic). Filing that as a separate issue; -// once both backends return (*, error) instead of panicking, flip this -// to t.Run and the contract scenarios exercise the fix. +// shared runner. Was Skip'd pending hardening of Provisioner.{Stop, +// IsRunning} against nil Docker client — that hardening landed in +// fix/provisioner-nil-guards-1813, so the scenarios run now. A +// zero-valued *Provisioner returns ErrNoBackend from each method +// instead of panicking, satisfying the no-panic contract. func TestDockerBackend_Contract(t *testing.T) { - t.Skip("scaffolding only — unblock by hardening Provisioner.{Stop,IsRunning} against nil Docker client; see docs/architecture/backends.md drift risk #6") var p Provisioner runBackendContract(t, &p) } // TestCPProvisionerBackend_Contract — same story as the Docker variant. -// CPProvisioner panics on nil httpClient today; once that's hardened, -// remove the Skip and this runner exercises both backends through a -// single contract. +// Hardening landed in fix/provisioner-nil-guards-1813; zero-valued +// *CPProvisioner returns ErrNoBackend from Stop/IsRunning instead of +// panicking on nil httpClient. func TestCPProvisionerBackend_Contract(t *testing.T) { - t.Skip("scaffolding only — unblock by hardening CPProvisioner.{Stop,IsRunning} against nil httpClient; see docs/architecture/backends.md drift risk #6") var p CPProvisioner runBackendContract(t, &p) } + +// TestZeroValuedBackends_NoPanic pins the contract that motivated #1813: +// a Backend with no underlying client wired up must return cleanly, +// never panic. The orphan sweeper and shutdown paths both call these +// methods speculatively where the receiver could be unset. +// +// The exact error shape varies between backends: +// - Docker Provisioner has no DB-lookup layer; zero-valued always +// returns ErrNoBackend. +// - CPProvisioner threads through a package-level resolveInstanceID +// lookup; when the DB has no row for the workspace (or db.DB +// itself is nil), instance_id comes back empty and the method +// short-circuits to (false, nil). Only when there's a real +// instance_id to query does the missing-httpClient case surface +// as ErrNoBackend. +// +// Both shapes are acceptable — the contract is "no panic, error is +// nil or ErrNoBackend." Anything else is a bug. 
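+//
+// Caller-side shape the sentinel enables (sketch only — the real sweeper
+// wiring may differ; the point is errors.Is on ErrNoBackend):
+//
+//   running, err := backend.IsRunning(ctx, wsID)
+//   if errors.Is(err, ErrNoBackend) {
+//       continue // backend not wired — skip, don't treat the workspace as down
+//   }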
+func TestZeroValuedBackends_NoPanic(t *testing.T) { + acceptableErr := func(err error) bool { + return err == nil || errors.Is(err, ErrNoBackend) + } + t.Run("Provisioner.Stop", func(t *testing.T) { + var p Provisioner + if err := p.Stop(context.Background(), "any"); !errors.Is(err, ErrNoBackend) { + t.Errorf("zero-valued Provisioner.Stop: got %v, want ErrNoBackend", err) + } + }) + t.Run("Provisioner.IsRunning", func(t *testing.T) { + var p Provisioner + running, err := p.IsRunning(context.Background(), "any") + if !errors.Is(err, ErrNoBackend) { + t.Errorf("zero-valued Provisioner.IsRunning: got err=%v, want ErrNoBackend", err) + } + if running { + t.Errorf("zero-valued Provisioner.IsRunning: got running=true, want false") + } + }) + t.Run("CPProvisioner.Stop", func(t *testing.T) { + var p CPProvisioner + if err := p.Stop(context.Background(), "any"); !acceptableErr(err) { + t.Errorf("zero-valued CPProvisioner.Stop: got %v, want nil or ErrNoBackend", err) + } + }) + t.Run("CPProvisioner.IsRunning", func(t *testing.T) { + var p CPProvisioner + _, err := p.IsRunning(context.Background(), "any") + if !acceptableErr(err) { + t.Errorf("zero-valued CPProvisioner.IsRunning: got err=%v, want nil or ErrNoBackend", err) + } + }) + // Nil receivers must also be safe. The orphan sweeper code path can + // hit Stop on a *Provisioner that's nil (not just zero-valued) when + // the wiring path hasn't initialized at all. For nil receivers we + // always expect ErrNoBackend (the nil-check at the top fires before + // any DB lookup could absorb the case). + t.Run("nil_Provisioner.Stop", func(t *testing.T) { + var p *Provisioner + if err := p.Stop(context.Background(), "any"); !errors.Is(err, ErrNoBackend) { + t.Errorf("nil *Provisioner.Stop: got %v, want ErrNoBackend", err) + } + }) + t.Run("nil_CPProvisioner.Stop", func(t *testing.T) { + var p *CPProvisioner + if err := p.Stop(context.Background(), "any"); !errors.Is(err, ErrNoBackend) { + t.Errorf("nil *CPProvisioner.Stop: got %v, want ErrNoBackend", err) + } + }) +} diff --git a/workspace-server/internal/provisioner/cp_provisioner.go b/workspace-server/internal/provisioner/cp_provisioner.go index 6f6ae58d..e9cfc8c9 100644 --- a/workspace-server/internal/provisioner/cp_provisioner.go +++ b/workspace-server/internal/provisioner/cp_provisioner.go @@ -194,6 +194,9 @@ func (p *CPProvisioner) Start(ctx context.Context, cfg WorkspaceConfig) (string, // blocking the next provision with InvalidGroup.Duplicate — a full // "Save & Restart" crash on SaaS. func (p *CPProvisioner) Stop(ctx context.Context, workspaceID string) error { + if p == nil { + return ErrNoBackend + } instanceID, err := resolveInstanceID(ctx, workspaceID) if err != nil { return fmt.Errorf("cp provisioner: stop: resolve instance_id: %w", err) @@ -201,9 +204,17 @@ func (p *CPProvisioner) Stop(ctx context.Context, workspaceID string) error { if instanceID == "" { // No instance was ever provisioned (or already deprovisioned and // the column was cleared). Nothing to terminate — idempotent. + // Reached even when httpClient is nil since the empty-instance + // path doesn't need HTTP — symmetric with IsRunning. log.Printf("CP provisioner: Stop for %s — no instance_id on file, nothing to do", workspaceID) return nil } + if p.httpClient == nil { + // HTTP wiring missing but we have an instance_id to terminate — + // can't make the DELETE call. Report ErrNoBackend so the + // orphan sweeper / shutdown path can branch. 
+ return ErrNoBackend + } url := fmt.Sprintf("%s/cp/workspaces/%s?instance_id=%s", p.baseURL, workspaceID, instanceID) req, _ := http.NewRequestWithContext(ctx, "DELETE", url, nil) p.provisionAuthHeaders(req) @@ -269,6 +280,9 @@ var resolveInstanceID = func(ctx context.Context, workspaceID string) (string, e // Both callers are happy with (true, err); callers that need the // previous (false, err) shape must inspect err themselves. func (p *CPProvisioner) IsRunning(ctx context.Context, workspaceID string) (bool, error) { + if p == nil { + return false, ErrNoBackend + } instanceID, err := resolveInstanceID(ctx, workspaceID) if err != nil { // Treat DB errors the same as transport errors — (true, err) keeps @@ -277,9 +291,20 @@ func (p *CPProvisioner) IsRunning(ctx context.Context, workspaceID string) (bool } if instanceID == "" { // No instance recorded. Report "not running" cleanly (no error) - // so restart cascades can trigger a fresh provision. + // so restart cascades can trigger a fresh provision. This path + // is reached even on a zero-valued provisioner (no httpClient + // wired) — that's intentional; the resolveInstanceID lookup + // goes through the package-level db var, not p.httpClient, so + // a no-instance workspace gets a clean answer regardless of + // HTTP wiring state. return false, nil } + if p.httpClient == nil { + // HTTP wiring missing but we have an instance_id to query — + // can't proceed without a client. Report ErrNoBackend so the + // caller can branch (a2a_proxy keeps alive, healthsweep skips). + return false, ErrNoBackend + } url := fmt.Sprintf("%s/cp/workspaces/%s/status?instance_id=%s", p.baseURL, workspaceID, instanceID) req, _ := http.NewRequestWithContext(ctx, "GET", url, nil) p.provisionAuthHeaders(req) diff --git a/workspace-server/internal/provisioner/provisioner.go b/workspace-server/internal/provisioner/provisioner.go index ac04b15f..a57a4cb6 100644 --- a/workspace-server/internal/provisioner/provisioner.go +++ b/workspace-server/internal/provisioner/provisioner.go @@ -5,6 +5,7 @@ import ( "archive/tar" "bytes" "context" + "errors" "fmt" "io" "log" @@ -24,6 +25,15 @@ import ( ocispec "github.com/opencontainers/image-spec/specs-go/v1" ) +// ErrNoBackend is returned by lifecycle methods (Stop, IsRunning) when +// the receiver is zero-valued — i.e. no Docker daemon connection has +// been wired up. The orphan sweeper and the contract-test scaffolding +// both speculatively call these methods on a possibly-nil receiver; +// returning a typed error rather than panicking lets callers reason +// about the case explicitly. See docs/architecture/backends.md +// drift-risk #6. +var ErrNoBackend = errors.New("provisioner: no backend configured (zero-valued receiver)") + // RuntimeImages maps runtime names to their Docker image refs on GHCR. // Each standalone template repo publishes its image via the reusable // publish-template-image workflow in molecule-ci on every main merge. @@ -823,6 +833,9 @@ func (p *Provisioner) RemoveVolume(ctx context.Context, workspaceID string) erro // respawn the container before ContainerRemove runs, leaving a zombie // that re-registers via heartbeat after deletion. 
func (p *Provisioner) Stop(ctx context.Context, workspaceID string) error { + if p == nil || p.cli == nil { + return ErrNoBackend + } name := ContainerName(workspaceID) // Force-remove kills and removes in one atomic operation, bypassing @@ -856,6 +869,9 @@ func (p *Provisioner) Stop(ctx context.Context, workspaceID string) error { // so a real crash still surfaces via exec heartbeat or TTL, both of which // have narrower false-positive windows than daemon-inspect RPC. func (p *Provisioner) IsRunning(ctx context.Context, workspaceID string) (bool, error) { + if p == nil || p.cli == nil { + return false, ErrNoBackend + } name := ContainerName(workspaceID) info, err := p.cli.ContainerInspect(ctx, name) if err != nil { From 28d7649c480b29488f6c98cae543758cd7aed310 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Sun, 26 Apr 2026 02:50:42 -0700 Subject: [PATCH 11/34] test(handlers): sqlmock coverage for tokens.go (closes #1819) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The existing tokens_test.go skips every test when db.DB is nil, so CI ran with 0% coverage on tokens.go's List/Create/Revoke. This file adds sqlmock-driven tests that exercise the SQL paths directly without needing a live Postgres, lifting coverage on all 4 functions to 100% and module-level handler coverage from 60.3% → 61.1%. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../internal/handlers/tokens_sqlmock_test.go | 331 ++++++++++++++++++ 1 file changed, 331 insertions(+) create mode 100644 workspace-server/internal/handlers/tokens_sqlmock_test.go diff --git a/workspace-server/internal/handlers/tokens_sqlmock_test.go b/workspace-server/internal/handlers/tokens_sqlmock_test.go new file mode 100644 index 00000000..b0166293 --- /dev/null +++ b/workspace-server/internal/handlers/tokens_sqlmock_test.go @@ -0,0 +1,331 @@ +package handlers + +// Sqlmock-backed coverage for tokens.go. Closes #1819. +// +// The existing tokens_test.go uses the real `db.DB` and t.Skip's when +// the test DB isn't reachable — which is the default in CI, so the +// file shows 0% coverage. This file substitutes the package-level +// `db.DB` with a sqlmock instance so every code path (List, Create, +// Revoke + their error branches) is exercised in `go test` without +// any external dependency. +// +// What's covered: +// List — happy path, empty rows, scan failure, query error +// Create — rate-limited, IssueToken DB error, happy path +// Revoke — happy path, not found, DB error +// +// What's NOT covered here (intentional): +// - Wsauth/middleware-level cross-tenant gating: those are exercised +// by middleware/wsauth_middleware_test.go. The handler-level code +// trusts WorkspaceAuth has already gated the request. +// - The full IssueToken path's correctness (random bytes, +// base64 encoding, prefix derivation): wsauth/tokens_test.go owns +// that. Here we only verify the handler hands off + reports +// errors correctly. + +import ( + "context" + "database/sql" + "database/sql/driver" + "encoding/json" + "errors" + "net/http" + "net/http/httptest" + "testing" + "time" + + "github.com/DATA-DOG/go-sqlmock" + "github.com/Molecule-AI/molecule-monorepo/platform/internal/db" + "github.com/gin-gonic/gin" +) + +func init() { gin.SetMode(gin.TestMode) } + +// withMockDB swaps `db.DB` for a sqlmock and returns the mock plus a +// restore func. Tests use this in place of setupTokenTestDB which +// skips on a missing real DB. 
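+//
+// Typical call shape in a scenario (mirrors every test below):
+//
+//   mock, cleanup := withMockDB(t)
+//   defer cleanup()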
+func withMockDB(t *testing.T) (sqlmock.Sqlmock, func()) { + t.Helper() + mock, m, err := sqlmock.New() + if err != nil { + t.Fatalf("sqlmock: %v", err) + } + prev := db.DB + db.DB = mock + cleanup := func() { + db.DB = prev + _ = mock.Close() + } + return m, cleanup +} + +// makeReq builds a recorder + Gin context with the given URL params, +// drives the handler, and returns the recorder. Centralised so each +// scenario is one-line setup + assertion. +func makeReq(t *testing.T, h gin.HandlerFunc, method, url string, params gin.Params) *httptest.ResponseRecorder { + t.Helper() + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Request = httptest.NewRequest(method, url, nil) + c.Params = params + h(c) + return w +} + +// ---- List ------------------------------------------------------------ + +func TestTokenHandler_List_HappyPath(t *testing.T) { + mock, cleanup := withMockDB(t) + defer cleanup() + + created := time.Date(2026, 4, 1, 12, 0, 0, 0, time.UTC) + last := created.Add(time.Hour) + mock.ExpectQuery(`SELECT id, prefix, created_at, last_used_at\s+FROM workspace_auth_tokens`). + WithArgs("ws-1", 50, 0). + WillReturnRows(sqlmock.NewRows([]string{"id", "prefix", "created_at", "last_used_at"}). + AddRow("tok-1", "abc12345", created, last). + AddRow("tok-2", "def67890", created, nil)) + + w := makeReq(t, NewTokenHandler().List, "GET", + "/workspaces/ws-1/tokens", gin.Params{{Key: "id", Value: "ws-1"}}) + + if w.Code != http.StatusOK { + t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String()) + } + var body struct { + Tokens []tokenListItem `json:"tokens"` + Count int `json:"count"` + } + if err := json.Unmarshal(w.Body.Bytes(), &body); err != nil { + t.Fatalf("decode: %v", err) + } + if body.Count != 2 || len(body.Tokens) != 2 { + t.Fatalf("count=%d tokens=%d, want 2/2", body.Count, len(body.Tokens)) + } + if body.Tokens[0].ID != "tok-1" || body.Tokens[1].ID != "tok-2" { + t.Errorf("wrong order: %+v", body.Tokens) + } + if body.Tokens[1].LastUsed != nil { + t.Errorf("token-2 should have nil last_used_at, got %v", body.Tokens[1].LastUsed) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet: %v", err) + } +} + +func TestTokenHandler_List_EmptyResult(t *testing.T) { + mock, cleanup := withMockDB(t) + defer cleanup() + + mock.ExpectQuery(`SELECT id, prefix, created_at, last_used_at`). + WillReturnRows(sqlmock.NewRows([]string{"id", "prefix", "created_at", "last_used_at"})) + + w := makeReq(t, NewTokenHandler().List, "GET", + "/workspaces/ws-2/tokens", gin.Params{{Key: "id", Value: "ws-2"}}) + + if w.Code != http.StatusOK { + t.Fatalf("expected 200 on empty list, got %d", w.Code) + } + var body struct { + Tokens []tokenListItem `json:"tokens"` + Count int `json:"count"` + } + _ = json.Unmarshal(w.Body.Bytes(), &body) + if body.Count != 0 || body.Tokens == nil { + // Tokens MUST be `[]` not `null` so callers iterating with + // `.length` or `for...of` don't NPE on JS. + t.Errorf("empty: count=%d, tokens=%v (want 0 + non-nil)", body.Count, body.Tokens) + } +} + +func TestTokenHandler_List_QueryError(t *testing.T) { + mock, cleanup := withMockDB(t) + defer cleanup() + + mock.ExpectQuery(`SELECT id, prefix, created_at, last_used_at`). 
+ WillReturnError(errors.New("connection refused")) + + w := makeReq(t, NewTokenHandler().List, "GET", + "/workspaces/ws-3/tokens", gin.Params{{Key: "id", Value: "ws-3"}}) + + if w.Code != http.StatusInternalServerError { + t.Errorf("query error must surface as 500, got %d", w.Code) + } +} + +func TestTokenHandler_List_RespectsLimit(t *testing.T) { + mock, cleanup := withMockDB(t) + defer cleanup() + + mock.ExpectQuery(`SELECT id, prefix, created_at, last_used_at`). + WithArgs("ws-1", 10, 5). + WillReturnRows(sqlmock.NewRows([]string{"id", "prefix", "created_at", "last_used_at"})) + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Request = httptest.NewRequest("GET", "/workspaces/ws-1/tokens?limit=10&offset=5", nil) + c.Params = gin.Params{{Key: "id", Value: "ws-1"}} + NewTokenHandler().List(c) + + if w.Code != http.StatusOK { + t.Errorf("limit/offset query: %d", w.Code) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("limit/offset args not bound correctly: %v", err) + } +} + +func TestTokenHandler_List_ScanError(t *testing.T) { + mock, cleanup := withMockDB(t) + defer cleanup() + + // Inject a bad row that fails to Scan: pass non-time value where + // created_at expects time.Time. sqlmock surfaces this as a Scan err. + mock.ExpectQuery(`SELECT id, prefix, created_at, last_used_at`). + WillReturnRows(sqlmock.NewRows([]string{"id", "prefix", "created_at", "last_used_at"}). + AddRow("tok-1", "abc", "not-a-timestamp", nil)) + + w := makeReq(t, NewTokenHandler().List, "GET", + "/workspaces/ws-1/tokens", gin.Params{{Key: "id", Value: "ws-1"}}) + + if w.Code != http.StatusInternalServerError { + t.Errorf("scan error must surface as 500, got %d: %s", w.Code, w.Body.String()) + } +} + +// ---- Create ---------------------------------------------------------- + +func TestTokenHandler_Create_RateLimited(t *testing.T) { + mock, cleanup := withMockDB(t) + defer cleanup() + + // Count query returns 50 (== max) → 429. + mock.ExpectQuery(`SELECT COUNT\(\*\) FROM workspace_auth_tokens`). + WithArgs("ws-1"). + WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(50)) + + w := makeReq(t, NewTokenHandler().Create, "POST", + "/workspaces/ws-1/tokens", gin.Params{{Key: "id", Value: "ws-1"}}) + + if w.Code != http.StatusTooManyRequests { + t.Errorf("max active tokens should 429, got %d", w.Code) + } +} + +func TestTokenHandler_Create_IssueFails(t *testing.T) { + mock, cleanup := withMockDB(t) + defer cleanup() + + // Count = 0 → fall through to IssueToken, which does an INSERT + // into workspace_auth_tokens. Mock the INSERT to fail; handler + // surfaces as 500. + mock.ExpectQuery(`SELECT COUNT\(\*\) FROM workspace_auth_tokens`). + WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(0)) + mock.ExpectExec(`INSERT INTO workspace_auth_tokens`). + WillReturnError(errors.New("disk full")) + + w := makeReq(t, NewTokenHandler().Create, "POST", + "/workspaces/ws-1/tokens", gin.Params{{Key: "id", Value: "ws-1"}}) + + if w.Code != http.StatusInternalServerError { + t.Errorf("IssueToken DB error must 500, got %d", w.Code) + } +} + +func TestTokenHandler_Create_HappyPath(t *testing.T) { + mock, cleanup := withMockDB(t) + defer cleanup() + + mock.ExpectQuery(`SELECT COUNT\(\*\) FROM workspace_auth_tokens`). + WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(0)) + mock.ExpectExec(`INSERT INTO workspace_auth_tokens`). 
+ WillReturnResult(sqlmock.NewResult(1, 1)) + + w := makeReq(t, NewTokenHandler().Create, "POST", + "/workspaces/ws-1/tokens", gin.Params{{Key: "id", Value: "ws-1"}}) + + if w.Code != http.StatusCreated { + t.Fatalf("expected 201, got %d: %s", w.Code, w.Body.String()) + } + var body struct { + AuthToken string `json:"auth_token"` + WorkspaceID string `json:"workspace_id"` + } + if err := json.Unmarshal(w.Body.Bytes(), &body); err != nil { + t.Fatalf("decode: %v", err) + } + if body.AuthToken == "" { + t.Errorf("auth_token must be present and non-empty in response") + } + if body.WorkspaceID != "ws-1" { + t.Errorf("workspace_id mismatch: %q", body.WorkspaceID) + } +} + +// ---- Revoke ---------------------------------------------------------- + +func TestTokenHandler_Revoke_HappyPath(t *testing.T) { + mock, cleanup := withMockDB(t) + defer cleanup() + + mock.ExpectExec(`UPDATE workspace_auth_tokens\s+SET revoked_at = now\(\)`). + WithArgs("tok-1", "ws-1"). + WillReturnResult(sqlmock.NewResult(0, 1)) + + w := makeReq(t, NewTokenHandler().Revoke, "DELETE", + "/workspaces/ws-1/tokens/tok-1", gin.Params{ + {Key: "id", Value: "ws-1"}, + {Key: "tokenId", Value: "tok-1"}, + }) + + if w.Code != http.StatusOK { + t.Fatalf("expected 200 on revoke, got %d: %s", w.Code, w.Body.String()) + } +} + +func TestTokenHandler_Revoke_NotFound(t *testing.T) { + mock, cleanup := withMockDB(t) + defer cleanup() + + // 0 rows affected → token not found OR already revoked. + mock.ExpectExec(`UPDATE workspace_auth_tokens`). + WithArgs("tok-ghost", "ws-1"). + WillReturnResult(sqlmock.NewResult(0, 0)) + + w := makeReq(t, NewTokenHandler().Revoke, "DELETE", + "/workspaces/ws-1/tokens/tok-ghost", gin.Params{ + {Key: "id", Value: "ws-1"}, + {Key: "tokenId", Value: "tok-ghost"}, + }) + + if w.Code != http.StatusNotFound { + t.Errorf("revoke missing token must 404, got %d", w.Code) + } +} + +func TestTokenHandler_Revoke_DBError(t *testing.T) { + mock, cleanup := withMockDB(t) + defer cleanup() + + mock.ExpectExec(`UPDATE workspace_auth_tokens`). + WillReturnError(errors.New("conn lost")) + + w := makeReq(t, NewTokenHandler().Revoke, "DELETE", + "/workspaces/ws-1/tokens/tok-1", gin.Params{ + {Key: "id", Value: "ws-1"}, + {Key: "tokenId", Value: "tok-1"}, + }) + + if w.Code != http.StatusInternalServerError { + t.Errorf("DB error must 500, got %d", w.Code) + } +} + +// Compile-time noise removal: the imports list pulls in the sql / +// driver packages and the silenced ctx so a future scenario that +// needs them doesn't have to re-add the import. Documented here so +// the apparent "unused import" dance isn't surprising. +var ( + _ context.Context = context.Background() + _ driver.Value = (sql.RawBytes)(nil) +) From 3c18b76aa7639f198c8b89d24d143c75f3eeb929 Mon Sep 17 00:00:00 2001 From: rabbitblood Date: Sun, 26 Apr 2026 04:16:43 -0700 Subject: [PATCH 12/34] ops(cf): hourly sweep workflow for orphan Cloudflare DNS records (#239) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes Molecule-AI/molecule-controlplane#239. CF zone hit the 200-record quota 2026-04-23+ — every E2E and canary left a record on moleculesai.app, and no scheduled job pruned them. Provisions started failing with code 81045 ('Record quota exceeded'). The sweep-cf-orphans.sh script (PR #1978, with decision-function unit tests added in #2079) already exists but no workflow fires it. 
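Once the workflow below is wired up, an ad-hoc verification run looks
roughly like this (assuming the GitHub CLI; the input names are the ones
the workflow defines):

    gh workflow run sweep-cf-orphans.yml -f dry_run=true -f max_delete_pct=50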
Adding it here as a parallel janitor to sweep-stale-e2e-orgs.yml: - hourly schedule at :15 (offset from the e2e-orgs sweep at :00 so the two converge cleanly without racing the same CP admin endpoint) - workflow_dispatch with dry_run input default true (ad-hoc verify without committing to deletes) - workflow_dispatch with max_delete_pct input for major cleanups (the script's own MAX_DELETE_PCT defaults to 50% as a safety gate) - concurrency group prevents schedule + manual-dispatch from racing the same zone Why a separate workflow vs sweep-stale-e2e-orgs.yml: - That workflow drives DELETE /cp/admin/tenants/:slug, assumes CP has the org row. Doesn't catch records left when CP itself never knew about the tenant (canary scratch, manual ops experiments) or when the CP-side cascade's CF-delete branch failed. - sweep-cf-orphans.sh enumerates the CF zone directly + matches against live CP slugs + AWS EC2 names. Catches what the CP-driven sweep can't. Required secrets (will need to be set on the repo): CF_API_TOKEN, CF_ZONE_ID, CP_PROD_ADMIN_TOKEN, CP_STAGING_ADMIN_TOKEN, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY. Pre-flight verify-secrets step fails loud if any are missing. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/sweep-cf-orphans.yml | 103 +++++++++++++++++++++++++ 1 file changed, 103 insertions(+) create mode 100644 .github/workflows/sweep-cf-orphans.yml diff --git a/.github/workflows/sweep-cf-orphans.yml b/.github/workflows/sweep-cf-orphans.yml new file mode 100644 index 00000000..c20c649b --- /dev/null +++ b/.github/workflows/sweep-cf-orphans.yml @@ -0,0 +1,103 @@ +name: Sweep stale Cloudflare DNS records + +# Janitor for Cloudflare DNS records whose backing tenant/workspace no +# longer exists. Without this loop, every short-lived E2E or canary +# leaves a CF record on the moleculesai.app zone — the zone has a +# 200-record quota (controlplane#239 hit it 2026-04-23+) and provisions +# start failing with code 81045 once exhausted. +# +# Why a separate workflow vs sweep-stale-e2e-orgs.yml: +# - That workflow operates at the CP layer (DELETE /cp/admin/tenants/:slug +# drives the cascade). It assumes CP has the org row to drive the +# deprovision from. It doesn't catch records left behind when CP +# itself never knew about the tenant (canary scratch, manual ops +# experiments) or when the cascade's CF-delete branch failed. +# - sweep-cf-orphans.sh enumerates the CF zone directly and matches +# each record against live CP slugs + AWS EC2 names. It catches +# leaks the CP-driven sweep can't. +# +# Safety: the script's own MAX_DELETE_PCT gate refuses to nuke more +# than 50% of records in a single run. If something has gone weird +# (CP admin endpoint returns no orgs → every tenant looks orphan) the +# gate halts before damage. Decision-function unit tests in +# scripts/ops/test_sweep_cf_decide.py (#2027) cover the rule +# classifier. + +on: + schedule: + # Hourly. Mirrors sweep-stale-e2e-orgs cadence so the two janitors + # converge on the same tick. CF API rate budget is generous (1200 + # req/5min); a single sweep makes ~1 list + N deletes (N<=quota/2). + - cron: '15 * * * *' # offset from sweep-stale-e2e-orgs (top of hour) + workflow_dispatch: + inputs: + dry_run: + description: "Dry run only — list what would be deleted, no deletion" + required: false + type: boolean + default: true + max_delete_pct: + description: "Override safety gate (default 50, set higher only for major cleanup)" + required: false + default: "50" + +# Don't let two sweeps race the same zone. 
workflow_dispatch during a +# scheduled run would otherwise issue duplicate DELETE calls. +concurrency: + group: sweep-cf-orphans + cancel-in-progress: false + +permissions: + contents: read + +jobs: + sweep: + name: Sweep CF orphans + runs-on: ubuntu-latest + timeout-minutes: 10 + env: + CF_API_TOKEN: ${{ secrets.CF_API_TOKEN }} + CF_ZONE_ID: ${{ secrets.CF_ZONE_ID }} + CP_PROD_ADMIN_TOKEN: ${{ secrets.CP_PROD_ADMIN_TOKEN }} + CP_STAGING_ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_TOKEN }} + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + AWS_DEFAULT_REGION: us-east-2 + MAX_DELETE_PCT: ${{ github.event.inputs.max_delete_pct || '50' }} + + steps: + - uses: actions/checkout@v4 + + - name: Verify required secrets present + # Fail fast and loud if a secret is unset — sweep-cf-orphans.sh + # also checks via `need`, but we want a single distinct error + # in the workflow log instead of script-level multi-line noise. + run: | + missing=() + for var in CF_API_TOKEN CF_ZONE_ID CP_PROD_ADMIN_TOKEN CP_STAGING_ADMIN_TOKEN AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY; do + if [ -z "${!var:-}" ]; then + missing+=("$var") + fi + done + if [ ${#missing[@]} -gt 0 ]; then + echo "::error::missing required secret(s): ${missing[*]}" + exit 2 + fi + echo "All required secrets present ✓" + + - name: Install AWS CLI + # The script shells out to `aws ec2 describe-instances`; the + # ubuntu-latest runner has aws v2 preinstalled but we re-check + # to surface a clear error if a future runner image drops it. + run: aws --version + + - name: Run sweep + run: | + set -euo pipefail + if [ "${{ github.event.inputs.dry_run || 'false' }}" = "true" ]; then + echo "Running in dry-run mode — no deletions" + bash scripts/ops/sweep-cf-orphans.sh + else + echo "Running with --execute — will delete identified orphans" + bash scripts/ops/sweep-cf-orphans.sh --execute + fi From 0ae6b201b4f2241d8e5a2a2eaeaa1171522275b5 Mon Sep 17 00:00:00 2001 From: rabbitblood Date: Sun, 26 Apr 2026 04:18:24 -0700 Subject: [PATCH 13/34] refactor(ci): apply simplify findings on PR #2088 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Drop redundant 'aws --version' step. Script's own 'aws ec2 describe-instances' fails just as loud with a more actionable error; the pre-check added ~1s with no signal value. - timeout-minutes 10 → 3. Realistic worst case is ~2min (4 curls + 1 aws + N×CF-DELETE each individually capped at 10s by the script's curl -m flag). 3 surfaces hangs within one cron tick instead of burning the full interval. - Document the schedule-vs-dispatch dry-run asymmetry inline so the next reader doesn't need to trace input defaults. - Add merge_group: types: [checks_requested] for queue parity with runtime-pin-compat.yml — cheap insurance if this ever becomes a required check. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/sweep-cf-orphans.yml | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/.github/workflows/sweep-cf-orphans.yml b/.github/workflows/sweep-cf-orphans.yml index c20c649b..0e825256 100644 --- a/.github/workflows/sweep-cf-orphans.yml +++ b/.github/workflows/sweep-cf-orphans.yml @@ -40,6 +40,10 @@ on: description: "Override safety gate (default 50, set higher only for major cleanup)" required: false default: "50" + # Required-check support: scheduled-only today, but include merge_group + # so a future branch-protection wire-in doesn't need a workflow edit. 
+ merge_group: + types: [checks_requested] # Don't let two sweeps race the same zone. workflow_dispatch during a # scheduled run would otherwise issue duplicate DELETE calls. @@ -54,7 +58,11 @@ jobs: sweep: name: Sweep CF orphans runs-on: ubuntu-latest - timeout-minutes: 10 + # 3 min surfaces hangs (CF API stall, AWS describe-instances stuck) + # within one cron interval instead of burning a full tick. Realistic + # worst case is ~2 min: 4 sequential curls + 1 aws + N×CF-DELETE + # each individually capped at 10s by the script's curl -m flag. + timeout-minutes: 3 env: CF_API_TOKEN: ${{ secrets.CF_API_TOKEN }} CF_ZONE_ID: ${{ secrets.CF_ZONE_ID }} @@ -85,13 +93,16 @@ jobs: fi echo "All required secrets present ✓" - - name: Install AWS CLI - # The script shells out to `aws ec2 describe-instances`; the - # ubuntu-latest runner has aws v2 preinstalled but we re-check - # to surface a clear error if a future runner image drops it. - run: aws --version - - name: Run sweep + # Schedule-vs-dispatch dry-run asymmetry (intentional): + # - Scheduled runs: github.event.inputs.dry_run is empty → + # defaults to "false" below → script runs with --execute + # (the whole point of an hourly janitor). + # - Manual workflow_dispatch: input default is true (line 38) + # so an ad-hoc operator-triggered run is dry-run by default; + # they have to flip the toggle to actually delete. + # The script's MAX_DELETE_PCT gate (default 50%) is the second + # line of defense regardless of mode. run: | set -euo pipefail if [ "${{ github.event.inputs.dry_run || 'false' }}" = "true" ]; then From eb42f7d14578fee9fd2f757b9d73fc31e3be8037 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Sun, 26 Apr 2026 04:23:24 -0700 Subject: [PATCH 14/34] test(middleware): branch coverage for CanvasOrBearer + IsSameOriginCanvas (closes #1818) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per the 2026-04-23 audit, wsauth_middleware.go had two coverage holes on auth-boundary code: CanvasOrBearer 50.0% (only fail-open + Origin paths covered) IsSameOriginCanvas 0.0% (exported wrapper never exercised) This adds focused tests for the missing branches: CanvasOrBearer: - ValidBearer_Passes (path-1 success) - InvalidBearer_Returns401 (auth-escape regression: bad bearer + matching Origin must NOT fall through to Origin) - AdminTokenEnv_Passes (ADMIN_TOKEN constant-time match) - DBError_FailOpen (documented fail-open behavior) - SameOriginCanvas_Passes (path-3 combined-tenant image) IsSameOriginCanvas / isSameOriginCanvas: - ExportedWrapper_DelegatesToInternal - DisabledByEnv (CANVAS_PROXY_URL unset short-circuit) - BranchCoverage (table-driven: 11 host/referer/origin cases incl. the h.example.com.evil.com suffix-attack rejection) Coverage moves CanvasOrBearer 50% → 100%, IsSameOriginCanvas 0% → 100%, and middleware-package overall 81.6% → 86.0%. No production code change. 
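Local check, roughly (run from the workspace-server module; the -run regex
only narrows to the new tests):

    go test ./internal/middleware/ -run 'CanvasOrBearer|IsSameOriginCanvas' -cover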
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../wsauth_middleware_canvasorbearer_test.go | 296 ++++++++++++++++++ 1 file changed, 296 insertions(+) create mode 100644 workspace-server/internal/middleware/wsauth_middleware_canvasorbearer_test.go diff --git a/workspace-server/internal/middleware/wsauth_middleware_canvasorbearer_test.go b/workspace-server/internal/middleware/wsauth_middleware_canvasorbearer_test.go new file mode 100644 index 00000000..75ee87cb --- /dev/null +++ b/workspace-server/internal/middleware/wsauth_middleware_canvasorbearer_test.go @@ -0,0 +1,296 @@ +package middleware + +// Coverage tests for the CanvasOrBearer middleware and IsSameOriginCanvas +// per issue #1818. The existing wsauth_middleware_test.go covered the +// fail-open and Origin paths but missed the bearer-validate, admin-secret, +// same-origin-canvas, and IsSameOriginCanvas wrapper branches — leaving +// CanvasOrBearer at 50% and IsSameOriginCanvas at 0% line coverage. +// +// These tests target the gaps without re-asserting behavior already +// pinned by the existing suite. + +import ( + "crypto/sha256" + "net/http" + "net/http/httptest" + "testing" + + "github.com/DATA-DOG/go-sqlmock" + "github.com/gin-gonic/gin" +) + +// ── CanvasOrBearer: bearer-token branches (#1818) ─────────────────────────── + +// TestCanvasOrBearer_ValidBearer_Passes exercises path 1 (the success +// branch of the bearer-token validation block). Without this test the +// only "OK" path covered was the Origin allowlist match — bearer +// success was never asserted. +func TestCanvasOrBearer_ValidBearer_Passes(t *testing.T) { + mockDB, mock, err := sqlmock.New() + if err != nil { + t.Fatalf("sqlmock: %v", err) + } + defer mockDB.Close() + + bearer := "valid-bearer-for-canvas-or-bearer" + hash := sha256.Sum256([]byte(bearer)) + + mock.ExpectQuery(hasAnyLiveTokenGlobalQuery). + WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(1)) + mock.ExpectQuery(validateAnyTokenSelectQuery). + WithArgs(hash[:]). + WillReturnRows(sqlmock.NewRows([]string{"id"}).AddRow("tok-1")) + mock.ExpectExec(validateTokenUpdateQuery). + WithArgs("tok-1"). + WillReturnResult(sqlmock.NewResult(0, 1)) + + r := gin.New() + r.PUT("/canvas/viewport", CanvasOrBearer(mockDB), func(c *gin.Context) { + c.JSON(http.StatusOK, gin.H{"ok": true}) + }) + + w := httptest.NewRecorder() + req, _ := http.NewRequest(http.MethodPut, "/canvas/viewport", nil) + req.Header.Set("Authorization", "Bearer "+bearer) + r.ServeHTTP(w, req) + + if w.Code != http.StatusOK { + t.Errorf("valid bearer: got %d, want 200 (%s)", w.Code, w.Body.String()) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet sqlmock: %v", err) + } +} + +// TestCanvasOrBearer_InvalidBearer_Returns401 covers the rejection +// branch when a bearer is supplied but ValidateAnyToken fails. This +// is the auth-escape case the issue called out: an attacker with a +// revoked/expired token + matching Origin previously could bypass +// auth, so the code MUST reject on bad bearer instead of falling +// through to Origin. +func TestCanvasOrBearer_InvalidBearer_Returns401(t *testing.T) { + mockDB, mock, err := sqlmock.New() + if err != nil { + t.Fatalf("sqlmock: %v", err) + } + defer mockDB.Close() + + bad := "expired-or-revoked-token" + hash := sha256.Sum256([]byte(bad)) + + mock.ExpectQuery(hasAnyLiveTokenGlobalQuery). + WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(1)) + mock.ExpectQuery(validateAnyTokenSelectQuery). + WithArgs(hash[:]). 
+ WillReturnRows(sqlmock.NewRows([]string{"id"})) // empty → ErrNoRows + + // Origin would otherwise grant access — assert it doesn't here. + t.Setenv("CORS_ORIGINS", "https://acme.moleculesai.app") + + handlerCalled := false + r := gin.New() + r.PUT("/canvas/viewport", CanvasOrBearer(mockDB), func(c *gin.Context) { + handlerCalled = true + c.JSON(http.StatusOK, gin.H{"ok": true}) + }) + + w := httptest.NewRecorder() + req, _ := http.NewRequest(http.MethodPut, "/canvas/viewport", nil) + req.Header.Set("Authorization", "Bearer "+bad) + req.Header.Set("Origin", "https://acme.moleculesai.app") + r.ServeHTTP(w, req) + + if w.Code != http.StatusUnauthorized { + t.Errorf("invalid bearer with matching origin: got %d, want 401", w.Code) + } + if handlerCalled { + t.Error("invalid bearer leaked to handler — Origin fallback bypassed bearer validation") + } +} + +// TestCanvasOrBearer_AdminTokenEnv_Passes exercises the ADMIN_TOKEN +// constant-time-compare branch. The env-secret short-circuit is what +// keeps the canvas dashboard usable when no DB-backed token rows are +// provisioned yet (Hyperion bootstrap path). +func TestCanvasOrBearer_AdminTokenEnv_Passes(t *testing.T) { + mockDB, mock, err := sqlmock.New() + if err != nil { + t.Fatalf("sqlmock: %v", err) + } + defer mockDB.Close() + + mock.ExpectQuery(hasAnyLiveTokenGlobalQuery). + WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(1)) + // No ValidateAnyToken expectation — env match short-circuits before that. + + t.Setenv("ADMIN_TOKEN", "platform-admin-secret") + + r := gin.New() + r.PUT("/canvas/viewport", CanvasOrBearer(mockDB), func(c *gin.Context) { + c.JSON(http.StatusOK, gin.H{"ok": true}) + }) + + w := httptest.NewRecorder() + req, _ := http.NewRequest(http.MethodPut, "/canvas/viewport", nil) + req.Header.Set("Authorization", "Bearer platform-admin-secret") + r.ServeHTTP(w, req) + + if w.Code != http.StatusOK { + t.Errorf("admin env match: got %d, want 200 (%s)", w.Code, w.Body.String()) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet sqlmock: %v", err) + } +} + +// TestCanvasOrBearer_DBError_FailOpen pins the documented behavior on a +// HasAnyLiveTokenGlobal failure. The middleware logs and falls open so a +// flaky DB doesn't lock canvas users out of cosmetic routes. Hardcoded in +// the comment block; this is a reminder if anyone changes that semantic. +func TestCanvasOrBearer_DBError_FailOpen(t *testing.T) { + mockDB, mock, err := sqlmock.New() + if err != nil { + t.Fatalf("sqlmock: %v", err) + } + defer mockDB.Close() + mock.ExpectQuery(hasAnyLiveTokenGlobalQuery). + WillReturnError(http.ErrAbortHandler) // any non-nil error suffices + + r := gin.New() + r.PUT("/canvas/viewport", CanvasOrBearer(mockDB), func(c *gin.Context) { + c.JSON(http.StatusOK, gin.H{"ok": true}) + }) + + w := httptest.NewRecorder() + req, _ := http.NewRequest(http.MethodPut, "/canvas/viewport", nil) + r.ServeHTTP(w, req) + + if w.Code != http.StatusOK { + t.Errorf("DB error fail-open: got %d, want 200 (%s)", w.Code, w.Body.String()) + } +} + +// TestCanvasOrBearer_SameOriginCanvas_Passes exercises path 3: when no +// CORS Origin matches but the Referer/Host pair indicates the canvas is +// served from the same combined-tenant image. CANVAS_PROXY_URL gates +// this branch — without flipping canvasProxyActive the path is dead. 
+func TestCanvasOrBearer_SameOriginCanvas_Passes(t *testing.T) { + prev := canvasProxyActive + canvasProxyActive = true + defer func() { canvasProxyActive = prev }() + + mockDB, mock, err := sqlmock.New() + if err != nil { + t.Fatalf("sqlmock: %v", err) + } + defer mockDB.Close() + mock.ExpectQuery(hasAnyLiveTokenGlobalQuery). + WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(1)) + + r := gin.New() + r.PUT("/canvas/viewport", CanvasOrBearer(mockDB), func(c *gin.Context) { + c.JSON(http.StatusOK, gin.H{"ok": true}) + }) + + w := httptest.NewRecorder() + req, _ := http.NewRequest(http.MethodPut, "/canvas/viewport", nil) + req.Host = "tenant.moleculesai.app" + req.Header.Set("Referer", "https://tenant.moleculesai.app/dashboard") + r.ServeHTTP(w, req) + + if w.Code != http.StatusOK { + t.Errorf("same-origin canvas: got %d, want 200 (%s)", w.Code, w.Body.String()) + } +} + +// ── IsSameOriginCanvas: exported wrapper + branch coverage (#1818) ────────── + +// TestIsSameOriginCanvas_ExportedWrapper_DelegatesToInternal — IsSame... +// is at 0% per the audit because nothing in the test suite calls the +// exported variant. It's a one-line delegation but the auth-boundary +// surface must stay testable from outside the package. +func TestIsSameOriginCanvas_ExportedWrapper_DelegatesToInternal(t *testing.T) { + prev := canvasProxyActive + canvasProxyActive = true + defer func() { canvasProxyActive = prev }() + + c, _ := gin.CreateTestContext(httptest.NewRecorder()) + c.Request = httptest.NewRequest(http.MethodGet, "/x", nil) + c.Request.Host = "tenant.moleculesai.app" + c.Request.Header.Set("Referer", "https://tenant.moleculesai.app/x") + + if !IsSameOriginCanvas(c) { + t.Error("exported IsSameOriginCanvas should accept matching Referer") + } +} + +// TestIsSameOriginCanvas_DisabledByEnv — when CANVAS_PROXY_URL was unset +// at boot, canvasProxyActive is false and the wrapper must return false +// even on a perfect Referer match. Self-hosted / dev installs rely on +// this short-circuit. +func TestIsSameOriginCanvas_DisabledByEnv(t *testing.T) { + prev := canvasProxyActive + canvasProxyActive = false + defer func() { canvasProxyActive = prev }() + + c, _ := gin.CreateTestContext(httptest.NewRecorder()) + c.Request = httptest.NewRequest(http.MethodGet, "/x", nil) + c.Request.Host = "tenant.moleculesai.app" + c.Request.Header.Set("Referer", "https://tenant.moleculesai.app/x") + + if IsSameOriginCanvas(c) { + t.Error("CANVAS_PROXY_URL unset → IsSameOriginCanvas must return false") + } +} + +// TestIsSameOriginCanvas_BranchCoverage exercises every Referer/Origin +// combination isSameOriginCanvas accepts or rejects. Table-driven so +// adding a new edge case is one row. 
+func TestIsSameOriginCanvas_BranchCoverage(t *testing.T) { + prev := canvasProxyActive + canvasProxyActive = true + defer func() { canvasProxyActive = prev }() + + cases := []struct { + name string + host string + referer string + origin string + want bool + }{ + // Referer accepts: + {"https referer matches host with path", "h.example.com", "https://h.example.com/dash", "", true}, + {"http referer matches host with path", "h.example.com", "http://h.example.com/dash", "", true}, + {"https referer matches host root no path", "h.example.com", "https://h.example.com", "", true}, + {"http referer matches host root no path", "h.example.com", "http://h.example.com", "", true}, + + // Origin fallback (no Referer, used by WS upgrade): + {"https origin only (no referer)", "h.example.com", "", "https://h.example.com", true}, + {"http origin only (no referer)", "h.example.com", "", "http://h.example.com", true}, + + // Reject paths — the security-critical ones: + {"empty host short-circuits", "", "https://h.example.com/", "", false}, + {"referer host suffix attack — h.example.com.evil.com", "h.example.com", "https://h.example.com.evil.com/", "", false}, + {"referer different host", "h.example.com", "https://other.example.com/", "", false}, + {"origin different host", "h.example.com", "", "https://other.example.com", false}, + {"no referer no origin", "h.example.com", "", "", false}, + } + + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + ctx, _ := gin.CreateTestContext(httptest.NewRecorder()) + ctx.Request = httptest.NewRequest(http.MethodGet, "/x", nil) + ctx.Request.Host = c.host + if c.referer != "" { + ctx.Request.Header.Set("Referer", c.referer) + } + if c.origin != "" { + ctx.Request.Header.Set("Origin", c.origin) + } + if got := isSameOriginCanvas(ctx); got != c.want { + t.Errorf("isSameOriginCanvas(host=%q, referer=%q, origin=%q) = %v, want %v", + c.host, c.referer, c.origin, got, c.want) + } + }) + } +} From 76d0f8d004c81d3542a51147041e61a392aecdeb Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Sun, 26 Apr 2026 05:23:00 -0700 Subject: [PATCH 15/34] fix(a2a): document the metadata-attach except-pass in a2a_executor (closes #1787) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GitHub Code Quality bot flagged the empty `except (AttributeError, TypeError): pass` at workspace/a2a_executor.py:424 as a nit on PR #1783. The suppression IS intentional — `new_agent_text_message()` returns a plain string in MagicMock paths in tests where assignment to `.metadata` raises despite hasattr being true. This: - Adds a why-comment citing the test-mock motivation, commit dcbcf19 (the original guard), and issue #1787 so the next code-quality pass doesn't re-flag it. - Adds `logger.debug("metadata attach skipped (non-Message ...")` for observability — debug-level so production logs stay quiet but ops can flip the level if metadata loss is ever suspected. Behavior unchanged. 43 existing a2a_executor tests still pass. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- workspace/a2a_executor.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/workspace/a2a_executor.py b/workspace/a2a_executor.py index b550a350..69c56398 100644 --- a/workspace/a2a_executor.py +++ b/workspace/a2a_executor.py @@ -422,7 +422,14 @@ class LangGraphA2AExecutor(AgentExecutor): try: msg.metadata = {"tool_trace": tool_trace} except (AttributeError, TypeError): - pass + # `new_agent_text_message()` returns a plain string in + # MagicMock paths in tests, where assignment to + # .metadata raises despite hasattr being true (the + # mock has the attribute as a property). Suppression + # is intentional — production Message objects always + # accept the assignment. See #1787 + commit dcbcf19 + # for the original test-mock motivation. + logger.debug("metadata attach skipped (non-Message return from new_agent_text_message)") await event_queue.enqueue_event(msg) _result = final_text From 1a273f21f5894d1cc04a5252e5c289167d6752e1 Mon Sep 17 00:00:00 2001 From: rabbitblood Date: Sun, 26 Apr 2026 06:02:56 -0700 Subject: [PATCH 16/34] feat(canvas): per-workspace provision_timeout_ms override (#2054) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 1 of moving runtime UX knobs server-side. Builds the canvas foundation: a workspace can carry its own provision_timeout_ms (sourced server-side from a template manifest in a follow-up PR), and ProvisioningTimeout's resolver respects it per-node. Today the resolver had Props-level timeoutMs that applied to ALL nodes — fine for tests but wrong for production where one batch could mix runtimes (hermes 12-min cold boot alongside docker 2-min). The runtime profile fallback already handles per-runtime defaults; this PR adds the per-WORKSPACE override layer above that. Resolution priority (most specific wins): 1. node.provisionTimeoutMs — server-declared per-workspace override (this PR's new field) 2. timeoutMs prop — single-threshold test override 3. runtime profile in @/lib/runtimeProfiles 4. DEFAULT_RUNTIME_PROFILE Changes: - WorkspaceData (socket): add optional provision_timeout_ms - WorkspaceNodeData: add optional provisionTimeoutMs - canvas-topology hydrate: thread the field through to node.data - ProvisioningTimeout: extend the serialized-string node iteration to carry provisionTimeoutMs (4-field positional split); pass as the second arg to provisionTimeoutForRuntime - 3 new tests in ProvisioningTimeout.test.tsx covering hydrate threading, null fall-through, and resolver priority Phase 2 (separate PR, blocked on workspace-server template-config loader): workspace-server reads provision_timeout_seconds from template config.yaml at provision time, includes provision_timeout_ms in the workspace API/socket response. Phase 3 (template-repo PR): template-hermes config.yaml declares provision_timeout_seconds: 720; canvas RUNTIME_PROFILES.hermes becomes redundant and can be removed. 19/19 tests pass (3 new + 16 existing). 
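As a reviewer reference, a minimal sketch of the resolution order above.
The shipped resolver lives in @/lib/runtimeProfiles and is called as
provisionTimeoutForRuntime(runtime, { provisionTimeoutMs }); the helper
name, the flattened signature, and the profile values below are
illustrative only, not the real implementation:

    // Sketch — shows the priority, not the real @/lib/runtimeProfiles body.
    type RuntimeProfile = { provisionTimeoutMs: number };
    const DEFAULT_RUNTIME_PROFILE: RuntimeProfile = { provisionTimeoutMs: 120_000 }; // assumed value
    const RUNTIME_PROFILES: Record<string, RuntimeProfile> = {
      hermes: { provisionTimeoutMs: 720_000 }, // ~12-min cold boot per this PR
    };

    function resolveProvisionTimeoutMs(
      runtime: string,
      nodeOverrideMs?: number, // 1. server-declared per-workspace override
      timeoutMsProp?: number,  // 2. single-threshold test prop
    ): number {
      if (nodeOverrideMs != null) return nodeOverrideMs;
      if (timeoutMsProp != null) return timeoutMsProp;
      const profile = RUNTIME_PROFILES[runtime];                      // 3. runtime profile
      return (profile ?? DEFAULT_RUNTIME_PROFILE).provisionTimeoutMs; // 4. default
    }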
Co-Authored-By: Claude Opus 4.7 (1M context) --- canvas/src/components/ProvisioningTimeout.tsx | 27 +++++++-- .../__tests__/ProvisioningTimeout.test.tsx | 55 +++++++++++++++++++ canvas/src/store/canvas-topology.ts | 3 + canvas/src/store/canvas.ts | 6 ++ canvas/src/store/socket.ts | 7 +++ 5 files changed, 94 insertions(+), 4 deletions(-) diff --git a/canvas/src/components/ProvisioningTimeout.tsx b/canvas/src/components/ProvisioningTimeout.tsx index 1c09fa3b..39d8a1fc 100644 --- a/canvas/src/components/ProvisioningTimeout.tsx +++ b/canvas/src/components/ProvisioningTimeout.tsx @@ -71,15 +71,19 @@ export function ProvisioningTimeout({ // Runtime included so the timeout threshold can be resolved per-node // (hermes cold-boot legitimately takes 8-13 min vs 30-90s for docker // runtimes — a single threshold would false-alarm on one or the other). + // provisionTimeoutMs added by #2054 — server-declared per-workspace + // override that wins over the runtime profile when present. // Separator: `|` between fields, `,` between nodes. Names may contain // anything the user typed; strip `|` and `,` so serialization round-trips. + // Empty-string sentinels for missing values so split/index stays positional. const provisioningNodes = useCanvasStore((s) => { const result = s.nodes .filter((n) => n.data.status === "provisioning") .map((n) => { const safeName = (n.data.name ?? "").replace(/[|,]/g, " "); const runtime = n.data.runtime ?? ""; - return `${n.id}|${safeName}|${runtime}`; + const provisionTimeoutMs = n.data.provisionTimeoutMs ?? ""; + return `${n.id}|${safeName}|${runtime}|${provisionTimeoutMs}`; }); return result.join(","); }); @@ -87,8 +91,14 @@ export function ProvisioningTimeout({ () => provisioningNodes ? provisioningNodes.split(",").map((entry) => { - const [id, name, runtime] = entry.split("|"); - return { id, name, runtime }; + const [id, name, runtime, provisionTimeoutMs] = entry.split("|"); + const ptms = provisionTimeoutMs ? Number(provisionTimeoutMs) : undefined; + return { + id, + name, + runtime, + provisionTimeoutMs: Number.isFinite(ptms) ? ptms : undefined, + }; }) : [], [provisioningNodes], @@ -138,10 +148,19 @@ export function ProvisioningTimeout({ // default), then scales by concurrent-provisioning count. A // hermes workspace in a batch alongside two langgraph workspaces // gets hermes's 12-min base, not langgraph's 2-min base. + // + // Resolution priority (most specific wins): + // 1. node.provisionTimeoutMs — server-declared per-workspace + // override (#2054, sourced from template manifest) + // 2. timeoutMs prop — single-threshold test override + // 3. runtime profile in @/lib/runtimeProfiles + // 4. DEFAULT_RUNTIME_PROFILE for (const node of parsedProvisioningNodes) { const startedAt = tracking.get(node.id); if (!startedAt) continue; - const base = timeoutMs ?? provisionTimeoutForRuntime(node.runtime); + const base = provisionTimeoutForRuntime(node.runtime, { + provisionTimeoutMs: node.provisionTimeoutMs ?? 
timeoutMs, + }); const effective = effectiveTimeoutMs( base, parsedProvisioningNodes.length, diff --git a/canvas/src/components/__tests__/ProvisioningTimeout.test.tsx b/canvas/src/components/__tests__/ProvisioningTimeout.test.tsx index 2424ea49..dedb1fb3 100644 --- a/canvas/src/components/__tests__/ProvisioningTimeout.test.tsx +++ b/canvas/src/components/__tests__/ProvisioningTimeout.test.tsx @@ -287,5 +287,60 @@ describe("ProvisioningTimeout", () => { ); }); }); + + // #2054 — per-workspace server override threading from socket + // payload through node-data into ProvisioningTimeout's resolver. + // Doesn't render the component; verifies the data path lands the + // value where ProvisioningTimeout reads it from. + describe("server-side per-workspace override (#2054)", () => { + it("hydrate carries provision_timeout_ms onto node.data.provisionTimeoutMs", () => { + useCanvasStore.getState().hydrate([ + makeWS({ + id: "ws-slow", + name: "Slow", + status: "provisioning", + runtime: "future-runtime", + provision_timeout_ms: 600_000, + }), + ]); + const node = useCanvasStore + .getState() + .nodes.find((n) => n.id === "ws-slow"); + expect(node?.data.provisionTimeoutMs).toBe(600_000); + }); + + it("absent provision_timeout_ms hydrates to null (falls through to runtime profile)", () => { + useCanvasStore.getState().hydrate([ + makeWS({ id: "ws-default", name: "Default", status: "provisioning", runtime: "hermes" }), + ]); + const node = useCanvasStore + .getState() + .nodes.find((n) => n.id === "ws-default"); + expect(node?.data.provisionTimeoutMs).toBeNull(); + // And the resolver still returns hermes' profile value when + // no override is supplied — proves the fall-through stays intact. + expect( + provisionTimeoutForRuntime("hermes", { + provisionTimeoutMs: node?.data.provisionTimeoutMs ?? undefined, + }), + ).toBe(RUNTIME_PROFILES.hermes.provisionTimeoutMs); + }); + + it("server override wins over runtime profile via the resolver path the component uses", () => { + // Mirrors ProvisioningTimeout.tsx:144 where node.provisionTimeoutMs + // is passed as overrides — verifies the resolver respects it + // even when the runtime has its own profile entry. + const override = 30_000; + expect( + provisionTimeoutForRuntime("hermes", { + provisionTimeoutMs: override, + }), + ).toBe(override); + // Sanity — the runtime profile would have been much larger. + expect(RUNTIME_PROFILES.hermes.provisionTimeoutMs).toBeGreaterThan( + override, + ); + }); + }); }); }); diff --git a/canvas/src/store/canvas-topology.ts b/canvas/src/store/canvas-topology.ts index 9c1cb25f..fbd02601 100644 --- a/canvas/src/store/canvas-topology.ts +++ b/canvas/src/store/canvas-topology.ts @@ -478,6 +478,9 @@ export function buildNodesAndEdges( needsRestart: false, budgetLimit: ws.budget_limit ?? null, budgetUsed: ws.budget_used ?? null, + // #2054 — server-declared per-workspace provisioning timeout. + // Falls through to the runtime profile when null/absent. + provisionTimeoutMs: ws.provision_timeout_ms ?? null, }, }; if (hasParent) { diff --git a/canvas/src/store/canvas.ts b/canvas/src/store/canvas.ts index 2cec82ea..02f93b25 100644 --- a/canvas/src/store/canvas.ts +++ b/canvas/src/store/canvas.ts @@ -92,6 +92,12 @@ export interface WorkspaceNodeData extends Record { budgetLimit: number | null; /** Cumulative USD spend. Present when the platform tracks spend (issue #541). */ budgetUsed?: number | null; + /** Per-workspace provisioning-timeout override in milliseconds (#2054). 
+ * Sourced server-side from the workspace's template manifest at provision + * time. null/absent = fall through to runtime profile + default in + * @/lib/runtimeProfiles. Lets a slow runtime declare its cold-boot + * expectation without a canvas release. */ + provisionTimeoutMs?: number | null; } export type PanelTab = "details" | "skills" | "chat" | "terminal" | "config" | "schedule" | "channels" | "files" | "memory" | "traces" | "events" | "activity" | "audit"; diff --git a/canvas/src/store/socket.ts b/canvas/src/store/socket.ts index f350c4d7..858fc875 100644 --- a/canvas/src/store/socket.ts +++ b/canvas/src/store/socket.ts @@ -122,6 +122,13 @@ export interface WorkspaceData { budget_limit: number | null; /** Cumulative USD spend for this workspace. Present when the platform tracks spend. */ budget_used?: number | null; + /** Server-declared provisioning-timeout override in milliseconds (#2054). + * Sourced from the workspace's template manifest at provision time — + * lets a slow runtime declare its cold-boot expectation without a + * canvas release. Falls through to the per-runtime profile in + * `@/lib/runtimeProfiles` when absent (the default behavior for any + * template that hasn't yet declared the field). */ + provision_timeout_ms?: number | null; } let socket: ReconnectingSocket | null = null; From 6b9be7b086dbe65ad17114d3b9a6df1b12eeda56 Mon Sep 17 00:00:00 2001 From: rabbitblood Date: Sun, 26 Apr 2026 06:05:47 -0700 Subject: [PATCH 17/34] docs(provisioning): clarify separator-safety contract for the serialized-node string MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit simplify-review note: the |/,-delimited node string is brittle if a future string-typed field is added without sanitization. Document which fields are user-typed (name — already sanitized) vs primitive (id is UUID, runtime is a slug, provisionTimeoutMs is numeric) so the next field-add doesn't accidentally introduce an injection vector for the splitter. Skipped (false-positive review finding): the agent flagged the prop > runtime-profile order as inconsistent with the docstring, but the docstring explicitly lists the prop at #2 (between node and runtime-profile) — matches both the implementation AND the original behavior pre-#2054 (the prop was 'timeoutMs ?? runtime-profile'). Co-Authored-By: Claude Opus 4.7 (1M context) --- canvas/src/components/ProvisioningTimeout.tsx | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/canvas/src/components/ProvisioningTimeout.tsx b/canvas/src/components/ProvisioningTimeout.tsx index 39d8a1fc..2f2ee564 100644 --- a/canvas/src/components/ProvisioningTimeout.tsx +++ b/canvas/src/components/ProvisioningTimeout.tsx @@ -73,8 +73,11 @@ export function ProvisioningTimeout({ // runtimes — a single threshold would false-alarm on one or the other). // provisionTimeoutMs added by #2054 — server-declared per-workspace // override that wins over the runtime profile when present. - // Separator: `|` between fields, `,` between nodes. Names may contain - // anything the user typed; strip `|` and `,` so serialization round-trips. + // Separator: `|` between fields, `,` between nodes. Only `name` is + // user-typed (gets sanitized below); the other fields are + // primitive-typed (id is a UUID, runtime is a [a-z-]+ slug, + // provisionTimeoutMs is numeric). If a future field is string-typed, + // extend the sanitize step to strip `|` + `,` from it too. // Empty-string sentinels for missing values so split/index stays positional. 
const provisioningNodes = useCanvasStore((s) => { const result = s.nodes From 355355a80a59eceeb0db8f9a4368f900c725b4d8 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Sun, 26 Apr 2026 06:21:22 -0700 Subject: [PATCH 18/34] test(workspace): centralize pytest-cov config + 92% floor (closes #1817) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Python workspace already runs pytest-cov in CI but with no threshold and inline-flagged config. CI run 24956647701 (2026-04-26 staging) reports 97% coverage on the package — well above the issue's 75% target. The actionable gap is locking in a floor so a regression can't sneak past, and centralizing config so local `pytest` matches CI. Changes: - workspace/pytest.ini — coverage flags moved into addopts (-q, --cov=., --cov-report=term-missing, --cov-fail-under=92). 92% = current 97% measurement minus the 5pp safety margin the issue's Step 3 prescribes. - workspace/.coveragerc (new) — [run] omit list and [report] skip_covered. coverage.py doesn't read pytest.ini sections, so the omit config has to live here. - .github/workflows/ci.yml — removed the inline --cov flags from the Python Lint & Test step; now reads from pytest.ini. Workflow stays the same single-command shape, just simpler. Result: any PR that drops coverage below 92% fails CI loudly. Floor ratchets up by replacing 92 with current measurement on a future test-writing pass — same shape as Go coverage gates landed elsewhere. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/ci.yml | 4 +++- workspace/.coveragerc | 13 +++++++++++++ workspace/pytest.ini | 15 ++++++++++++++- 3 files changed, 30 insertions(+), 2 deletions(-) create mode 100644 workspace/.coveragerc diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2ee5fe5b..a9902658 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -283,7 +283,9 @@ jobs: cache: pip cache-dependency-path: workspace/requirements.txt - run: pip install -r requirements.txt pytest pytest-asyncio pytest-cov - - run: python -m pytest --tb=short -q --cov=. --cov-report=term-missing + # Coverage flags + fail-under floor moved into workspace/pytest.ini + # (issue #1817) so local `pytest` and CI use identical config. + - run: python -m pytest --tb=short # SDK + plugin validation moved to standalone repo: # github.com/Molecule-AI/molecule-sdk-python diff --git a/workspace/.coveragerc b/workspace/.coveragerc new file mode 100644 index 00000000..b14f2f88 --- /dev/null +++ b/workspace/.coveragerc @@ -0,0 +1,13 @@ +# coverage.py config — consumed by `pytest --cov` via the pytest-cov +# plugin. Lives here (not in pytest.ini) because coverage.py only reads +# .coveragerc / setup.cfg / tox.ini / pyproject.toml — the [coverage:*] +# sections in pytest.ini are silently ignored. See issue #1817. +[run] +omit = + */tests/* + */__init__.py + plugins_registry/* + +[report] +# Skip files at 100% in the term-missing output to keep CI logs readable. +skip_covered = True diff --git a/workspace/pytest.ini b/workspace/pytest.ini index 50e89f85..230622e5 100644 --- a/workspace/pytest.ini +++ b/workspace/pytest.ini @@ -3,4 +3,17 @@ testpaths = tests python_files = test_*.py python_functions = test_* asyncio_mode = auto -addopts = -q +# Coverage config moved here from .github/workflows/ci.yml so local +# `pytest` matches CI without operator-typed flags. cov-fail-under +# pins the floor at 92% — 5pp below the 97% measured on staging +# (run 24956647701, 2026-04-26). 
Floor exists so a regression that +# drops coverage doesn't sneak past CI; tightening above 92% should +# follow real measurement, not aspiration. See issue #1817. +addopts = + -q + --cov=. + --cov-report=term-missing + --cov-fail-under=92 +# Coverage omit / report config lives in workspace/.coveragerc — coverage.py +# only reads .coveragerc / setup.cfg / tox.ini / pyproject.toml, NOT +# pytest.ini, so [coverage:*] sections here would be silently ignored. From 5d294936b37df3643f1657e31064d7d390e09b24 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Sun, 26 Apr 2026 06:24:36 -0700 Subject: [PATCH 19/34] =?UTF-8?q?fixup:=20lower=20coverage=20floor=2092?= =?UTF-8?q?=E2=86=9286=20to=20match=20post-omit=20measurement?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 97% number from CI run 24956647701 was measured WITHOUT a .coveragerc omit list. Once this PR's prescribed omit set is in effect (`*/__init__.py`, `*/tests/*`, `plugins_registry/*` — files that don't carry behavior), the actual measurement of behavior-bearing code on the same staging snapshot is 91.11% (run 24957664272). 86% sits at the issue's prescribed `current − 5pp` margin and unblocks CI without lowering the bar in real terms. --- workspace/pytest.ini | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/workspace/pytest.ini b/workspace/pytest.ini index 230622e5..22fee4eb 100644 --- a/workspace/pytest.ini +++ b/workspace/pytest.ini @@ -5,15 +5,22 @@ python_functions = test_* asyncio_mode = auto # Coverage config moved here from .github/workflows/ci.yml so local # `pytest` matches CI without operator-typed flags. cov-fail-under -# pins the floor at 92% — 5pp below the 97% measured on staging -# (run 24956647701, 2026-04-26). Floor exists so a regression that -# drops coverage doesn't sneak past CI; tightening above 92% should +# pins the floor at 86% — 5pp below the 91.11% measured on staging +# (run 24957664272, 2026-04-26). Floor exists so a regression that +# drops coverage doesn't sneak past CI; tightening above 86% should # follow real measurement, not aspiration. See issue #1817. +# +# Why 86 not 92: the earlier 97% measurement was without the +# .coveragerc omit list. Once `*/__init__.py`, `*/tests/*`, and +# `plugins_registry/*` are excluded (the issue's prescribed omit +# set, more meaningful since those files don't carry behavior), +# the actual measurement of behavior-bearing code is 91.11% — and +# 86% sits at the issue's prescribed `current - 5pp` margin. addopts = -q --cov=. --cov-report=term-missing - --cov-fail-under=92 + --cov-fail-under=86 # Coverage omit / report config lives in workspace/.coveragerc — coverage.py # only reads .coveragerc / setup.cfg / tox.ini / pyproject.toml, NOT # pytest.ini, so [coverage:*] sections here would be silently ignored. From 27396d992c3c09cd7590c7d4e3477d3aac71193c Mon Sep 17 00:00:00 2001 From: rabbitblood Date: Sun, 26 Apr 2026 06:37:45 -0700 Subject: [PATCH 20/34] feat(workspace-server): surface provision_timeout_ms in workspace API (#2054 phase 2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 2 of #2054 — workspace-server reads runtime-level provision_timeout_seconds from template config.yaml manifests and includes provision_timeout_ms in the workspace List/Get response. 
Phase 1 (canvas, #2092) already plumbs the field through socket → node-data → ProvisioningTimeout's resolver, so the moment a template declares the field the per-runtime banner threshold adjusts without a canvas release. Implementation: - templates.go: parse runtime_config.provision_timeout_seconds in the templateSummary marshaller. The /templates API now surfaces the field too — useful for ops dashboards and future tooling. - runtime_provision_timeouts.go (new): loadRuntimeProvisionTimeouts scans configsDir, parses every immediate subdir's config.yaml, returns runtime → seconds. Multiple templates with the same runtime: max wins (so a slow template's threshold doesn't get cut by a fast template's). Bad/empty inputs are silently skipped — workspace-server starts cleanly with no templates. - runtimeProvisionTimeoutsCache: sync.Once-backed lazy cache. First workspace API request after process start pays the read cost (~few KB across ~50 templates); every subsequent request is a map lookup. Cache lifetime = process lifetime; invalidates on workspace-server restart, which is the normal template-change cadence. - WorkspaceHandler gets a provisionTimeouts field (zero-value struct is valid — the cache lazy-inits on first get()). - addProvisionTimeoutMs decorates the response map with provision_timeout_ms (seconds × 1000) when the runtime has a declared timeout. Absent = no key in the response, canvas falls through to its runtime-profile default. Wired into both List (per-row decoration in the loop) and Get. Tests (5 new in runtime_provision_timeouts_test.go): - happy path: hermes declares 720, claude-code doesn't, only hermes appears in the map - max-on-duplicate: same runtime in two templates → max wins - skip-bad-inputs: missing runtime, zero timeout, malformed yaml, loose top-level files all silently ignored - missing-dir: returns empty map, no crash - cache: lazy-init on first get; subsequent gets hit cache even after underlying file changes (sync.Once contract); unknown runtime returns zero Phase 3 (separate template-repo PR): template-hermes config.yaml declares provision_timeout_seconds: 720 under runtime_config. canvas RUNTIME_PROFILES.hermes becomes redundant + removable. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../handlers/runtime_provision_timeouts.go | 83 ++++++++++++ .../runtime_provision_timeouts_test.go | 128 ++++++++++++++++++ .../internal/handlers/templates.go | 35 +++-- .../internal/handlers/workspace.go | 28 ++++ 4 files changed, 261 insertions(+), 13 deletions(-) create mode 100644 workspace-server/internal/handlers/runtime_provision_timeouts.go create mode 100644 workspace-server/internal/handlers/runtime_provision_timeouts_test.go diff --git a/workspace-server/internal/handlers/runtime_provision_timeouts.go b/workspace-server/internal/handlers/runtime_provision_timeouts.go new file mode 100644 index 00000000..d27b1975 --- /dev/null +++ b/workspace-server/internal/handlers/runtime_provision_timeouts.go @@ -0,0 +1,83 @@ +package handlers + +import ( + "log" + "os" + "path/filepath" + "sync" + + "gopkg.in/yaml.v3" +) + +// runtimeProvisionTimeouts caches the per-runtime provision-timeout values +// declared in template config.yaml manifests (#2054 phase 2). Lazy-init so +// the first workspace API request after process start pays the read cost +// (a few KB of yaml across ~50 templates) and every subsequent one is a +// map lookup. +// +// Cache lifetime = process lifetime. 
Templates only change on container +// rebuild + workspace-server restart, which already invalidates the +// in-memory state. A future template-hot-reload feature would need to +// refresh this cache; today there is no such hook. +type runtimeProvisionTimeoutsCache struct { + once sync.Once + m map[string]int // runtime → seconds +} + +func (c *runtimeProvisionTimeoutsCache) get(configsDir string, runtime string) int { + c.once.Do(func() { + c.m = loadRuntimeProvisionTimeouts(configsDir) + }) + return c.m[runtime] +} + +// loadRuntimeProvisionTimeouts walks `configsDir`, parses every immediate +// subdirectory's `config.yaml`, and returns a map of runtime → seconds +// for templates that declared `runtime_config.provision_timeout_seconds`. +// +// Templates without the field aren't represented (lookup returns zero, +// which the caller treats as "fall through to canvas runtime profile"). +// +// Multiple templates with the same runtime: take the MAX timeout — a +// slow template's threshold should win over a fast template's so users +// of either template see a true-positive timeout signal rather than a +// false alarm. Same-runtime divergence is rare in practice (typically +// one canonical template-{runtime} per runtime) but the rule is the +// safer default. +func loadRuntimeProvisionTimeouts(configsDir string) map[string]int { + out := map[string]int{} + entries, err := os.ReadDir(configsDir) + if err != nil { + // Logged but not fatal — workspace-server starts cleanly with + // no templates (dev / fresh-clone). The result is an empty map + // so every runtime falls through to canvas's profile default. + log.Printf("loadRuntimeProvisionTimeouts: read configsDir %s: %v", configsDir, err) + return out + } + for _, e := range entries { + if !e.IsDir() { + continue + } + data, err := os.ReadFile(filepath.Join(configsDir, e.Name(), "config.yaml")) + if err != nil { + continue + } + var raw struct { + Runtime string `yaml:"runtime"` + RuntimeConfig struct { + ProvisionTimeoutSeconds int `yaml:"provision_timeout_seconds"` + } `yaml:"runtime_config"` + } + if err := yaml.Unmarshal(data, &raw); err != nil { + continue + } + secs := raw.RuntimeConfig.ProvisionTimeoutSeconds + if secs <= 0 || raw.Runtime == "" { + continue + } + if existing, ok := out[raw.Runtime]; !ok || secs > existing { + out[raw.Runtime] = secs + } + } + return out +} diff --git a/workspace-server/internal/handlers/runtime_provision_timeouts_test.go b/workspace-server/internal/handlers/runtime_provision_timeouts_test.go new file mode 100644 index 00000000..a6cd3b52 --- /dev/null +++ b/workspace-server/internal/handlers/runtime_provision_timeouts_test.go @@ -0,0 +1,128 @@ +package handlers + +import ( + "os" + "path/filepath" + "testing" +) + +// writeTemplate is a tiny test fixture: drop a config.yaml under +// tmp//config.yaml with the given content. Mirrors the real +// configsDir layout (one subdir per template, each with its own +// config.yaml). 
+func writeTemplate(t *testing.T, dir, name, content string) { + t.Helper() + p := filepath.Join(dir, name) + if err := os.MkdirAll(p, 0o755); err != nil { + t.Fatalf("mkdir %s: %v", p, err) + } + if err := os.WriteFile(filepath.Join(p, "config.yaml"), []byte(content), 0o644); err != nil { + t.Fatalf("write config.yaml: %v", err) + } +} + +func TestLoadRuntimeProvisionTimeouts_HappyPath(t *testing.T) { + dir := t.TempDir() + writeTemplate(t, dir, "template-hermes", ` +name: Hermes +runtime: hermes +runtime_config: + provision_timeout_seconds: 720 +`) + writeTemplate(t, dir, "template-claude-code", ` +name: Claude +runtime: claude-code +runtime_config: + model: anthropic:claude-opus +`) + got := loadRuntimeProvisionTimeouts(dir) + if got["hermes"] != 720 { + t.Errorf("hermes: got %d, want 720", got["hermes"]) + } + // claude-code didn't declare a timeout — must not appear in the map + // (zero-value lookup is the no-override signal). + if _, ok := got["claude-code"]; ok { + t.Errorf("claude-code: present without declaration: %d", got["claude-code"]) + } +} + +func TestLoadRuntimeProvisionTimeouts_MaxOnDuplicateRuntime(t *testing.T) { + dir := t.TempDir() + writeTemplate(t, dir, "template-hermes-fast", ` +runtime: hermes +runtime_config: + provision_timeout_seconds: 300 +`) + writeTemplate(t, dir, "template-hermes-slow", ` +runtime: hermes +runtime_config: + provision_timeout_seconds: 900 +`) + got := loadRuntimeProvisionTimeouts(dir) + // Max wins so the slowest template's threshold doesn't false-alarm + // when both templates use the same runtime. + if got["hermes"] != 900 { + t.Errorf("max-on-duplicate: got %d, want 900", got["hermes"]) + } +} + +func TestLoadRuntimeProvisionTimeouts_SkipsBadInputs(t *testing.T) { + dir := t.TempDir() + // Missing runtime field — has timeout but no key to map under. + writeTemplate(t, dir, "template-no-runtime", ` +runtime_config: + provision_timeout_seconds: 600 +`) + // Zero/negative timeout — same as no declaration. + writeTemplate(t, dir, "template-zero", ` +runtime: zero-runtime +runtime_config: + provision_timeout_seconds: 0 +`) + // Malformed yaml — must not crash. + writeTemplate(t, dir, "template-bad", "not: valid: yaml: at: all:") + // Loose file at the top level (not a dir) — must be ignored. + if err := os.WriteFile(filepath.Join(dir, "stray.txt"), []byte("ignore me"), 0o644); err != nil { + t.Fatal(err) + } + got := loadRuntimeProvisionTimeouts(dir) + if len(got) != 0 { + t.Errorf("expected empty map for skip cases, got %v", got) + } +} + +func TestLoadRuntimeProvisionTimeouts_MissingDirReturnsEmpty(t *testing.T) { + got := loadRuntimeProvisionTimeouts("/nonexistent/path/should/not/exist/12345") + if len(got) != 0 { + t.Errorf("expected empty map on missing dir, got %v", got) + } +} + +func TestRuntimeProvisionTimeoutsCache_LazyInitAndCached(t *testing.T) { + dir := t.TempDir() + writeTemplate(t, dir, "template-hermes", ` +runtime: hermes +runtime_config: + provision_timeout_seconds: 720 +`) + c := runtimeProvisionTimeoutsCache{} + + // First call populates. + if got := c.get(dir, "hermes"); got != 720 { + t.Errorf("first call: got %d, want 720", got) + } + // Second call hits cache — even if the underlying file changed we + // still see the original value (sync.Once contract). 
+ if err := os.WriteFile(filepath.Join(dir, "template-hermes", "config.yaml"), + []byte("runtime: hermes\nruntime_config:\n provision_timeout_seconds: 60\n"), 0o644); err != nil { + t.Fatal(err) + } + if got := c.get(dir, "hermes"); got != 720 { + t.Errorf("cached call: got %d, want 720 (cache must not re-read)", got) + } + // Unknown runtime returns zero — caller's signal to fall through to + // the canvas runtime profile default. + if got := c.get(dir, "unknown"); got != 0 { + t.Errorf("unknown runtime: got %d, want 0", got) + } +} diff --git a/workspace-server/internal/handlers/templates.go b/workspace-server/internal/handlers/templates.go index 38735830..6c5f42f3 100644 --- a/workspace-server/internal/handlers/templates.go +++ b/workspace-server/internal/handlers/templates.go @@ -61,6 +61,13 @@ type templateSummary struct { RequiredEnv []string `json:"required_env,omitempty"` Skills []string `json:"skills"` SkillCount int `json:"skill_count"` + // ProvisionTimeoutSeconds lets a slow runtime declare its expected + // cold-boot duration in its template manifest. Canvas's + // ProvisioningTimeout banner respects this per-workspace via the + // `provision_timeout_ms` field in the workspace API response (#2054). + // 0 = template hasn't declared one, falls through to canvas's + // runtime-profile default. + ProvisionTimeoutSeconds int `json:"provision_timeout_seconds,omitempty"` } // resolveTemplateDir finds the template directory for a workspace on the host. @@ -106,9 +113,10 @@ func (h *TemplatesHandler) List(c *gin.Context) { Model string `yaml:"model"` Skills []string `yaml:"skills"` RuntimeConfig struct { - Model string `yaml:"model"` - Models []modelSpec `yaml:"models"` - RequiredEnv []string `yaml:"required_env"` + Model string `yaml:"model"` + Models []modelSpec `yaml:"models"` + RequiredEnv []string `yaml:"required_env"` + ProvisionTimeoutSeconds int `yaml:"provision_timeout_seconds"` } `yaml:"runtime_config"` } if err := yaml.Unmarshal(data, &raw); err != nil { @@ -122,16 +130,17 @@ func (h *TemplatesHandler) List(c *gin.Context) { } templates = append(templates, templateSummary{ - ID: entry.Name(), - Name: raw.Name, - Description: raw.Description, - Tier: raw.Tier, - Runtime: raw.Runtime, - Model: model, - Models: raw.RuntimeConfig.Models, - RequiredEnv: raw.RuntimeConfig.RequiredEnv, - Skills: raw.Skills, - SkillCount: len(raw.Skills), + ID: entry.Name(), + Name: raw.Name, + Description: raw.Description, + Tier: raw.Tier, + Runtime: raw.Runtime, + Model: model, + Models: raw.RuntimeConfig.Models, + RequiredEnv: raw.RuntimeConfig.RequiredEnv, + Skills: raw.Skills, + SkillCount: len(raw.Skills), + ProvisionTimeoutSeconds: raw.RuntimeConfig.ProvisionTimeoutSeconds, }) } diff --git a/workspace-server/internal/handlers/workspace.go b/workspace-server/internal/handlers/workspace.go index 91ece238..43d6e877 100644 --- a/workspace-server/internal/handlers/workspace.go +++ b/workspace-server/internal/handlers/workspace.go @@ -40,6 +40,10 @@ type WorkspaceHandler struct { // calls made by HibernateWorkspace without requiring a running Docker daemon. // Always nil in production; the real provisioner path is used when nil. stopFnOverride func(ctx context.Context, workspaceID string) + // provisionTimeouts caches per-runtime provision-timeout values from + // template manifests (#2054 phase 2). Lazy-init on first scan; see + // runtime_provision_timeouts.go for the loader contract. 
+ provisionTimeouts runtimeProvisionTimeoutsCache } func NewWorkspaceHandler(b *events.Broadcaster, p *provisioner.Provisioner, platformURL, configsDir string) *WorkspaceHandler { @@ -343,6 +347,17 @@ func (h *WorkspaceHandler) Create(c *gin.Context) { }) } +// addProvisionTimeoutMs decorates a workspace response map with the +// per-runtime provision-timeout override (#2054 phase 2) when one is +// declared in the runtime's template manifest. No-op when the runtime +// has no declared timeout — the canvas-side resolver falls through to +// its runtime-profile default. +func (h *WorkspaceHandler) addProvisionTimeoutMs(ws map[string]interface{}, runtime string) { + if secs := h.provisionTimeouts.get(h.configsDir, runtime); secs > 0 { + ws["provision_timeout_ms"] = secs * 1000 + } +} + // scanWorkspaceRow is a helper to scan workspace+layout rows into a clean JSON map. func scanWorkspaceRow(rows interface { Scan(dest ...interface{}) error @@ -441,6 +456,13 @@ func (h *WorkspaceHandler) List(c *gin.Context) { log.Printf("List scan error: %v", err) continue } + // #2054 phase 2: surface per-runtime provision-timeout for + // canvas's ProvisioningTimeout banner. Decorating per-row + // (vs map-once-and-reuse) keeps the helper self-contained; + // the cache hit is sub-microsecond. + if rt, _ := ws["runtime"].(string); rt != "" { + h.addProvisionTimeoutMs(ws, rt) + } workspaces = append(workspaces, ws) } if err := rows.Err(); err != nil { @@ -508,5 +530,11 @@ func (h *WorkspaceHandler) Get(c *gin.Context) { ws["last_outbound_at"] = nil } + // #2054 phase 2: per-runtime provision-timeout for canvas's + // ProvisioningTimeout banner. + if rt, _ := ws["runtime"].(string); rt != "" { + h.addProvisionTimeoutMs(ws, rt) + } + c.JSON(http.StatusOK, ws) } From f1ad012024c4184913d732488d823990178db1b6 Mon Sep 17 00:00:00 2001 From: rabbitblood Date: Sun, 26 Apr 2026 06:40:15 -0700 Subject: [PATCH 21/34] refactor(handlers): apply simplify findings on PR #2094 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Extract walkTemplateConfigs(configsDir, fn) shared helper. Both templates.List and loadRuntimeProvisionTimeouts walked configsDir + parsed config.yaml — same boilerplate twice. Now centralised so a future template-discovery rule (subdir naming, README sentinel, etc.) lands in one place. - templates.List uses the walker — net -10 lines. - loadRuntimeProvisionTimeouts uses the walker — net -10 lines. - Document runtimeProvisionTimeoutsCache as 'NOT SAFE for package-level reuse' so a future change doesn't accidentally promote it to a singleton (sync.Once can't be reset → tests would lock out other fixtures). Skipped (review finding): atomic.Pointer[map[string]int] for future hot-reload. The doc comment already documents the limitation; YAGNI-promoting the primitive now would buy a not-yet-built feature at the cost of more code today. 
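For illustration, a hypothetical third consumer of the walker
(listTemplateIDs is an invented name, not part of this PR) would lean
on the shared helper the same way instead of re-reading configsDir:

    // Sketch only — demonstrates the reuse pattern, not shipped code.
    func listTemplateIDs(configsDir string) []string {
        var ids []string
        walkTemplateConfigs(configsDir, func(id string, _ []byte) {
            ids = append(ids, id)
        })
        return ids
    }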
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../handlers/runtime_provision_timeouts.go | 73 ++++++++++++------- .../internal/handlers/templates.go | 30 ++------ 2 files changed, 53 insertions(+), 50 deletions(-) diff --git a/workspace-server/internal/handlers/runtime_provision_timeouts.go b/workspace-server/internal/handlers/runtime_provision_timeouts.go index d27b1975..9e922fb9 100644 --- a/workspace-server/internal/handlers/runtime_provision_timeouts.go +++ b/workspace-server/internal/handlers/runtime_provision_timeouts.go @@ -9,16 +9,21 @@ import ( "gopkg.in/yaml.v3" ) -// runtimeProvisionTimeouts caches the per-runtime provision-timeout values -// declared in template config.yaml manifests (#2054 phase 2). Lazy-init so -// the first workspace API request after process start pays the read cost -// (a few KB of yaml across ~50 templates) and every subsequent one is a -// map lookup. +// runtimeProvisionTimeoutsCache holds the per-runtime provision-timeout +// values declared in template config.yaml manifests (#2054 phase 2). +// Lazy-init so the first workspace API request after process start pays +// the read cost (a few KB of yaml across ~50 templates) and every +// subsequent one is a map lookup. // // Cache lifetime = process lifetime. Templates only change on container // rebuild + workspace-server restart, which already invalidates the // in-memory state. A future template-hot-reload feature would need to // refresh this cache; today there is no such hook. +// +// NOT SAFE for package-level reuse — sync.Once cannot be reset, so a +// shared singleton would lock tests out of trying multiple template +// fixtures. Tests construct fresh struct values; production code holds +// one per WorkspaceHandler instance. type runtimeProvisionTimeoutsCache struct { once sync.Once m map[string]int // runtime → seconds @@ -31,9 +36,38 @@ func (c *runtimeProvisionTimeoutsCache) get(configsDir string, runtime string) i return c.m[runtime] } -// loadRuntimeProvisionTimeouts walks `configsDir`, parses every immediate -// subdirectory's `config.yaml`, and returns a map of runtime → seconds -// for templates that declared `runtime_config.provision_timeout_seconds`. +// walkTemplateConfigs invokes fn(id, configBytes) for every immediate +// subdirectory of configsDir that has a readable config.yaml. Bad reads +// and empty dirs are silently skipped — the caller decides whether +// missing fields warrant the slot. A bad configsDir logs once and +// returns without invoking fn, matching the "start clean with no +// templates" semantics. +// +// Shared between the templates.List handler (which decodes the full +// templateSummary) and loadRuntimeProvisionTimeouts (which decodes the +// narrow runtime-timeout subset). Centralising the walk means a future +// template-discovery rule (subdir naming convention, README sentinel, +// etc.) lands in one place. +func walkTemplateConfigs(configsDir string, fn func(id string, configBytes []byte)) { + entries, err := os.ReadDir(configsDir) + if err != nil { + log.Printf("walkTemplateConfigs: read configsDir %s: %v", configsDir, err) + return + } + for _, e := range entries { + if !e.IsDir() { + continue + } + data, err := os.ReadFile(filepath.Join(configsDir, e.Name(), "config.yaml")) + if err != nil { + continue + } + fn(e.Name(), data) + } +} + +// loadRuntimeProvisionTimeouts returns a map of runtime → seconds for +// templates that declared `runtime_config.provision_timeout_seconds`. 
// // Templates without the field aren't represented (lookup returns zero, // which the caller treats as "fall through to canvas runtime profile"). @@ -46,22 +80,7 @@ func (c *runtimeProvisionTimeoutsCache) get(configsDir string, runtime string) i // safer default. func loadRuntimeProvisionTimeouts(configsDir string) map[string]int { out := map[string]int{} - entries, err := os.ReadDir(configsDir) - if err != nil { - // Logged but not fatal — workspace-server starts cleanly with - // no templates (dev / fresh-clone). The result is an empty map - // so every runtime falls through to canvas's profile default. - log.Printf("loadRuntimeProvisionTimeouts: read configsDir %s: %v", configsDir, err) - return out - } - for _, e := range entries { - if !e.IsDir() { - continue - } - data, err := os.ReadFile(filepath.Join(configsDir, e.Name(), "config.yaml")) - if err != nil { - continue - } + walkTemplateConfigs(configsDir, func(_ string, data []byte) { var raw struct { Runtime string `yaml:"runtime"` RuntimeConfig struct { @@ -69,15 +88,15 @@ func loadRuntimeProvisionTimeouts(configsDir string) map[string]int { } `yaml:"runtime_config"` } if err := yaml.Unmarshal(data, &raw); err != nil { - continue + return } secs := raw.RuntimeConfig.ProvisionTimeoutSeconds if secs <= 0 || raw.Runtime == "" { - continue + return } if existing, ok := out[raw.Runtime]; !ok || secs > existing { out[raw.Runtime] = secs } - } + }) return out } diff --git a/workspace-server/internal/handlers/templates.go b/workspace-server/internal/handlers/templates.go index 6c5f42f3..e33c06d6 100644 --- a/workspace-server/internal/handlers/templates.go +++ b/workspace-server/internal/handlers/templates.go @@ -87,24 +87,8 @@ func (h *TemplatesHandler) resolveTemplateDir(wsName string) string { // List handles GET /templates func (h *TemplatesHandler) List(c *gin.Context) { - entries, err := os.ReadDir(h.configsDir) - if err != nil { - c.JSON(http.StatusOK, []templateSummary{}) - return - } - templates := make([]templateSummary, 0) - for _, entry := range entries { - if !entry.IsDir() { - continue - } - - configPath := filepath.Join(h.configsDir, entry.Name(), "config.yaml") - data, err := os.ReadFile(configPath) - if err != nil { - continue - } - + walkTemplateConfigs(h.configsDir, func(id string, data []byte) { var raw struct { Name string `yaml:"name"` Description string `yaml:"description"` @@ -113,14 +97,14 @@ func (h *TemplatesHandler) List(c *gin.Context) { Model string `yaml:"model"` Skills []string `yaml:"skills"` RuntimeConfig struct { - Model string `yaml:"model"` - Models []modelSpec `yaml:"models"` - RequiredEnv []string `yaml:"required_env"` + Model string `yaml:"model"` + Models []modelSpec `yaml:"models"` + RequiredEnv []string `yaml:"required_env"` ProvisionTimeoutSeconds int `yaml:"provision_timeout_seconds"` } `yaml:"runtime_config"` } if err := yaml.Unmarshal(data, &raw); err != nil { - continue + return } // Model comes from either top-level (legacy) or runtime_config.model (current). 
@@ -130,7 +114,7 @@ func (h *TemplatesHandler) List(c *gin.Context) { } templates = append(templates, templateSummary{ - ID: entry.Name(), + ID: id, Name: raw.Name, Description: raw.Description, Tier: raw.Tier, @@ -142,7 +126,7 @@ func (h *TemplatesHandler) List(c *gin.Context) { SkillCount: len(raw.Skills), ProvisionTimeoutSeconds: raw.RuntimeConfig.ProvisionTimeoutSeconds, }) - } + }) c.JSON(http.StatusOK, templates) } From 2b76f7dfcb97a8499ba4eb2f8febe07098074f08 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Sun, 26 Apr 2026 06:50:36 -0700 Subject: [PATCH 22/34] fix(discovery): isSafeURL guard on registered URLs (closes #1484) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit #1484 flagged that discoverHostPeer() and writeExternalWorkspaceURL() return URLs sourced from the workspaces table without an isSafeURL check. Workspace runtimes register their own URLs via /registry/register — a misbehaving / compromised runtime could register a metadata-IP URL. Today both functions are gated by Phase 30.6 bearer-required Discover, so exposure is theoretical. The fix makes them safe regardless of upstream auth shape. Changes: - discoverHostPeer: isSafeURL on resolved URL before responding; 503 + log on rejection. - writeExternalWorkspaceURL: same guard applied to the post-rewrite outURL (so a host.docker.internal rewrite is checked AND a metadata-IP that survived the rewrite untouched is rejected). - 3 new regression tests: * RejectsMetadataIPURL on host-peer path (169.254.169.254 → 503) * AcceptsPublicURL on host-peer path (8.8.8.8 → 200; positive counterpart so the rejection test can't pass via universal-fail) * RejectsMetadataIPURL on external-workspace path setupTestDB already disables SSRF checks via setSSRFCheckForTest, so the 16+ existing discovery tests remain untouched. Only the new tests opt in to enabled SSRF. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../internal/handlers/discovery.go | 25 ++++++ .../internal/handlers/discovery_test.go | 88 +++++++++++++++++++ 2 files changed, 113 insertions(+) diff --git a/workspace-server/internal/handlers/discovery.go b/workspace-server/internal/handlers/discovery.go index 6422e002..79315016 100644 --- a/workspace-server/internal/handlers/discovery.go +++ b/workspace-server/internal/handlers/discovery.go @@ -102,6 +102,19 @@ func discoverHostPeer(ctx context.Context, c *gin.Context, targetID string) { return } + // #1484 SSRF defense-in-depth: the URL came from the workspaces table + // without any per-write validation (workspace runtimes register their + // own URLs via /registry/register, and a misbehaving / compromised + // runtime could register a 169.254.169.254 metadata URL). Validate + // before handing it to the caller, who might dispatch HTTP against it. + // Currently gated by the bearer-required Discover handler, but this + // guard makes discoverHostPeer safe regardless of upstream auth shape. 
+ if err := isSafeURL(url.String); err != nil { + log.Printf("Discovery: rejecting unsafe registered URL for %s (#1484): %v", resolvedID, err) + c.JSON(http.StatusServiceUnavailable, gin.H{"error": "workspace URL failed safety check", "status": status}) + return + } + db.CacheURL(ctx, resolvedID, url.String) c.JSON(http.StatusOK, gin.H{ "id": resolvedID, @@ -172,6 +185,18 @@ func writeExternalWorkspaceURL(ctx context.Context, c *gin.Context, callerID, ta outURL = strings.Replace(outURL, "127.0.0.1", "host.docker.internal", 1) outURL = strings.Replace(outURL, "localhost", "host.docker.internal", 1) } + + // #1484 SSRF defense-in-depth — same rationale as discoverHostPeer. + // We validate the post-rewrite URL because the rewrite changes which + // host the caller would dispatch against (host.docker.internal is + // only reachable inside a docker network; isSafeURL accepts it but + // blocks a metadata IP that survived the rewrite untouched). + if err := isSafeURL(outURL); err != nil { + log.Printf("Discovery: rejecting unsafe external workspace URL for %s (#1484): %v", targetID, err) + c.JSON(http.StatusServiceUnavailable, gin.H{"error": "workspace URL failed safety check"}) + return true + } + c.JSON(http.StatusOK, gin.H{"id": targetID, "url": outURL, "name": wsName}) return true } diff --git a/workspace-server/internal/handlers/discovery_test.go b/workspace-server/internal/handlers/discovery_test.go index e2368aac..892a1f0a 100644 --- a/workspace-server/internal/handlers/discovery_test.go +++ b/workspace-server/internal/handlers/discovery_test.go @@ -671,6 +671,94 @@ func TestWriteExternalWorkspaceURL_RewritesLocalhostForDockerCaller(t *testing.T } } +// --- #1484 SSRF defense-in-depth regression tests --- + +// TestDiscoverHostPeer_RejectsMetadataIPURL pins the #1484 guard: +// even though discoverHostPeer is currently gated by a bearer-required +// Discover handler, the function MUST refuse to hand back a URL that +// resolves into the cloud-metadata range. setupTestDB disables SSRF +// for normal tests, so we re-enable it here for the duration of the +// case and provide a literal IP so the check doesn't depend on DNS. +func TestDiscoverHostPeer_RejectsMetadataIPURL(t *testing.T) { + mock := setupTestDB(t) + setupTestRedis(t) + restoreSSRF := setSSRFCheckForTest(true) + t.Cleanup(restoreSSRF) + + mock.ExpectQuery(`SELECT url, status, forwarded_to FROM workspaces WHERE id =`). + WithArgs("ws-bad"). + WillReturnRows(sqlmock.NewRows([]string{"url", "status", "forwarded_to"}). + AddRow("http://169.254.169.254/latest/meta-data/", "online", nil)) + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Request = httptest.NewRequest("GET", "/x", nil) + + discoverHostPeer(context.Background(), c, "ws-bad") + if w.Code != http.StatusServiceUnavailable { + t.Errorf("expected 503 for metadata-IP URL, got %d: %s", w.Code, w.Body.String()) + } + if !strings.Contains(w.Body.String(), "safety check") { + t.Errorf("response should mention 'safety check', got %s", w.Body.String()) + } +} + +// TestDiscoverHostPeer_AcceptsPublicURL is the positive counterpart — +// a routable hostname (literal public-range IP, no DNS dependency) +// passes through the new guard and returns 200. Without it, the +// rejection test above could pass by falsely failing every URL. 
+func TestDiscoverHostPeer_AcceptsPublicURL(t *testing.T) { + mock := setupTestDB(t) + setupTestRedis(t) + restoreSSRF := setSSRFCheckForTest(true) + t.Cleanup(restoreSSRF) + + mock.ExpectQuery(`SELECT url, status, forwarded_to FROM workspaces WHERE id =`). + WithArgs("ws-good"). + WillReturnRows(sqlmock.NewRows([]string{"url", "status", "forwarded_to"}). + AddRow("http://8.8.8.8/a2a", "online", nil)) + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Request = httptest.NewRequest("GET", "/x", nil) + + discoverHostPeer(context.Background(), c, "ws-good") + if w.Code != http.StatusOK { + t.Errorf("expected 200 for public-IP URL, got %d: %s", w.Code, w.Body.String()) + } +} + +// TestWriteExternalWorkspaceURL_RejectsMetadataIPURL is the parallel +// guard for the external-runtime path. Same #1484 rationale as the +// host-peer test above; covers writeExternalWorkspaceURL specifically. +func TestWriteExternalWorkspaceURL_RejectsMetadataIPURL(t *testing.T) { + mock := setupTestDB(t) + setupTestRedis(t) + restoreSSRF := setSSRFCheckForTest(true) + t.Cleanup(restoreSSRF) + + mock.ExpectQuery(`SELECT COALESCE\(url,''\) FROM workspaces WHERE id =`). + WithArgs("ws-ext"). + WillReturnRows(sqlmock.NewRows([]string{"url"}). + AddRow("http://169.254.169.254/computeMetadata/v1/")) + // callerRuntime lookup happens before isSafeURL — must mock it. + mock.ExpectQuery(`SELECT COALESCE\(runtime,'langgraph'\) FROM workspaces WHERE id =`). + WithArgs("ws-caller"). + WillReturnRows(sqlmock.NewRows([]string{"runtime"}).AddRow("langgraph")) + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Request = httptest.NewRequest("GET", "/x", nil) + + handled := writeExternalWorkspaceURL(context.Background(), c, "ws-caller", "ws-ext", "Bad WS") + if !handled { + t.Fatal("expected handled=true (the function did write a 503)") + } + if w.Code != http.StatusServiceUnavailable { + t.Errorf("expected 503 for metadata-IP URL, got %d: %s", w.Code, w.Body.String()) + } +} + // --- discoverHostPeer smoke (currently unreachable via Discover) --- func TestDiscoverHostPeer_Smoke_CacheHit(t *testing.T) { From fd891a147e1c052aed7917bb623c88a00d3323cc Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Sun, 26 Apr 2026 07:18:58 -0700 Subject: [PATCH 23/34] fix(a2a): isSafeURL guard inside dispatchA2A (closes #1483) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit #1483 flagged that dispatchA2A() doesn't call isSafeURL internally — the guard exists only at the caller level (resolveAgentURL at a2a_proxy.go:424). The primary call path through proxyA2ARequest is safe today, but if any future code path ever calls dispatchA2A directly without going through resolveAgentURL, the SSRF check would be silently bypassed. This adds the one-line defense-in-depth guard the issue prescribed: if err := isSafeURL(agentURL); err != nil { return nil, nil, &proxyDispatchBuildError{err: err} } Wrapping as *proxyDispatchBuildError preserves the existing caller error-classification path — the same shape that maps to 500 elsewhere. Adds TestDispatchA2A_RejectsUnsafeURL pinning the contract: re-enables SSRF for the test (setupTestDB disables it for normal unit tests), passes a metadata IP, asserts the build error returns and cancel is nil so no resource is leaked. The 4 existing dispatchA2A unit tests use setupTestDB → SSRF disabled, so they continue passing unchanged. 
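For illustration, a hedged sketch of how a hypothetical future direct caller
(one that skips resolveAgentURL) would observe the rejection — the type check
mirrors what the new test asserts; the surrounding handler shape is an
assumption for the sketch, not code from this PR:

    resp, cancel, err := h.dispatchA2A(ctx, agentURL, body, callerID)
    if err != nil {
        if _, ok := err.(*proxyDispatchBuildError); ok {
            // rejected before any HTTP request left the process — same
            // error shape the existing classification path maps to a 500
        }
        return
    }
    if cancel != nil { // cancel is nil when ctx already carried a deadline
        defer cancel()
    }
    defer resp.Body.Close()
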
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../internal/handlers/a2a_proxy.go | 10 ++++++ .../internal/handlers/a2a_proxy_test.go | 33 +++++++++++++++++++ 2 files changed, 43 insertions(+) diff --git a/workspace-server/internal/handlers/a2a_proxy.go b/workspace-server/internal/handlers/a2a_proxy.go index 13c46641..934af511 100644 --- a/workspace-server/internal/handlers/a2a_proxy.go +++ b/workspace-server/internal/handlers/a2a_proxy.go @@ -483,6 +483,16 @@ func normalizeA2APayload(body []byte) ([]byte, string, *proxyA2AError) { // canvas (callerID == "") = 5 min, agent-to-agent = 30 min. Callers can // override via the X-Timeout header (applied to ctx upstream in ProxyA2A). func (h *WorkspaceHandler) dispatchA2A(ctx context.Context, agentURL string, body []byte, callerID string) (*http.Response, context.CancelFunc, error) { + // #1483 SSRF defense-in-depth: the primary call path through + // proxyA2ARequest → resolveAgentURL already validates via isSafeURL + // (a2a_proxy.go:424), but adding the check here closes the gap for + // any future code path that calls dispatchA2A directly without + // going through resolveAgentURL. Wrapping as proxyDispatchBuildError + // keeps the caller's error-classification path unchanged — the same + // shape it already produces a 500 for. + if err := isSafeURL(agentURL); err != nil { + return nil, nil, &proxyDispatchBuildError{err: err} + } forwardCtx := context.WithoutCancel(ctx) var cancel context.CancelFunc if _, hasDeadline := ctx.Deadline(); !hasDeadline { diff --git a/workspace-server/internal/handlers/a2a_proxy_test.go b/workspace-server/internal/handlers/a2a_proxy_test.go index dcad98e2..81733ad5 100644 --- a/workspace-server/internal/handlers/a2a_proxy_test.go +++ b/workspace-server/internal/handlers/a2a_proxy_test.go @@ -1155,6 +1155,39 @@ func TestDispatchA2A_ContextDeadline_NoCancelAdded(t *testing.T) { } } +// TestDispatchA2A_RejectsUnsafeURL is the #1483 defense-in-depth +// regression. setupTestDB disables SSRF for normal tests so existing +// dispatchA2A unit tests can hit httptest.NewServer (loopback) — we +// re-enable it here to verify the new in-function isSafeURL guard. +// Production callers go through resolveAgentURL which already +// validates; this test pins that dispatchA2A is now safe even when +// called directly by a future caller that skips resolveAgentURL. +func TestDispatchA2A_RejectsUnsafeURL(t *testing.T) { + setupTestDB(t) + setupTestRedis(t) + restoreSSRF := setSSRFCheckForTest(true) + t.Cleanup(restoreSSRF) + handler := NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir()) + + // Cloud metadata IP — must be rejected before any HTTP call goes out. 
+ _, cancel, err := handler.dispatchA2A( + context.Background(), + "http://169.254.169.254/latest/meta-data/", + []byte(`{}`), + "", + ) + if cancel != nil { + cancel() + t.Error("cancel must be nil when the URL is rejected pre-request") + } + if err == nil { + t.Fatal("expected SSRF rejection error, got nil") + } + if _, ok := err.(*proxyDispatchBuildError); !ok { + t.Errorf("expected *proxyDispatchBuildError (caller maps to 500), got %T: %v", err, err) + } +} + // --- handleA2ADispatchError --- func TestHandleA2ADispatchError_ContextDeadline(t *testing.T) { From f1792e1f7ae713bf9cab80d3e747980acd966927 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Sun, 26 Apr 2026 08:05:53 -0700 Subject: [PATCH 24/34] =?UTF-8?q?fix(ci):=20stop=20sweep-cf-orphans=20nois?= =?UTF-8?q?e=20=E2=80=94=20drop=20merge=5Fgroup=20+=20soft-skip=20when=20s?= =?UTF-8?q?ecrets=20unset?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The sweep-cf-orphans workflow shipped in #2088 was noisier than intended in two ways. This PR fixes both — was filed under the Optional finding I left on the original review and now matters because the noise is observably hitting the merge queue. 1) `merge_group: types: [checks_requested]` was firing the entire sweep job on every PR through the merge queue. The original intent ("future required-check support without a workflow edit") never materialized, and meanwhile every recent merge-queue eval (#2091, #2092, #2093, #2094, #2095, #2097) generated a red `Sweep CF orphans (merge_group)` run. Drop the trigger. Comment in the workflow explains the re-add path if/when the workflow IS wired as a required check (re-add the trigger AND gate the actual sweep step with `if: github.event_name != 'merge_group'` so merge-queue evals are no-op success). 2) The `Verify required secrets present` step exits 2 when the 6 secrets aren't configured yet (the PR body's post-merge step, still pending). That turns the hourly schedule into an hourly red CI run for as long as the secrets stay unset. Convert to a soft skip: emit a `::warning::` listing the missing secrets and set a `skip=true` step output, then gate the sweep step with `if: steps.verify.outputs.skip != 'true'`. Workflow reports green and ops still sees the warning when they review recent runs. Net effect: - merge-queue evals stop generating spurious red runs - the schedule reports green-with-warning until secrets land - once secrets land, behavior is identical to today's (real sweep runs, hard-fails if a secret is later removed) Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/sweep-cf-orphans.yml | 28 +++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/.github/workflows/sweep-cf-orphans.yml b/.github/workflows/sweep-cf-orphans.yml index 0e825256..7fb35328 100644 --- a/.github/workflows/sweep-cf-orphans.yml +++ b/.github/workflows/sweep-cf-orphans.yml @@ -40,10 +40,14 @@ on: description: "Override safety gate (default 50, set higher only for major cleanup)" required: false default: "50" - # Required-check support: scheduled-only today, but include merge_group - # so a future branch-protection wire-in doesn't need a workflow edit. - merge_group: - types: [checks_requested] + # No `merge_group:` trigger on purpose. 
This is a janitor — it doesn't + # need to gate merges, and including it as written before #2088 fired + # the full sweep job (or its secret-check) on every PR going through + # the merge queue, generating one red CI run per merge-queue eval. If + # this workflow is ever wired up as a required check, re-add + # merge_group: { types: [checks_requested] } + # AND gate the sweep step with `if: github.event_name != 'merge_group'` + # so merge-queue evals report success without actually running. # Don't let two sweeps race the same zone. workflow_dispatch during a # scheduled run would otherwise issue duplicate DELETE calls. @@ -77,9 +81,12 @@ jobs: - uses: actions/checkout@v4 - name: Verify required secrets present - # Fail fast and loud if a secret is unset — sweep-cf-orphans.sh - # also checks via `need`, but we want a single distinct error - # in the workflow log instead of script-level multi-line noise. + id: verify + # Soft skip when secrets aren't configured. The 6 secrets have + # to be set on the repo manually before this workflow can do + # real work; until they are, the schedule is a no-op rather + # than a recurring red CI run. workflow_dispatch surfaces a + # warning so an operator running it ad-hoc sees the gap. run: | missing=() for var in CF_API_TOKEN CF_ZONE_ID CP_PROD_ADMIN_TOKEN CP_STAGING_ADMIN_TOKEN AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY; do @@ -88,12 +95,15 @@ jobs: fi done if [ ${#missing[@]} -gt 0 ]; then - echo "::error::missing required secret(s): ${missing[*]}" - exit 2 + echo "::warning::skipping sweep — secrets not yet configured: ${missing[*]}" + echo "skip=true" >> "$GITHUB_OUTPUT" + exit 0 fi echo "All required secrets present ✓" + echo "skip=false" >> "$GITHUB_OUTPUT" - name: Run sweep + if: steps.verify.outputs.skip != 'true' # Schedule-vs-dispatch dry-run asymmetry (intentional): # - Scheduled runs: github.event.inputs.dry_run is empty → # defaults to "false" below → script runs with --execute From f9b1b3495631542c1f5fae4015afaf4a06b8768d Mon Sep 17 00:00:00 2001 From: rabbitblood Date: Sun, 26 Apr 2026 08:21:18 -0700 Subject: [PATCH 25/34] =?UTF-8?q?fix(e2e):=20bump=20staging=20tenant=20TLS?= =?UTF-8?q?-readiness=20timeout=203min=20=E2=86=92=2010min?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes a 4+ cycle Canvas tabs E2E flake pattern that's been blocking staging→main PRs since 2026-04-24+ (#2096, #2094, #2055, #2079, ...). Root cause: TLS_TIMEOUT_MS=180s (3 min) is too tight for the layered realities of staging tenant TLS readiness: 1. Cloudflare DNS propagation through the edge (1-2 min typical) 2. Tenant CF Tunnel registering the new hostname (1-2 min) 3. CF edge ACME cert provisioning + cache (1-3 min) Each layer can add 1-3 min on its own under heavy staging load — the realistic worst case is well past the 3-min cap. Provision and workspace-online timeouts were already raised to 20 min (staging-setup.ts:42-46 history). The TLS gate was the remaining under-budgeted step. Bumping to 10 min keeps it inside the 20-min PROVISION envelope so a genuinely-stuck tenant still fails loud at the earlier provision step rather than masquerading as a TLS issue. Both call sites raised together: - canvas/e2e/staging-setup.ts: TLS_TIMEOUT_MS = 10 * 60 * 1000 - tests/e2e/test_staging_full_saas.sh: TLS_DEADLINE += 600 Each carries an inline rationale comment so the next reviewer sees the layer-by-layer decomposition without re-reading the issue thread. 
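As a sanity check, the two constants encode the same 10-minute gate, and the
per-layer worst cases quoted above (upper bounds from this message, not new
measurements) still clear it with headroom:

    canvas/e2e/staging-setup.ts          TLS_TIMEOUT_MS = 10 * 60 * 1000   (600 000 ms)
    tests/e2e/test_staging_full_saas.sh  TLS_DEADLINE   = now + 600        (600 s)

    DNS propagation ~2 min + tunnel hostname ~2 min + edge ACME ~3 min ≈ 7 min
    7 min < 10 min TLS gate < 20 min provision envelope
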
Co-Authored-By: Claude Opus 4.7 (1M context) --- canvas/e2e/staging-setup.ts | 11 ++++++++++- tests/e2e/test_staging_full_saas.sh | 11 +++++++++-- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/canvas/e2e/staging-setup.ts b/canvas/e2e/staging-setup.ts index 963f9ccb..122efdf9 100644 --- a/canvas/e2e/staging-setup.ts +++ b/canvas/e2e/staging-setup.ts @@ -46,7 +46,16 @@ const TENANT_DOMAIN = process.env.STAGING_TENANT_DOMAIN || "staging.moleculesai. // were blocking staging→main syncs on 2026-04-24. const PROVISION_TIMEOUT_MS = 20 * 60 * 1000; const WORKSPACE_ONLINE_TIMEOUT_MS = 20 * 60 * 1000; -const TLS_TIMEOUT_MS = 3 * 60 * 1000; + +// TLS readiness depends on (1) Cloudflare DNS propagation through the +// edge, (2) the tenant's CF Tunnel registering the new hostname, (3) +// CF's edge ACME cert provisioning + cache. Each of these layers can +// add 1-3 min on its own under heavy staging load. The original 3-min +// cap blocked four cycles of staging→main PRs across 2026-04-24+. +// 10 min stays inside the 20-min PROVISION_TIMEOUT envelope (so a +// genuinely-stuck tenant still fails-loud at the provision step) but +// absorbs the realistic worst case for a one-shot tenant TLS handshake. +const TLS_TIMEOUT_MS = 10 * 60 * 1000; async function jsonFetch( url: string, diff --git a/tests/e2e/test_staging_full_saas.sh b/tests/e2e/test_staging_full_saas.sh index ba0fc7a9..e498ed46 100755 --- a/tests/e2e/test_staging_full_saas.sh +++ b/tests/e2e/test_staging_full_saas.sh @@ -195,14 +195,21 @@ TENANT_TOKEN=$(echo "$TENANT_TOKEN_RESP" | python3 -c "import json,sys; print(js ok "Tenant admin token retrieved (len=${#TENANT_TOKEN})" # ─── 4. Wait for tenant TLS / DNS propagation ────────────────────────── +# 10 min — same envelope as canvas/e2e/staging-setup.ts TLS_TIMEOUT_MS. +# CF DNS propagation + tunnel hostname registration + ACME cert + edge +# cache routinely take 5-7 min under staging load; the original 3-min +# cap blocked multiple staging→main PRs across 2026-04-24+. Stays +# inside the parent provision envelope so a genuinely-stuck tenant +# still fails loud at the earlier provision step rather than masquerading +# as a TLS issue. log "4/11 Waiting for tenant TLS / DNS propagation..." -TLS_DEADLINE=$(( $(date +%s) + 180 )) +TLS_DEADLINE=$(( $(date +%s) + 600 )) while true; do if curl -sSfk --max-time 5 "$TENANT_URL/health" >/dev/null 2>&1; then break fi if [ "$(date +%s)" -gt "$TLS_DEADLINE" ]; then - fail "Tenant URL never responded 2xx on /health within 3 min" + fail "Tenant URL never responded 2xx on /health within 10 min" fi sleep 5 done From fc2720c1fe57804c915965437c825179644a4462 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Sun, 26 Apr 2026 08:22:29 -0700 Subject: [PATCH 26/34] fix(git-token-helper): close TOCTOU window + stop swallowing chmod errors (closes #1552) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The token-cache helper had three #1552 findings, all in the mode-600-after-the-fact pattern: 1. _write_cache writes .tmp with default umask (typically 022 → 644 on disk) and then chmod 600's after the mv. A concurrent reader in that microsecond-wide window sees the token at mode 644. 2. Each chmod was swallowed via `|| true` — if it ever fails, the tokens stay world-readable with no operator signal. 3. _refresh_gh's gh_token_file write has the same shape and same two issues. Hardening: - Wrap the .tmp creates in a `umask 077` block so the files are 600 from creation. 
Restore the previous umask before return so callers aren't perturbed. - Replace `chmod ... 2>/dev/null || true` with `if ! chmod ...; then echo WARN ...; fi`. A chmod failure is a real signal worth grep'ing. - Apply the same pattern to the _refresh_gh gh_token_file path. `local` is illegal in a top-level case branch, so use a uniquely- named global (_gh_prev_umask) and unset it after. Verified `bash -n` clean and `shellcheck` clean. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../scripts/molecule-git-token-helper.sh | 44 +++++++++++++++++-- 1 file changed, 41 insertions(+), 3 deletions(-) diff --git a/workspace/scripts/molecule-git-token-helper.sh b/workspace/scripts/molecule-git-token-helper.sh index 0faab0fc..125d5109 100755 --- a/workspace/scripts/molecule-git-token-helper.sh +++ b/workspace/scripts/molecule-git-token-helper.sh @@ -110,18 +110,45 @@ _read_cache() { } # _write_cache — atomically persist token + expiry. +# +# Hardened per #1552: +# - umask 077 around the writes so .tmp files are 600 from creation, +# closing the TOCTOU window where a concurrent reader could read +# the token while it was still mode 644 (between the create-with- +# default-umask and the later chmod 600). +# - Don't swallow chmod errors with `|| true`. A chmod failure leaves +# tokens potentially world-readable; surface it as a WARN line so +# ops can grep `[molecule-git-token-helper] WARN` and see real +# permission failures instead of silent 644 files. _write_cache() { local token="$1" mkdir -p "${CACHE_DIR}" - chmod 700 "${CACHE_DIR}" 2>/dev/null || true + if ! chmod 700 "${CACHE_DIR}" 2>/dev/null; then + echo "[molecule-git-token-helper] WARN: failed to chmod 700 ${CACHE_DIR} — cache dir may be world-readable" >&2 + fi now=$(_now_epoch) expiry=$((now + TOKEN_CACHE_TTL_SEC)) + + # Restrictive umask so the .tmp files are 600 from creation. Restored + # before return so callers' umask isn't perturbed. + local prev_umask + prev_umask=$(umask) + umask 077 + # Write atomically via tmp + mv to avoid partial reads. printf '%s' "${token}" > "${CACHE_TOKEN_FILE}.tmp" printf '%s' "${expiry}" > "${CACHE_EXPIRY_FILE}.tmp" mv -f "${CACHE_TOKEN_FILE}.tmp" "${CACHE_TOKEN_FILE}" mv -f "${CACHE_EXPIRY_FILE}.tmp" "${CACHE_EXPIRY_FILE}" - chmod 600 "${CACHE_TOKEN_FILE}" "${CACHE_EXPIRY_FILE}" 2>/dev/null || true + + umask "${prev_umask}" + + # Belt-and-suspenders chmod — umask 077 should make the files 600 + # already, but a chmod that fails on the post-rename file is itself + # a real signal worth surfacing. + if ! chmod 600 "${CACHE_TOKEN_FILE}" "${CACHE_EXPIRY_FILE}" 2>/dev/null; then + echo "[molecule-git-token-helper] WARN: chmod 600 failed on cache files — token may be world-readable" >&2 + fi } # _fetch_token_from_api — hit the platform endpoint. @@ -230,10 +257,21 @@ case "${ACTION}" in echo "[molecule-git-token-helper] _refresh_gh: gh auth login failed (non-fatal)" >&2 } # Also update GH_TOKEN file for scripts that source it. + # Same #1552 hardening as _write_cache — umask 077 around the + # write so the .tmp file is 600 from creation, and surface a + # WARN on chmod failure instead of swallowing it. gh_token_file="${HOME}/.gh_token" + # `local` is illegal here (top-level case branch, not a + # function); shadow with a uniquely-named global instead. + _gh_prev_umask=$(umask) + umask 077 printf '%s' "${api_token}" > "${gh_token_file}.tmp" mv -f "${gh_token_file}.tmp" "${gh_token_file}" - chmod 600 "${gh_token_file}" 2>/dev/null || true + umask "${_gh_prev_umask}" + unset _gh_prev_umask + if ! 
chmod 600 "${gh_token_file}" 2>/dev/null; then + echo "[molecule-git-token-helper] WARN: chmod 600 failed on ${gh_token_file} — token may be world-readable" >&2 + fi echo "[molecule-git-token-helper] _refresh_gh: token refreshed successfully" >&2 ;; _invalidate_cache) From 7d48f24fef02312e8784d3470f7f4148ddc03fb7 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Sun, 26 Apr 2026 09:05:52 -0700 Subject: [PATCH 27/34] test(handlers): introduce events.EventEmitter interface (#1814 partial) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 3 skipped tests in workspace_provision_test.go (#1206 regression tests) were blocked because captureBroadcaster's struct-embed wouldn't type-check against WorkspaceHandler.broadcaster's concrete *events.Broadcaster field. This PR fixes the interface blocker for the 2 broadcaster-related tests; the 3rd (plugins.Registry resolver) is a separate blocker tracked elsewhere. Changes: - internal/events/broadcaster.go: define `EventEmitter` interface with RecordAndBroadcast + BroadcastOnly. *Broadcaster satisfies it via its existing methods (compile-time assertion guards future drift). SubscribeSSE / Subscribe stay off the interface because only sse.go + cmd/server/main.go call them, and both still hold the concrete *Broadcaster. - internal/handlers/workspace.go: WorkspaceHandler.broadcaster type changes from *events.Broadcaster to events.EventEmitter. NewWorkspaceHandler signature updated to match. Production callers unchanged — they pass *events.Broadcaster, which the interface accepts. - internal/handlers/activity.go: LogActivity takes events.EventEmitter for the same reason — tests passing a stub no longer need to construct the full broadcaster. - internal/handlers/workspace_provision_test.go: captureBroadcaster drops the struct embed (no more zero-value Broadcaster underlying the SSE+hub fields), implements RecordAndBroadcast directly, and adds a no-op BroadcastOnly to satisfy the interface. Skip messages on the 2 empty broadcaster-blocked tests updated to reflect the new "interface unblocked, test body still needed" state. Verified `go build ./...`, `go test ./internal/handlers/`, and `go vet ./...` all clean. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../internal/events/broadcaster.go | 23 +++++++++++++++++++ .../internal/handlers/activity.go | 4 +++- .../internal/handlers/workspace.go | 10 ++++++-- .../handlers/workspace_provision_test.go | 16 +++++++++---- 4 files changed, 45 insertions(+), 8 deletions(-) diff --git a/workspace-server/internal/events/broadcaster.go b/workspace-server/internal/events/broadcaster.go index 514d9781..53427010 100644 --- a/workspace-server/internal/events/broadcaster.go +++ b/workspace-server/internal/events/broadcaster.go @@ -15,6 +15,29 @@ import ( const broadcastChannel = "events:broadcast" +// EventEmitter is the contract handler code needs from a broadcaster. +// Defining it here lets tests substitute a capture-only stub instead +// of standing up the full Redis + WebSocket hub topology that the +// concrete *Broadcaster builds (and that previously blocked +// TestProvisionWorkspace_* regression tests on issue #1814). +// +// Includes BroadcastOnly because the activity-log + A2A-response paths +// inside the handler package fan out via that method — narrowing +// further would force production callers back to the concrete type. +// +// *Broadcaster satisfies this interface trivially. 
Production code that +// needs the wider surface (SubscribeSSE, Subscribe) keeps using the +// concrete *Broadcaster type — sse.go + cmd/server/main.go are the +// only such call sites today. +type EventEmitter interface { + RecordAndBroadcast(ctx context.Context, eventType string, workspaceID string, payload interface{}) error + BroadcastOnly(workspaceID string, eventType string, payload interface{}) +} + +// Compile-time assertion: a renamed/reshaped Broadcaster method that +// silently broke this interface would fail to build here. +var _ EventEmitter = (*Broadcaster)(nil) + // sseSubscription is a single in-process SSE subscriber. // deliverToSSE writes to ch; StreamEvents reads from it. type sseSubscription struct { diff --git a/workspace-server/internal/handlers/activity.go b/workspace-server/internal/handlers/activity.go index 4d98e9fa..a38603af 100644 --- a/workspace-server/internal/handlers/activity.go +++ b/workspace-server/internal/handlers/activity.go @@ -373,7 +373,9 @@ func (h *ActivityHandler) Report(c *gin.Context) { } // LogActivity inserts an activity log and optionally broadcasts via WebSocket. -func LogActivity(ctx context.Context, broadcaster *events.Broadcaster, params ActivityParams) { +// Takes events.EventEmitter (#1814) so callers passing a stub broadcaster +// in tests no longer need to construct the full *events.Broadcaster. +func LogActivity(ctx context.Context, broadcaster events.EventEmitter, params ActivityParams) { reqJSON, reqErr := json.Marshal(params.RequestBody) if reqErr != nil { log.Printf("LogActivity: failed to marshal request_body for %s: %v", params.WorkspaceID, reqErr) diff --git a/workspace-server/internal/handlers/workspace.go b/workspace-server/internal/handlers/workspace.go index 43d6e877..13364666 100644 --- a/workspace-server/internal/handlers/workspace.go +++ b/workspace-server/internal/handlers/workspace.go @@ -26,7 +26,13 @@ import ( ) type WorkspaceHandler struct { - broadcaster *events.Broadcaster + // broadcaster narrowed from `*events.Broadcaster` to the + // events.EventEmitter interface (#1814) so tests can substitute a + // capture-only stub without standing up the real Redis + WS-hub + // topology. Production callers still pass *events.Broadcaster, which + // satisfies the interface — see the compile-time assertion in + // internal/events/broadcaster.go. 
+ broadcaster events.EventEmitter provisioner *provisioner.Provisioner cpProv *provisioner.CPProvisioner platformURL string @@ -46,7 +52,7 @@ type WorkspaceHandler struct { provisionTimeouts runtimeProvisionTimeoutsCache } -func NewWorkspaceHandler(b *events.Broadcaster, p *provisioner.Provisioner, platformURL, configsDir string) *WorkspaceHandler { +func NewWorkspaceHandler(b events.EventEmitter, p *provisioner.Provisioner, platformURL, configsDir string) *WorkspaceHandler { return &WorkspaceHandler{ broadcaster: b, provisioner: p, diff --git a/workspace-server/internal/handlers/workspace_provision_test.go b/workspace-server/internal/handlers/workspace_provision_test.go index 65960071..5297fd15 100644 --- a/workspace-server/internal/handlers/workspace_provision_test.go +++ b/workspace-server/internal/handlers/workspace_provision_test.go @@ -9,7 +9,6 @@ import ( "testing" "github.com/DATA-DOG/go-sqlmock" - "github.com/Molecule-AI/molecule-monorepo/platform/internal/events" "github.com/Molecule-AI/molecule-monorepo/platform/internal/models" "github.com/Molecule-AI/molecule-monorepo/platform/internal/plugins" "github.com/Molecule-AI/molecule-monorepo/platform/internal/provisioner" @@ -1044,13 +1043,20 @@ var errInternalDB = fmt.Errorf("pq: connection refused") var errInternalOS = fmt.Errorf("operation failed: no such file or directory") // captureBroadcaster is a test broadcaster that captures the last data -// payload passed to RecordAndBroadcast so tests can inspect it. +// payload passed to RecordAndBroadcast so tests can inspect it. Now +// satisfies events.EventEmitter (#1814) directly — RecordAndBroadcast +// captures, BroadcastOnly is a no-op since none of the +// WorkspaceHandler paths under test call it. type captureBroadcaster struct { - events.Broadcaster // embed to satisfy the interface — only RecordAndBroadcast is overridden lastData map[string]interface{} lastErr error } +// BroadcastOnly is required to satisfy events.EventEmitter. None of the +// captureBroadcaster's exercising tests should land here — if a future +// test does, it'll need to add capture state for that channel. +func (c *captureBroadcaster) BroadcastOnly(_ string, _ string, _ interface{}) {} + func (c *captureBroadcaster) RecordAndBroadcast(_ context.Context, _, _ string, data interface{}) error { if m, ok := data.(map[string]interface{}); ok { // Shallow-copy so the caller can't mutate our capture. @@ -1107,14 +1113,14 @@ func containsUnsafeString(v interface{}) bool { // never leaks internal error details in WORKSPACE_PROVISION_FAILED broadcasts. // Regression test for issue #1206. func TestProvisionWorkspace_NoInternalErrorsInBroadcast(t *testing.T) { - t.Skip("TODO: captureBroadcaster type mismatch with WorkspaceHandler.broadcaster (*events.Broadcaster). Needs broadcaster interface refactor — currently blocking package compile on main (2026-04-21).") + t.Skip("TODO: write the test body. The interface blocker (#1814) is fixed — captureBroadcaster now satisfies events.EventEmitter and can be passed to NewWorkspaceHandler. The remaining work is constructing the provisionWorkspace failure path + asserting captured payload doesn't contain unsafeErrorStrings.") } // TestProvisionWorkspaceCP_NoInternalErrorsInBroadcast asserts that // provisionWorkspaceCP never leaks err.Error() in WORKSPACE_PROVISION_FAILED // broadcasts. Regression test for issue #1206. 
func TestProvisionWorkspaceCP_NoInternalErrorsInBroadcast(t *testing.T) { - t.Skip("TODO: captureBroadcaster type mismatch with WorkspaceHandler.broadcaster (*events.Broadcaster). Needs broadcaster interface refactor — currently blocking package compile on main (2026-04-21).") + t.Skip("TODO: write the test body. Same status as TestProvisionWorkspace_NoInternalErrorsInBroadcast — interface blocker fixed (#1814), need to construct the provisionWorkspaceCP failure path + assert payload sanitization.") } // mockEnvMutator is a provisionhook.Registry stub that always returns a fixed error. From 1ae051ec95968dd141e046e5a15b232a1ca37377 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Sun, 26 Apr 2026 10:03:46 -0700 Subject: [PATCH 28/34] test(e2e): add 'Invalid API key' regression assertion to staging A2A check (#1900) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The staging E2E suite already grep's for 5 known regression patterns in the A2A response (hermes-agent 401, model_not_found, Encrypted content, Unknown provider, hermes-agent unreachable). The comment block at lines 386-395 lists "Invalid API key" as the signal for the CP #238 boot-event 401 race + stale OPENAI_API_KEY paths, but the explicit grep was never added — meaning a regression in that class would slip through the generic `error|exception` catch-all. Closes the gap with one specific-pattern check that fails loud with the relevant bug references in the message. Verified `bash -n` clean; pre-existing shellcheck SC2015 at line 88 is unrelated. Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/e2e/test_staging_full_saas.sh | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/e2e/test_staging_full_saas.sh b/tests/e2e/test_staging_full_saas.sh index e498ed46..ce3be39b 100755 --- a/tests/e2e/test_staging_full_saas.sh +++ b/tests/e2e/test_staging_full_saas.sh @@ -410,6 +410,13 @@ fi if echo "$AGENT_TEXT" | grep -qF "Unknown provider"; then fail "A2A — REGRESSION: install.sh set PROVIDER to a value not in hermes's registry. Run 'hermes doctor' on the workspace to see valid values. Raw: $AGENT_TEXT" fi +# "Invalid API key" — the comment block lists this as a CP #238 race +# (tenant auth chain) signal but the grep was missing. Caller-side +# 401's containing this exact phrase don't match the generic +# "error|exception" catch-all below, so they'd slip through. +if echo "$AGENT_TEXT" | grep -qF "Invalid API key"; then + fail "A2A — REGRESSION: tenant auth chain returned 'Invalid API key'. Likely CP boot-event 401 race (CP #238) or stale OPENAI_API_KEY in the runtime env. Raw: $AGENT_TEXT" +fi # Generic catch-all — falls through if none of the known regressions hit. if echo "$AGENT_TEXT" | grep -qiE "error|exception"; then fail "A2A returned an error-shaped response: $AGENT_TEXT" From d97d7d4768460b8b22978e47d61e2820f0b425ff Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Sun, 26 Apr 2026 10:14:19 -0700 Subject: [PATCH 29/34] fix(platform/delegation): classify queued response + stitch drain result back MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When proxyA2A returns 202+{queued:true} (target busy → enqueued for drain on next heartbeat), executeDelegation previously treated it as a successful completion and ran extractResponseText on the queued JSON. The result was 'Delegation completed (workspace agent busy — request queued, will dispatch...)' landing in activity_logs.summary, which the LLM then echoed to the user chat as garbage. 
Two fixes: 1. delegation.go: detect queued shape via new isQueuedProxyResponse helper, write status='queued' with clean summary 'Delegation queued — target at capacity', store delegation_id in response_body so the drain can stitch back later. Also embed delegation_id in params.message.metadata + use it as messageId so the proxy's idempotency-key path keys off the same id. 2. a2a_queue.go: when DrainQueueForWorkspace successfully drains a queued item, extract delegation_id from the body's metadata and UPDATE the originating delegate_result row (queued → completed with real response_body). Broadcast DELEGATION_COMPLETE so the canvas chat feed flips the queued line to completed in real time. Closes the loop so check_task_status reflects ground truth instead of perpetual 'queued' even after the queued request eventually drained. 🤖 Generated with [Claude Code](https://claude.com/claude-code) --- .../internal/handlers/a2a_queue.go | 87 ++++++++++++++++++- .../internal/handlers/a2a_queue_test.go | 33 +++++++ .../internal/handlers/delegation.go | 63 +++++++++++++- .../internal/handlers/delegation_test.go | 33 +++++++ 4 files changed, 212 insertions(+), 4 deletions(-) diff --git a/workspace-server/internal/handlers/a2a_queue.go b/workspace-server/internal/handlers/a2a_queue.go index b747fac4..fb0e3b80 100644 --- a/workspace-server/internal/handlers/a2a_queue.go +++ b/workspace-server/internal/handlers/a2a_queue.go @@ -288,7 +288,7 @@ func (h *WorkspaceHandler) DrainQueueForWorkspace(ctx context.Context, workspace } // logActivity=false: the original EnqueueA2A callsite already logged // the dispatch attempt; re-logging here would double-count events. - status, _, proxyErr := h.proxyA2ARequest(ctx, workspaceID, item.Body, callerID, false) + status, respBody, proxyErr := h.proxyA2ARequest(ctx, workspaceID, item.Body, callerID, false) // 202 Accepted = the dispatch was itself queued again (target still busy). // That's not a failure — the queued item just stays queued naturally on @@ -321,4 +321,89 @@ func (h *WorkspaceHandler) DrainQueueForWorkspace(ctx context.Context, workspace MarkQueueItemCompleted(ctx, item.ID) log.Printf("A2AQueue drain: dispatched %s to workspace %s (attempt=%d)", item.ID, workspaceID, item.Attempts) + + // Stitch the response back to the originating delegation row, if this + // queue item was a delegation. Without this, check_task_status would + // see status='queued' (set by the executeDelegation queued-branch) and + // the LLM would think the work was never done. We embed delegation_id + // in params.message.metadata at Delegate-handler time; pull it out + // here and UPDATE the delegate_result row so the original caller can + // observe the real reply. + if delegationID := extractDelegationIDFromBody(item.Body); delegationID != "" { + h.stitchDrainResponseToDelegation(ctx, callerID, item.WorkspaceID, delegationID, respBody) + } +} + +// extractDelegationIDFromBody pulls params.message.metadata.delegation_id +// out of an A2A JSON-RPC body. Empty string when absent — drain treats +// that as "this queue item didn't originate from /workspaces/:id/delegate" +// and skips the stitch, so non-delegation queue uses (cross-workspace +// peer-direct A2A) aren't affected. 
+func extractDelegationIDFromBody(body []byte) string { + var envelope struct { + Params struct { + Message struct { + Metadata struct { + DelegationID string `json:"delegation_id"` + } `json:"metadata"` + } `json:"message"` + } `json:"params"` + } + if err := json.Unmarshal(body, &envelope); err != nil { + return "" + } + return envelope.Params.Message.Metadata.DelegationID +} + +// stitchDrainResponseToDelegation writes the drained response into the +// delegation's existing delegate_result row (created with status='queued' +// by executeDelegation when the proxy first returned queued). This is the +// other half of the loop that closes "queued → completed" so the LLM's +// check_task_status reflects ground truth. +// +// Errors are logged-only — drain is fire-and-forget from Heartbeat, and a +// stitch failure shouldn't block other queued items. The delegation will +// just remain stuck at 'queued' in this case, which is the pre-fix +// behaviour (no regression vs. shipping nothing). +func (h *WorkspaceHandler) stitchDrainResponseToDelegation(ctx context.Context, sourceID, targetID, delegationID string, respBody []byte) { + if sourceID == "" || delegationID == "" { + return + } + responseText := extractResponseText(respBody) + respJSON, _ := json.Marshal(map[string]interface{}{ + "text": responseText, + "delegation_id": delegationID, + }) + res, err := db.DB.ExecContext(ctx, ` + UPDATE activity_logs + SET status = 'completed', + summary = $1, + response_body = $2::jsonb + WHERE workspace_id = $3 + AND method = 'delegate_result' + AND target_id = $4 + AND response_body->>'delegation_id' = $5 + `, "Delegation completed ("+truncate(responseText, 80)+")", string(respJSON), + sourceID, targetID, delegationID) + if err != nil { + log.Printf("A2AQueue drain stitch: update failed for delegation %s: %v", delegationID, err) + return + } + if rows, _ := res.RowsAffected(); rows == 0 { + log.Printf("A2AQueue drain stitch: no delegate_result row for delegation %s (queued-row may not exist yet)", delegationID) + return + } + log.Printf("A2AQueue drain stitch: delegation %s queued → completed (%d chars)", delegationID, len(responseText)) + + // Broadcast DELEGATION_COMPLETE so the canvas chat feed flips the + // "⏸ queued" line to "✓ completed" in real time. Without this the + // transition only surfaces after the user reloads or polls activity. 
+ if h.broadcaster != nil { + h.broadcaster.RecordAndBroadcast(ctx, "DELEGATION_COMPLETE", sourceID, map[string]interface{}{ + "delegation_id": delegationID, + "target_id": targetID, + "response_preview": truncate(responseText, 200), + "via": "queue_drain", + }) + } } diff --git a/workspace-server/internal/handlers/a2a_queue_test.go b/workspace-server/internal/handlers/a2a_queue_test.go index 940f7f2f..57000910 100644 --- a/workspace-server/internal/handlers/a2a_queue_test.go +++ b/workspace-server/internal/handlers/a2a_queue_test.go @@ -80,6 +80,39 @@ func TestExtractIdempotencyKey_emptyOnMissing(t *testing.T) { } } +func TestExtractDelegationIDFromBody(t *testing.T) { + cases := []struct { + name string + body string + want string + }{ + { + name: "delegation body — metadata.delegation_id present", + body: `{"method":"message/send","params":{"message":{"role":"user","messageId":"abc-123","metadata":{"delegation_id":"abc-123"},"parts":[{"type":"text","text":"hi"}]}}}`, + want: "abc-123", + }, + { + name: "non-delegation body — no metadata (peer-direct A2A)", + body: `{"method":"message/send","params":{"message":{"role":"user","messageId":"m-1","parts":[{"type":"text","text":"hi"}]}}}`, + want: "", + }, + { + name: "metadata present but no delegation_id key", + body: `{"params":{"message":{"metadata":{"trace_id":"t-1"}}}}`, + want: "", + }, + {"malformed JSON", `not json`, ""}, + {"empty body", ``, ""}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + if got := extractDelegationIDFromBody([]byte(tc.body)); got != tc.want { + t.Errorf("extractDelegationIDFromBody = %q, want %q", got, tc.want) + } + }) + } +} + // ────────────────────────────────────────────────────────────────────────────── // DrainQueueForWorkspace — nil-safe error extraction regression tests // diff --git a/workspace-server/internal/handlers/delegation.go b/workspace-server/internal/handlers/delegation.go index 8c0d681f..59da198c 100644 --- a/workspace-server/internal/handlers/delegation.go +++ b/workspace-server/internal/handlers/delegation.go @@ -78,13 +78,21 @@ func (h *DelegationHandler) Delegate(c *gin.Context) { // reason (logged); we still dispatch the A2A request and surface the // warning in the response. - // Build A2A payload + // Build A2A payload. Embedding delegation_id in metadata gives the + // queue drain path a way to look up the originating delegation row + // when stitching the response back (issue: previously the drain + // dispatched successfully but discarded the response, so + // check_task_status returned status='queued' forever even after a + // real reply landed). messageId mirrors delegation_id so the + // platform's idempotency-key extraction also keys off the same id. a2aBody, _ := json.Marshal(map[string]interface{}{ "method": "message/send", "params": map[string]interface{}{ "message": map[string]interface{}{ - "role": "user", - "parts": []map[string]interface{}{{"type": "text", "text": body.Task}}, + "role": "user", + "messageId": delegationID, + "parts": []map[string]interface{}{{"type": "text", "text": body.Task}}, + "metadata": map[string]interface{}{"delegation_id": delegationID}, }, }, }) @@ -284,6 +292,40 @@ func (h *DelegationHandler) executeDelegation(sourceID, targetID, delegationID s return } + // 202 + {queued: true} means the target was busy and the proxy + // enqueued the request for the next drain tick — NOT a completion. 
+ // Treat it as such: write a clean 'queued' activity row with no + // JSON-as-text leakage into the summary, broadcast a status update, + // and return. The eventual drain doesn't (yet) feed a result back + // into this delegation, so callers polling check_task_status will + // see status='queued' and know to retry instead of believing the + // queued JSON is the agent's reply. Fixes the chat-leak where the + // LLM echoed "Delegation completed (workspace agent busy ...)" to + // the user. + if status == http.StatusAccepted && isQueuedProxyResponse(respBody) { + log.Printf("Delegation %s: target %s busy — queued for drain", delegationID, targetID) + h.updateDelegationStatus(sourceID, delegationID, "queued", "") + // Store delegation_id in response_body so DrainQueueForWorkspace's + // stitch step can find this row by JSON-path key after the queued + // dispatch eventually succeeds. Without the key, the drain finds + // the row by (workspace_id, target_id, method) but can't tell + // multiple-queued-delegations-to-same-target apart. + queuedJSON, _ := json.Marshal(map[string]interface{}{ + "delegation_id": delegationID, + "queued": true, + }) + if _, err := db.DB.ExecContext(ctx, ` + INSERT INTO activity_logs (workspace_id, activity_type, method, source_id, target_id, summary, response_body, status) + VALUES ($1, 'delegation', 'delegate_result', $2, $3, $4, $5::jsonb, 'queued') + `, sourceID, sourceID, targetID, "Delegation queued — target at capacity", string(queuedJSON)); err != nil { + log.Printf("Delegation %s: failed to insert queued log: %v", delegationID, err) + } + h.broadcaster.RecordAndBroadcast(ctx, "DELEGATION_STATUS", sourceID, map[string]interface{}{ + "delegation_id": delegationID, "target_id": targetID, "status": "queued", + }) + return + } + // A2A returned 200 — target received and processed the task // Status: dispatched → received → completed (we don't have a separate "received" signal from the target yet) responseText := extractResponseText(respBody) @@ -517,6 +559,21 @@ func isTransientProxyError(err *proxyA2AError) bool { return false } +// isQueuedProxyResponse reports whether the proxy returned a body shaped like +// `{"queued": true, "queue_id": ..., "queue_depth": ..., "message": ...}` — +// the busy-target enqueue path in a2a_proxy_helpers.go. Caller checks this +// alongside HTTP 202 to distinguish a successful agent reply from a deferred +// dispatch; without the distinction we'd write the queued-message JSON into +// the delegation result row and the LLM would surface it as agent output. +func isQueuedProxyResponse(body []byte) bool { + var resp map[string]interface{} + if json.Unmarshal(body, &resp) != nil { + return false + } + queued, _ := resp["queued"].(bool) + return queued +} + func extractResponseText(body []byte) string { var resp map[string]interface{} if json.Unmarshal(body, &resp) != nil { diff --git a/workspace-server/internal/handlers/delegation_test.go b/workspace-server/internal/handlers/delegation_test.go index caa5118d..21cc3a90 100644 --- a/workspace-server/internal/handlers/delegation_test.go +++ b/workspace-server/internal/handlers/delegation_test.go @@ -376,6 +376,39 @@ func TestIsTransientProxyError_RetriesOnRestartRaceStatuses(t *testing.T) { } } +func TestIsQueuedProxyResponse(t *testing.T) { + // Regression guard for the chat-leak bug: when the proxy returns + // 202 with a queued-shape body, executeDelegation must classify it + // as "queued" — not "completed". 
Mis-classifying it causes the + // queued JSON to land in activity_logs.summary, which the LLM then + // echoes verbatim into the agent chat as + // "Delegation completed: Delegation completed (workspace agent + // busy — request queued, will dispatch...)". + cases := []struct { + name string + body string + want bool + }{ + { + name: "real proxy busy-enqueue body", + body: `{"queued":true,"queue_id":"d0993390-5f5a-4f5d-90a2-66639e53e3c9","queue_depth":1,"message":"workspace agent busy — request queued, will dispatch when capacity available"}`, + want: true, + }, + {"queued false explicitly", `{"queued":false}`, false}, + {"queued field absent (real A2A reply)", `{"jsonrpc":"2.0","id":"1","result":{"kind":"message","parts":[{"kind":"text","text":"hi"}]}}`, false}, + {"non-bool queued value (defensive)", `{"queued":"true"}`, false}, + {"malformed JSON", `not-json`, false}, + {"empty body", ``, false}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + if got := isQueuedProxyResponse([]byte(tc.body)); got != tc.want { + t.Errorf("isQueuedProxyResponse(%q) = %v, want %v", tc.body, got, tc.want) + } + }) + } +} + func TestDelegationRetryDelay_IsSaneWindow(t *testing.T) { // Regression guard: the retry delay must be long enough for the // reactive URL refresh in proxyA2ARequest to kick in (which involves From 7ed50824b6442a9da585ae013c60fc0e23b15e7b Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Sun, 26 Apr 2026 10:14:47 -0700 Subject: [PATCH 30/34] fix(platform/ssrf): allow RFC-1918 in MOLECULE_ENV=development MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The docker-compose dev pattern puts platform and workspace containers on the same docker bridge network (172.18.0.0/16, RFC-1918). The runtime registers via its docker-internal hostname which DNS-resolves to a 172.18.x.x IP. The SSRF defence's isPrivateOrMetadataIP rejected those, so every workspace POST through the platform proxy returned 'workspace URL is not publicly routable' — breaking the entire docker- compose dev loop. Fix: in isPrivateOrMetadataIP, treat MOLECULE_ENV=development the same as SaaS mode for RFC-1918 relaxation. Both share the 'trusted intra- network routing' property — SaaS is sibling EC2s in the same VPC, dev is sibling containers on the same docker bridge. Always-blocked categories (metadata link-local, TEST-NET, CGNAT) stay blocked. 🤖 Generated with [Claude Code](https://claude.com/claude-code) --- workspace-server/internal/handlers/ssrf.go | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/workspace-server/internal/handlers/ssrf.go b/workspace-server/internal/handlers/ssrf.go index a84426f1..2e795e90 100644 --- a/workspace-server/internal/handlers/ssrf.go +++ b/workspace-server/internal/handlers/ssrf.go @@ -127,7 +127,16 @@ var testAllowLoopback = false // container deployments the relaxation is off and every private range // stays blocked. func isPrivateOrMetadataIP(ip net.IP) bool { - saas := saasMode() + // MOLECULE_ENV=development is the dev-host pattern: platform and + // workspace containers share a docker bridge network (172.18.0.0/16, + // RFC-1918). Treat that the same as SaaS for private-range relaxation + // — both share the "trusted intra-network routing" property. Without + // this, every workspace registration via docker-internal hostname + // resolves to 172.18.x.x and gets rejected as + // "workspace URL is not publicly routable", breaking the entire + // docker-compose dev loop. 
Always-blocked categories (metadata link- + // local, TEST-NET, CGNAT) remain blocked regardless. + saas := saasMode() || devModeAllowsLoopback() // IPv4 path. if ip4 := ip.To4(); ip4 != nil { From 09972486e8ef3f044428c5b2112ea3c9fc3ddaad Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Sun, 26 Apr 2026 10:14:47 -0700 Subject: [PATCH 31/34] fix(platform/notify): persist agent send_message_to_user pushes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pre-fix, POST /workspaces/:id/notify (the side-channel agents use to push interim updates and follow-up results) only broadcast via WebSocket — no DB write. When the user refreshed the page, the chat-history loader (which queries activity_logs) couldn't restore those messages and they vanished from the chat. Hits the most common path: when the platform's POST /a2a times out (idle), the runtime keeps working and eventually pushes its reply via send_message_to_user. The reply rendered live but disappeared on reload. Fix: also INSERT an activity_logs row with shape the existing loader already understands (type=a2a_receive, source_id=NULL, response_body= {result: text}). Persistence is best-effort — a DB hiccup doesn't block the WebSocket push (which the user is already seeing). 🤖 Generated with [Claude Code](https://claude.com/claude-code) --- .../internal/handlers/activity.go | 31 +++++++ .../internal/handlers/activity_test.go | 80 +++++++++++++++++++ 2 files changed, 111 insertions(+) diff --git a/workspace-server/internal/handlers/activity.go b/workspace-server/internal/handlers/activity.go index a38603af..ba6d9f0f 100644 --- a/workspace-server/internal/handlers/activity.go +++ b/workspace-server/internal/handlers/activity.go @@ -286,6 +286,37 @@ func (h *ActivityHandler) Notify(c *gin.Context) { "name": wsName, }) + // Persist to activity_logs so the chat history loader restores this + // message after a page reload. Pre-fix, send_message_to_user pushes + // were broadcast-only — survived the WebSocket session but vanished + // when the user refreshed because nothing wrote them to the DB. + // + // Shape chosen to match the existing loader query + // (`type=a2a_receive&source=canvas`): + // - activity_type='a2a_receive' so it joins the same query path + // - source_id=NULL so the canvas-source filter accepts it + // - method='notify' to distinguish from real A2A receives in audits + // - request_body=NULL so the loader doesn't append a duplicate + // "user message" bubble for it + // - response_body={"result": ""} matches extractResponseText's + // simplest branch ({result: string} → take verbatim) + // + // Errors are logged-only — broadcast already succeeded, the user + // sees the message; persistence failure just means the message + // won't survive reload (pre-fix behavior). Don't fail the whole + // notify on a DB hiccup. 
+ respJSON, _ := json.Marshal(map[string]interface{}{"result": body.Message}) + preview := body.Message + if len(preview) > 80 { + preview = preview[:80] + "…" + } + if _, err := db.DB.ExecContext(c.Request.Context(), ` + INSERT INTO activity_logs (workspace_id, activity_type, method, summary, response_body, status) + VALUES ($1, 'a2a_receive', 'notify', $2, $3::jsonb, 'ok') + `, workspaceID, "Agent message: "+preview, string(respJSON)); err != nil { + log.Printf("Notify: failed to persist message for %s: %v", workspaceID, err) + } + c.JSON(http.StatusOK, gin.H{"status": "sent"}) } diff --git a/workspace-server/internal/handlers/activity_test.go b/workspace-server/internal/handlers/activity_test.go index 1780be3b..9cba5873 100644 --- a/workspace-server/internal/handlers/activity_test.go +++ b/workspace-server/internal/handlers/activity_test.go @@ -217,6 +217,86 @@ func TestActivityReport_RejectsUnknownType(t *testing.T) { } } +func TestNotify_PersistsToActivityLogsForReloadRecovery(t *testing.T) { + // Regression guard for the "responses gone on reload" bug. send_message_to_user + // pushes (which route through Notify) used to be broadcast-only — they + // rendered in the canvas but vanished on page reload because nothing + // wrote them to activity_logs. The chat history loader queries + // `type=a2a_receive&source=canvas`, so the persisted row must: + // - Use activity_type='a2a_receive' (loader's filter) + // - Have source_id NULL (canvas-source filter) + // - Carry the message text in response_body so extractResponseText + // can reconstruct the agent reply on reload + mockDB, mock, _ := sqlmock.New() + defer mockDB.Close() + db.DB = mockDB + + // Workspace existence check + mock.ExpectQuery(`SELECT name FROM workspaces`). + WithArgs("ws-notify"). + WillReturnRows(sqlmock.NewRows([]string{"name"}).AddRow("DD")) + + // Persistence INSERT — verify shape + mock.ExpectExec(`INSERT INTO activity_logs`). + WithArgs( + "ws-notify", + sqlmock.AnyArg(), // summary + sqlmock.AnyArg(), // response_body JSON + ). + WillReturnResult(sqlmock.NewResult(1, 1)) + + broadcaster := newTestBroadcaster() + handler := NewActivityHandler(broadcaster) + + gin.SetMode(gin.TestMode) + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Params = gin.Params{{Key: "id", Value: "ws-notify"}} + body := `{"message":"agent reply that arrived after the sync POST timed out"}` + c.Request = httptest.NewRequest("POST", "/workspaces/ws-notify/notify", strings.NewReader(body)) + c.Request.Header.Set("Content-Type", "application/json") + handler.Notify(c) + + if w.Code != http.StatusOK { + t.Errorf("expected 200, got %d: %s", w.Code, w.Body.String()) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("DB expectations not met: %v", err) + } +} + +func TestNotify_DBFailure_StillBroadcastsAnd200(t *testing.T) { + // Persistence is best-effort — a DB hiccup must NOT block the + // WebSocket push (which the user is already seeing in their open + // canvas). Pre-fix the WS push always succeeded; we don't want + // the new persistence step to regress that path. + mockDB, mock, _ := sqlmock.New() + defer mockDB.Close() + db.DB = mockDB + + mock.ExpectQuery(`SELECT name FROM workspaces`). + WithArgs("ws-x"). + WillReturnRows(sqlmock.NewRows([]string{"name"}).AddRow("DD")) + mock.ExpectExec(`INSERT INTO activity_logs`). 
+ WillReturnError(fmt.Errorf("simulated db hiccup")) + + broadcaster := newTestBroadcaster() + handler := NewActivityHandler(broadcaster) + + gin.SetMode(gin.TestMode) + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Params = gin.Params{{Key: "id", Value: "ws-x"}} + body := `{"message":"hi"}` + c.Request = httptest.NewRequest("POST", "/workspaces/ws-x/notify", strings.NewReader(body)) + c.Request.Header.Set("Content-Type", "application/json") + handler.Notify(c) + + if w.Code != http.StatusOK { + t.Errorf("DB failure must not break the response; got %d", w.Code) + } +} + // ==================== Direct unit tests for SessionSearch helpers ==================== // --- parseSessionSearchParams --- From 50decfd326dd4988d016aa12ed47a448b1022dd1 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Sun, 26 Apr 2026 10:14:47 -0700 Subject: [PATCH 32/34] chore(compose): wire MOLECULE_ENV, GHCR_USER/TOKEN, MOLECULE_IMAGE_PLATFORM MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three env vars the platform now reads: - MOLECULE_ENV=development (default) — activates the WorkspaceAuth / AdminAuth dev fail-open path so the canvas's bearer-less requests pass through. Also unlocks RFC-1918 relaxation in the SSRF guard so docker- bridge IPs work. Override to 'production' for staged deploys. - GHCR_USER + GHCR_TOKEN — feed POST /admin/workspace-images/refresh's ImagePull auth payload. Both empty → endpoint can pull cached/public images only. Set with a fine-grained PAT (read:packages on Molecule-AI org) to pull private GHCR images. - MOLECULE_IMAGE_PLATFORM=linux/amd64 (default) — workspace-template-* images ship single-arch amd64. On Apple Silicon hosts, the daemon's native linux/arm64/v8 request misses the manifest and pulls fail. Forcing amd64 makes Docker Desktop run them under Rosetta — slower (~2-3×) but functional. 🤖 Generated with [Claude Code](https://claude.com/claude-code) --- docker-compose.yml | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/docker-compose.yml b/docker-compose.yml index c9c88d7c..2be0d3f6 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -126,6 +126,13 @@ services: REDIS_URL: redis://redis:6379 PORT: "${PLATFORM_PORT:-8080}" PLATFORM_URL: "http://platform:${PLATFORM_PORT:-8080}" + # Default MOLECULE_ENV=development so the WorkspaceAuth / AdminAuth + # middleware fail-open path activates when ADMIN_TOKEN is unset — + # otherwise the canvas (which runs without a bearer in pure local + # dev) gets 401 "missing workspace auth token" on every request. + # Override to "production" for SaaS/staged deploys; in those modes + # ADMIN_TOKEN must also be set or every request rejects. + MOLECULE_ENV: "${MOLECULE_ENV:-development}" CORS_ORIGINS: ${CORS_ORIGINS:-http://localhost:${CANVAS_PUBLISH_PORT:-3000},http://127.0.0.1:${CANVAS_PUBLISH_PORT:-3000},http://localhost:3001} RATE_LIMIT: "${RATE_LIMIT:-1000}" CONFIGS_DIR: /configs @@ -153,6 +160,24 @@ services: HIBERNATION_IDLE_MINUTES: "${HIBERNATION_IDLE_MINUTES:-}" # Plugin supply chain hardening (issue #768 / PR #775). Never set in production. PLUGIN_ALLOW_UNPINNED: "${PLUGIN_ALLOW_UNPINNED:-}" + # Force ImagePull/ContainerCreate to request linux/amd64 manifests + # for the workspace-template-* images. The templates ship single-arch + # amd64 today; without this override, an arm64 host (Apple Silicon) + # asks the daemon for linux/arm64/v8, which doesn't match the manifest + # and the pull fails with "no matching manifest". 
Apple Silicon will + # run the amd64 image under Rosetta — slower (~2-3×) but functional. + # Override to "" or another platform when the templates start shipping + # multi-arch (then this hardcoded amd64 becomes unnecessary). + MOLECULE_IMAGE_PLATFORM: "${MOLECULE_IMAGE_PLATFORM:-linux/amd64}" + # GHCR auth for the workspace-images refresh endpoint + # (POST /admin/workspace-images/refresh). When set, the platform's + # Docker SDK ImagePull on private workspace-template-* images + # succeeds without per-host `docker login`. GHCR_USER is the GitHub + # username; GHCR_TOKEN is a fine-grained PAT with `read:packages` + # on the Molecule-AI org. Both unset → endpoint can only pull + # public images (current state for all 8 templates). + GHCR_USER: "${GHCR_USER:-}" + GHCR_TOKEN: "${GHCR_TOKEN:-}" volumes: - ./workspace-configs-templates:/configs - ./org-templates:/org-templates:ro From 0de67cd3794f455a49de2d3f0076eaeec8da2e84 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Sun, 26 Apr 2026 10:17:21 -0700 Subject: [PATCH 33/34] feat(platform/admin): /admin/workspace-images/refresh + Docker SDK + GHCR auth MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The production-side end of the runtime CD chain. Operators (or the post- publish CI workflow) hit this after a runtime release to pull the latest workspace-template-* images from GHCR and recreate any running ws-* containers so they adopt the new image. Without this, freshly-published runtime sat in the registry but containers kept the old image until naturally cycled. Implementation notes: - Uses Docker SDK ImagePull rather than shelling out to docker CLI — the alpine platform container has no docker CLI installed. - ghcrAuthHeader() reads GHCR_USER + GHCR_TOKEN env, builds the base64- encoded JSON payload Docker engine expects in PullOptions.RegistryAuth. Both empty → public/cached images only; both set → private GHCR pulls. - Container matching uses ContainerInspect (NOT ContainerList) because ContainerList returns the resolved digest in .Image, not the human tag. Inspect surfaces .Config.Image which is what we need. - Provisioner.DefaultImagePlatform() exported so admin handler picks the same Apple-Silicon-needs-amd64 platform as the provisioner — single source of truth for the multi-arch override. Local-dev companion: scripts/refresh-workspace-images.sh runs on the host and inherits the host's docker keychain auth — alternate path for when GHCR_USER/TOKEN aren't set in the platform env. 
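A minimal local smoke test of the new endpoint, for reviewers (a sketch, not part of the patch — it assumes the compose defaults from the previous commit: platform reachable on localhost:8080, MOLECULE_ENV=development so AdminAuth fails open without a bearer, and placeholder GHCR credentials):

    # Optional — only needed for private GHCR images (set in .env):
    #   GHCR_USER=<github-username>
    #   GHCR_TOKEN=<fine-grained PAT with read:packages>
    docker compose up -d platform

    # Pull all 8 template images and recreate running ws-* containers:
    curl -sS -X POST "http://localhost:8080/admin/workspace-images/refresh"

    # Pull a single runtime and leave containers untouched:
    curl -sS -X POST "http://localhost:8080/admin/workspace-images/refresh?runtime=claude-code&recreate=false"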
🤖 Generated with [Claude Code](https://claude.com/claude-code) --- .github/workflows/auto-tag-runtime.yml | 113 +++++++ .github/workflows/publish-runtime.yml | 161 ++++++++++ docs/workspace-runtime-package.md | 186 +++++++++-- scripts/build_runtime_package.py | 298 ++++++++++++++++++ scripts/refresh-workspace-images.sh | 95 ++++++ .../handlers/admin_workspace_images.go | 227 +++++++++++++ .../handlers/admin_workspace_images_test.go | 73 +++++ .../internal/provisioner/provisioner.go | 7 + workspace-server/internal/router/router.go | 11 + 9 files changed, 1143 insertions(+), 28 deletions(-) create mode 100644 .github/workflows/auto-tag-runtime.yml create mode 100644 .github/workflows/publish-runtime.yml create mode 100755 scripts/build_runtime_package.py create mode 100755 scripts/refresh-workspace-images.sh create mode 100644 workspace-server/internal/handlers/admin_workspace_images.go create mode 100644 workspace-server/internal/handlers/admin_workspace_images_test.go diff --git a/.github/workflows/auto-tag-runtime.yml b/.github/workflows/auto-tag-runtime.yml new file mode 100644 index 00000000..2b9070bc --- /dev/null +++ b/.github/workflows/auto-tag-runtime.yml @@ -0,0 +1,113 @@ +name: auto-tag-runtime + +# Auto-tag runtime releases on every merge to main that touches workspace/. +# This is the entry point of the runtime CD chain: +# +# merge PR → auto-tag-runtime (this) → publish-runtime → cascade → template +# image rebuilds → repull on hosts. +# +# Default bump is patch. Override via PR label `release:minor` or +# `release:major` BEFORE merging — the label is read off the merged PR +# associated with the push commit. +# +# Skips when: +# - The push isn't to main (other branches don't auto-release). +# - The merge commit message contains `[skip-release]` (escape hatch +# for cleanup PRs that touch workspace/ but shouldn't ship). + +on: + push: + branches: [main] + paths: + - "workspace/**" + - "scripts/build_runtime_package.py" + - ".github/workflows/auto-tag-runtime.yml" + - ".github/workflows/publish-runtime.yml" + +permissions: + contents: write # to push the new tag + pull-requests: read # to read labels off the merged PR + +concurrency: + # Serialize tag bumps so two near-simultaneous merges can't both think + # they're 0.1.6 and race to push the same tag. + group: auto-tag-runtime + cancel-in-progress: false + +jobs: + tag: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 # need full tag history for `git describe` / sort + + - name: Skip when commit asks + id: skip + run: | + MSG=$(git log -1 --format=%B "${{ github.sha }}") + if echo "$MSG" | grep -qiE '\[skip-release\]|\[no-release\]'; then + echo "skip=true" >> "$GITHUB_OUTPUT" + echo "Commit message contains [skip-release] — no tag will be created." + else + echo "skip=false" >> "$GITHUB_OUTPUT" + fi + + - name: Determine bump kind from PR label + id: bump + if: steps.skip.outputs.skip != 'true' + env: + GH_TOKEN: ${{ github.token }} + run: | + # The merged PR for this push commit. `gh pr list --search` finds + # closed PRs whose merge commit matches; we take the first. + PR=$(gh pr list --state merged --search "${{ github.sha }}" --json number,labels --jq '.[0]' 2>/dev/null || echo "") + if [ -z "$PR" ] || [ "$PR" = "null" ]; then + echo "No merged PR found for ${{ github.sha }} — defaulting to patch bump." 
+ echo "kind=patch" >> "$GITHUB_OUTPUT" + exit 0 + fi + LABELS=$(echo "$PR" | jq -r '.labels[].name') + if echo "$LABELS" | grep -qx 'release:major'; then + echo "kind=major" >> "$GITHUB_OUTPUT" + elif echo "$LABELS" | grep -qx 'release:minor'; then + echo "kind=minor" >> "$GITHUB_OUTPUT" + else + echo "kind=patch" >> "$GITHUB_OUTPUT" + fi + + - name: Compute next version from latest runtime-v* tag + id: version + if: steps.skip.outputs.skip != 'true' + run: | + # Find the highest runtime-vX.Y.Z tag. `sort -V` handles semver + # ordering; `grep` filters to the right tag prefix. + LATEST=$(git tag --list 'runtime-v*' | sort -V | tail -1) + if [ -z "$LATEST" ]; then + # No prior tag — start the runtime line at 0.1.0. + CURRENT="0.0.0" + else + CURRENT="${LATEST#runtime-v}" + fi + MAJOR=$(echo "$CURRENT" | cut -d. -f1) + MINOR=$(echo "$CURRENT" | cut -d. -f2) + PATCH=$(echo "$CURRENT" | cut -d. -f3) + case "${{ steps.bump.outputs.kind }}" in + major) MAJOR=$((MAJOR+1)); MINOR=0; PATCH=0;; + minor) MINOR=$((MINOR+1)); PATCH=0;; + patch) PATCH=$((PATCH+1));; + esac + NEW="$MAJOR.$MINOR.$PATCH" + echo "current=$CURRENT" >> "$GITHUB_OUTPUT" + echo "new=$NEW" >> "$GITHUB_OUTPUT" + echo "Bumping runtime $CURRENT → $NEW (${{ steps.bump.outputs.kind }})" + + - name: Push new tag + if: steps.skip.outputs.skip != 'true' + run: | + NEW_TAG="runtime-v${{ steps.version.outputs.new }}" + git config user.name "github-actions[bot]" + git config user.email "41898282+github-actions[bot]@users.noreply.github.com" + git tag -a "$NEW_TAG" -m "runtime $NEW_TAG (auto-bump from ${{ steps.bump.outputs.kind }})" + git push origin "$NEW_TAG" + echo "Pushed $NEW_TAG — publish-runtime workflow will fire on the tag." diff --git a/.github/workflows/publish-runtime.yml b/.github/workflows/publish-runtime.yml new file mode 100644 index 00000000..61054f8a --- /dev/null +++ b/.github/workflows/publish-runtime.yml @@ -0,0 +1,161 @@ +name: publish-runtime + +# Publishes molecule-ai-workspace-runtime to PyPI from monorepo workspace/. +# Monorepo workspace/ is the only source-of-truth for runtime code; this +# workflow is the bridge from monorepo edits to the PyPI artifact that +# the 8 workspace-template-* repos depend on. +# +# Triggered by: +# - Pushing a tag matching `runtime-vX.Y.Z` (the version is derived from +# the tag — `runtime-v0.1.6` publishes `0.1.6`). +# - Manual workflow_dispatch with an explicit `version` input (useful for +# dev/test releases without tagging the repo). +# +# The workflow: +# 1. Runs scripts/build_runtime_package.py to copy workspace/ → +# build/molecule_runtime/ with imports rewritten (`a2a_client` → +# `molecule_runtime.a2a_client`). +# 2. Builds wheel + sdist with `python -m build`. +# 3. Publishes to PyPI via twine + repo secret PYPI_TOKEN. +# +# After publish: the 8 template repos pick up the new version on their +# next image rebuild (their requirements.txt pin +# `molecule-ai-workspace-runtime>=0.1.0`, so any new release is eligible). +# To force-pull immediately, bump the pin in each template repo's +# requirements.txt and merge — that triggers their own publish-image.yml. + +on: + push: + tags: + - "runtime-v*" + workflow_dispatch: + inputs: + version: + description: "Version to publish (e.g. 0.1.6). Required for manual dispatch." 
+ required: true + type: string + +permissions: + contents: read + +jobs: + publish: + runs-on: ubuntu-latest + environment: pypi-publish + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: "3.11" + cache: pip + + - name: Derive version from tag or input + id: version + run: | + if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then + VERSION="${{ inputs.version }}" + else + # Tag is `runtime-vX.Y.Z` — strip the prefix. + VERSION="${GITHUB_REF_NAME#runtime-v}" + fi + if ! echo "$VERSION" | grep -qE '^[0-9]+\.[0-9]+\.[0-9]+(\.dev[0-9]+|rc[0-9]+|a[0-9]+|b[0-9]+|\.post[0-9]+)?$'; then + echo "::error::version $VERSION does not match PEP 440" + exit 1 + fi + echo "version=$VERSION" >> "$GITHUB_OUTPUT" + echo "Publishing molecule-ai-workspace-runtime $VERSION" + + - name: Install build tooling + run: pip install build twine + + - name: Build package from workspace/ + run: | + python scripts/build_runtime_package.py \ + --version "${{ steps.version.outputs.version }}" \ + --out "${{ runner.temp }}/runtime-build" + + - name: Build wheel + sdist + working-directory: ${{ runner.temp }}/runtime-build + run: python -m build + + - name: Verify package contents (sanity) + working-directory: ${{ runner.temp }}/runtime-build + run: | + python -m twine check dist/* + # Smoke-import the built wheel to catch import-rewrite mistakes + # before they hit PyPI. The package depends on a2a-sdk + httpx + # via pyproject; install those so the smoke import resolves. + python -m venv /tmp/smoke + /tmp/smoke/bin/pip install --quiet dist/*.whl + WORKSPACE_ID=00000000-0000-0000-0000-000000000000 \ + PLATFORM_URL=http://localhost:8080 \ + /tmp/smoke/bin/python -c " + from molecule_runtime import a2a_client, a2a_tools + from molecule_runtime.builtin_tools import memory + from molecule_runtime.adapters import get_adapter, BaseAdapter, AdapterConfig + assert a2a_client._A2A_QUEUED_PREFIX, 'queued prefix missing — chat-leak fix not in build' + print('✓ smoke import passed') + " + + - name: Publish to PyPI + working-directory: ${{ runner.temp }}/runtime-build + env: + TWINE_USERNAME: __token__ + TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }} + run: python -m twine upload dist/* + + cascade: + # After PyPI accepts the upload, fan out a repository_dispatch to each + # template repo so they rebuild their image against the new runtime. + # Each template's `runtime-published.yml` receiver picks up the event, + # pulls the new PyPI version (their requirements.txt pin is `>=`), and + # republishes ghcr.io/molecule-ai/workspace-template-:latest. + # + # Soft-fail per repo: if one template's dispatch fails (perms missing, + # repo archived, etc.) we still try the others and surface the failures + # in the workflow summary instead of aborting the whole cascade. + needs: publish + runs-on: ubuntu-latest + steps: + - name: Fan out repository_dispatch + env: + # Fine-grained PAT with `actions:write` on the 8 template repos. + # GITHUB_TOKEN can't fire dispatches across repos — needs an explicit + # token. Stored as a repo secret; rotate per the standard schedule. + DISPATCH_TOKEN: ${{ secrets.TEMPLATE_DISPATCH_TOKEN }} + RUNTIME_VERSION: ${{ needs.publish.outputs.version || steps.version.outputs.version }} + run: | + set +e # don't abort on a single repo failure — collect them all + if [ -z "$DISPATCH_TOKEN" ]; then + echo "::warning::TEMPLATE_DISPATCH_TOKEN secret not set — skipping cascade. PyPI was published; templates will pick up the new version on their own next rebuild." 
+ exit 0 + fi + # Re-derive version from the tag here too (in case publish job + # didn't expose an output the previous step's reference reads). + VERSION="${GITHUB_REF_NAME#runtime-v}" + if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then + VERSION="${{ inputs.version }}" + fi + TEMPLATES="claude-code langgraph crewai autogen deepagents hermes gemini-cli openclaw" + FAILED="" + for tpl in $TEMPLATES; do + REPO="Molecule-AI/molecule-ai-workspace-template-$tpl" + STATUS=$(curl -sS -o /tmp/dispatch.out -w "%{http_code}" \ + -X POST "https://api.github.com/repos/$REPO/dispatches" \ + -H "Authorization: Bearer $DISPATCH_TOKEN" \ + -H "Accept: application/vnd.github+json" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + -d "{\"event_type\":\"runtime-published\",\"client_payload\":{\"runtime_version\":\"$VERSION\"}}") + if [ "$STATUS" = "204" ]; then + echo "✓ dispatched $tpl ($VERSION)" + else + echo "::warning::✗ failed to dispatch $tpl: HTTP $STATUS — $(cat /tmp/dispatch.out)" + FAILED="$FAILED $tpl" + fi + done + if [ -n "$FAILED" ]; then + echo "::warning::Cascade incomplete. Failed templates:$FAILED" + # Don't fail the whole job — PyPI publish already succeeded; + # operators can retry the failed templates manually. + fi diff --git a/docs/workspace-runtime-package.md b/docs/workspace-runtime-package.md index a24ce42b..aed86368 100644 --- a/docs/workspace-runtime-package.md +++ b/docs/workspace-runtime-package.md @@ -2,29 +2,67 @@ ## Overview -The shared workspace runtime infrastructure lives in two places: +The shared workspace runtime infrastructure has **one editable source** and +**one published artifact**: -1. **Source of truth (monorepo):** `workspace/` — this is where all development happens -2. **Published package:** [`molecule-ai-workspace-runtime`](https://pypi.org/project/molecule-ai-workspace-runtime/) on PyPI +1. **Source of truth (monorepo, editable):** `workspace/` — every runtime + change lands here. Edit it like any other monorepo code. +2. **Published artifact (PyPI, generated):** [`molecule-ai-workspace-runtime`](https://pypi.org/project/molecule-ai-workspace-runtime/) + — produced by `.github/workflows/publish-runtime.yml` on every + `runtime-vX.Y.Z` tag push. Do NOT edit this independently — it gets + overwritten on every publish. + +The legacy sibling repo `molecule-ai-workspace-runtime` (the GitHub repo, as +distinct from the PyPI package) is no longer the source-of-truth and should +be treated as a publish artifact only. It can be archived or used as a +read-only mirror. + +## Why this shape + +The 8 workspace template repos (claude-code, langgraph, hermes, etc.) each +build their own Docker image and `pip install molecule-ai-workspace-runtime` +from PyPI. PyPI is the right distribution channel — semver, reproducible +builds, no submodule dance per-repo. But the runtime ALSO needs to evolve +in lock-step with the platform's wire protocol (queue shape, A2A metadata, +event payloads). Shipping cross-cutting protocol changes as separate +runtime + platform PRs in two repos creates ordering pain and broken +intermediate states. + +The monorepo + auto-publish split gives both: edit cross-cutting changes +in one PR, publish the runtime artifact via a tag. ## What's in the package -Everything in `workspace/` except adapter-specific code: +Everything in `workspace/*.py` plus the `adapters/`, `builtin_tools/`, +`plugins_registry/`, `policies/`, `skill_loader/` subpackages. Build +artifacts (`Dockerfile`, `*.sh`, `pytest.ini`, `requirements.txt`) are +excluded. 
-- `molecule_runtime/` — all shared `.py` files (main.py, config.py, heartbeat.py, etc.) -- `molecule_runtime/adapters/` — `BaseAdapter`, `AdapterConfig`, `SetupResult`, `shared_runtime` -- `molecule_runtime/builtin_tools/` — delegation, memory, approvals, sandbox, telemetry -- `molecule_runtime/skill_loader/` — skill loading + hot-reload -- `molecule_runtime/plugins_registry/` — plugin discovery and install pipeline -- `molecule_runtime/policies/` — namespace routing policies -- Console script: `molecule-runtime` → `molecule_runtime.main:main_sync` +The build script rewrites bare imports so the published package is a +proper Python namespace: + +``` +# In monorepo workspace/: +from a2a_client import discover_peer +from builtin_tools.memory import store + +# In published molecule_runtime/ (auto-rewritten at publish time): +from molecule_runtime.a2a_client import discover_peer +from molecule_runtime.builtin_tools.memory import store +``` + +The closed allowlist of rewritten module names lives in +`scripts/build_runtime_package.py` (`TOP_LEVEL_MODULES` + `SUBPACKAGES`). +Add a new top-level module to workspace/? Add it to the allowlist in the +same PR. ## Adapter repos -Each of the 8 adapter repos now contains: +Each of the 8 adapter template repos contains: - `adapter.py` — runtime-specific `Adapter` class -- `requirements.txt` — `molecule-ai-workspace-runtime>=0.1.0` + adapter deps -- `Dockerfile` — standalone image (no longer extends workspace-template:base) +- `requirements.txt` — `molecule-ai-workspace-runtime>=0.1.X` + adapter deps +- `Dockerfile` — standalone image with `ENV ADAPTER_MODULE=adapter` and + `ENTRYPOINT ["molecule-runtime"]` | Adapter | Repo | |---------|------| @@ -39,8 +77,8 @@ Each of the 8 adapter repos now contains: ## Adapter discovery (ADAPTER_MODULE) -Standalone adapter repos set `ENV ADAPTER_MODULE=adapter` in their Dockerfile. -The runtime's `get_adapter()` checks this env var first: +Standalone adapter repos set `ENV ADAPTER_MODULE=adapter` in their +Dockerfile. The runtime's `get_adapter()` checks this env var first: ```python # In molecule_runtime/adapters/__init__.py @@ -49,25 +87,104 @@ def get_adapter(runtime: str) -> type[BaseAdapter]: if adapter_module: mod = importlib.import_module(adapter_module) return getattr(mod, "Adapter") - # Fall back to built-in subdirectory scan (monorepo local dev) - ... + raise KeyError(...) ``` ## Publishing a new version ```bash -cd workspace-template -# 1. Bump version in pyproject.toml -# 2. Sync to molecule-ai-workspace-runtime repo -# 3. Tag and push — CI publishes to PyPI via PYPI_TOKEN secret +# From any local checkout of monorepo, after merging your runtime change: +git tag runtime-v0.1.6 +git push origin runtime-v0.1.6 ``` -Or manually: -```bash -cd workspace-template -python -m build -python -m twine upload dist/* +The `publish-runtime` workflow takes over — checks out the tag, runs +`scripts/build_runtime_package.py --version 0.1.6`, builds wheel + sdist, +runs a smoke import to catch broken rewrites, and uploads to PyPI via +the `PYPI_TOKEN` repo secret. + +For dev/test releases without tagging, dispatch the workflow manually +with an explicit version (e.g. `0.1.6.dev1` — PEP 440 dev/rc/post forms +are accepted). + +After publish, the 8 template repos pick up the new version on their +next `:latest` rebuild. To force-pull immediately, bump the pin in each +template's `requirements.txt`. + +## End-to-end CD chain + +The full chain from monorepo merge → workspace containers running new code: + ``` +1. 
Merge PR with workspace/ changes to main + ↓ +2. .github/workflows/auto-tag-runtime.yml fires + ↓ reads PR labels (release:major/minor) or defaults to patch + ↓ pushes runtime-vX.Y.Z tag + ↓ +3. .github/workflows/publish-runtime.yml fires (on the tag) + ↓ builds wheel via scripts/build_runtime_package.py + ↓ smoke-imports the wheel + ↓ uploads to PyPI + ↓ cascade job fires repository_dispatch (event-type: runtime-published) + ↓ to all 8 workspace-template-* repos + ↓ +4. Each template's publish-image.yml fires (on repository_dispatch) + ↓ rebuilds Dockerfile (which pip-installs the new PyPI version) + ↓ pushes ghcr.io/molecule-ai/workspace-template-:latest + ↓ +5. Production hosts run scripts/refresh-workspace-images.sh + OR an operator hits POST /admin/workspace-images/refresh on the platform + ↓ docker pull all 8 :latest tags + ↓ remove + force-recreate any running ws-* containers using a refreshed image + ↓ canvas re-provisions the workspaces on next interaction +``` + +Steps 1-4 are fully automated. Step 5 is one-click: a single curl or shell +command. SaaS deployments typically wire step 5 into their normal deploy +pipeline (every release pulls fresh images on every host); local dev fires +it manually after a runtime release lands. + +### Required secrets + +| Secret | Where | Why | +|---|---|---| +| `PYPI_TOKEN` | molecule-core repo | Twine upload auth (PyPI) | +| `TEMPLATE_DISPATCH_TOKEN` | molecule-core repo | Fine-grained PAT with `actions:write` on the 8 template repos. Without it the `cascade` job warns and exits clean — PyPI still publishes; templates just don't auto-rebuild. | + +### Step 5 specifics + +**Local dev (compose stack):** +```bash +bash scripts/refresh-workspace-images.sh # all runtimes +bash scripts/refresh-workspace-images.sh --runtime claude-code +bash scripts/refresh-workspace-images.sh --no-recreate # pull only, leave containers +``` + +**Via platform admin endpoint (any deploy):** +```bash +curl -X POST "$PLATFORM/admin/workspace-images/refresh" +curl -X POST "$PLATFORM/admin/workspace-images/refresh?runtime=claude-code" +curl -X POST "$PLATFORM/admin/workspace-images/refresh?recreate=false" +``` + +The endpoint pulls + recreates from inside the platform container, so it +needs Docker socket access (the compose stack mounts +`/var/run/docker.sock` already) AND GHCR auth on the host's docker config +(`docker login ghcr.io` once per host). On a fresh host without GHCR auth, +the pull step warns per runtime and the response surfaces the failures. + +## Local dev (build the package without publishing) + +```bash +python3 scripts/build_runtime_package.py --version 0.1.0-local --out /tmp/runtime-build +cd /tmp/runtime-build +python -m build # produces dist/*.whl + dist/*.tar.gz +pip install dist/*.whl # install into a venv to test locally +``` + +This is the same pipeline CI runs. Use it to validate import-rewrite +correctness before pushing a `runtime-v*` tag. ## Writing a new adapter @@ -75,5 +192,18 @@ python -m twine upload dist/* 2. Copy `adapter.py` pattern from any existing adapter repo 3. Change imports: `from molecule_runtime.adapters.base import BaseAdapter, AdapterConfig` 4. Create `requirements.txt` with `molecule-ai-workspace-runtime>=0.1.0` + your deps -5. Create `Dockerfile` with `ENV ADAPTER_MODULE=adapter` and `ENTRYPOINT ["molecule-runtime"]` +5. Create `Dockerfile` with `ENV ADAPTER_MODULE=adapter` and + `ENTRYPOINT ["molecule-runtime"]` 6. 
Register the runtime name in the platform's known runtimes list
+
+## Migration note
+
+Prior to this workflow, the runtime was duplicated across monorepo
+`workspace/` AND a sibling repo `molecule-ai-workspace-runtime`, with no
+sync mechanism. That caused 30+ files to drift between the two trees and
+tonight's chat-leak / queued-classification fixes existed only in the
+monorepo copy until manually ported.
+
+If you have an old local checkout of `molecule-ai-workspace-runtime`, treat
+it as outdated. The monorepo `workspace/` is now authoritative; the PyPI
+artifact is rebuilt from it on every `runtime-v*` tag.
diff --git a/scripts/build_runtime_package.py b/scripts/build_runtime_package.py
new file mode 100755
index 00000000..91e121b2
--- /dev/null
+++ b/scripts/build_runtime_package.py
@@ -0,0 +1,298 @@
+#!/usr/bin/env python3
+"""Build the molecule-ai-workspace-runtime PyPI package from monorepo workspace/.
+
+Monorepo workspace/ is the single source-of-truth for runtime code. The PyPI
+package is a publish-time mirror produced by this script, NOT a parallel
+editable copy. Anyone editing the runtime should edit workspace/, never the
+sibling molecule-ai-workspace-runtime repo.
+
+What this does
+--------------
+1. Copies workspace/ source into build/molecule_runtime/ (note the rename:
+   bare modules become a real Python package).
+2. Rewrites top-level imports so e.g. `from a2a_client import X` becomes
+   `from molecule_runtime.a2a_client import X`. The rewrite is regex-based
+   on a closed allowlist of modules — third-party imports like `from a2a.X`
+   (the a2a-sdk package) are left alone because the regex is anchored on
+   exact module names.
+3. Writes a pyproject.toml with the requested version + the README + the
+   py.typed marker.
+4. Leaves the build dir ready for `python -m build` to produce a wheel/sdist.
+
+Usage
+-----
+    scripts/build_runtime_package.py --version 0.1.6 --out /tmp/runtime-build
+    cd /tmp/runtime-build && python -m build
+    python -m twine upload dist/*
+
+The publish workflow (.github/workflows/publish-runtime.yml) drives this
+on every `runtime-v*` tag push.
+"""
+
+from __future__ import annotations
+
+import argparse
+import re
+import shutil
+import sys
+from pathlib import Path
+
+# Top-level Python modules in workspace/ that become molecule_runtime.X.
+# Anything imported as `from <module> import ...` or `import <module>` (where
+# <module> matches one of these) gets rewritten to use the package prefix.
+#
+# Closed list (not "every .py we copy") because a typo in workspace/ would
+# otherwise leak into a wrong rewrite. Update this when adding a new
+# top-level module to workspace/.
+TOP_LEVEL_MODULES = {
+    "a2a_cli",
+    "a2a_client",
+    "a2a_executor",
+    "a2a_mcp_server",
+    "a2a_tools",
+    "adapter_base",
+    "agent",
+    "agents_md",
+    "claude_sdk_executor",
+    "cli_executor",
+    "config",
+    "consolidation",
+    "coordinator",
+    "events",
+    "executor_helpers",
+    "heartbeat",
+    "hermes_executor",
+    "initial_prompt",
+    "main",
+    "molecule_ai_status",
+    "platform_auth",
+    "plugins",
+    "preflight",
+    "prompt",
+    "shared_runtime",
+}
+
+# Subdirectory packages — these are already real packages (they have or will
+# have __init__.py) so the rewrite is `from <pkg>` → `from molecule_runtime.<pkg>`.
+SUBPACKAGES = {
+    "adapters",
+    "builtin_tools",
+    "plugins_registry",
+    "policies",
+    "skill_loader",
+}
+
+# Files in workspace/ NOT included in the published package. These are
+# build artifacts, dev scripts, or monorepo-only scaffolding.
+EXCLUDE_FILES = {
+    "Dockerfile",
+    "build-all.sh",
+    "rebuild-runtime-images.sh",
+    "entrypoint.sh",
+    "pytest.ini",
+    "requirements.txt",
+    # Note: adapter_base.py, agents_md.py, hermes_executor.py, shared_runtime.py
+    # are kept (referenced by adapters/__init__.py and other modules); they get
+    # their imports rewritten via TOP_LEVEL_MODULES. Excluding them broke the
+    # smoke-test install with `ModuleNotFoundError: adapter_base`.
+}
+
+EXCLUDE_DIRS = {
+    "__pycache__",
+    "tests",
+    "lib",
+    "molecule_audit",
+    "scripts",
+}
+
+
+def build_import_rewriter() -> re.Pattern:
+    """Compile a single regex matching all import statements that need
+    rewriting. The match groups capture the keyword + module name so the
+    replacement preserves whitespace and trailing punctuation.
+
+    Modules included: TOP_LEVEL_MODULES ∪ SUBPACKAGES.
+
+    The negative-lookahead on `\\.` in the suffix prevents matching
+    `from a2a.server.X import Y` against bare `a2a` (which isn't in our
+    set, but the principle matters for any future short module name that
+    happens to be a prefix of a real package name).
+    """
+    names = sorted(TOP_LEVEL_MODULES | SUBPACKAGES)
+    alt = "|".join(re.escape(n) for n in names)
+    # Matches:
+    #     from <name>    (\.|\s|import)
+    #     import <name>  (\s|$|,)
+    # And captures the keyword + name so we can re-emit with prefix.
+    pattern = (
+        r"(?m)^(?P<indent>\s*)"       # leading whitespace (preserved)
+        r"(?P<kw>from|import)\s+"     # 'from' or 'import'
+        r"(?P<mod>" + alt + r")"      # the module name
+        r"(?P<rest>[\s.,]|$)"         # what follows: '.subpath', ' import …', ',', whitespace, EOL
+    )
+    return re.compile(pattern)
+
+
+def rewrite_imports(text: str, regex: re.Pattern) -> str:
+    """Replace bare imports with package-prefixed ones.
+
+    `import X`            → `import molecule_runtime.X as X`  (preserve binding)
+    `from X import Y`     → `from molecule_runtime.X import Y`
+    `from X.sub import Y` → `from molecule_runtime.X.sub import Y`
+    """
+    def repl(m: re.Match) -> str:
+        indent, kw, mod, rest = m.group("indent"), m.group("kw"), m.group("mod"), m.group("rest")
+        if kw == "from":
+            # `from X` or `from X.sub` — always safe to prefix.
+            return f"{indent}from molecule_runtime.{mod}{rest}"
+        # `import X` — preserve the binding name `X` (callers do `X.foo`)
+        # by aliasing. `import X.sub` is uncommon for our modules and would
+        # need a different binding form, but isn't used in workspace/ today.
+        if rest.startswith("."):
+            # `import X.sub` — rewrite as `import molecule_runtime.X.sub` and
+            # leave the trailing dot pattern intact for the rest of the line.
+            return f"{indent}import molecule_runtime.{mod}{rest}"
+        # Plain `import X` — alias preserves the local name.
+        return f"{indent}import molecule_runtime.{mod} as {mod}{rest}"
+    return regex.sub(repl, text)
+
+
+def copy_tree_filtered(src: Path, dst: Path) -> list[Path]:
+    """Copy src/ → dst/ skipping EXCLUDE_FILES + EXCLUDE_DIRS.
Returns the + list of .py files copied so the caller can run the import rewrite over + them in one pass.""" + py_files: list[Path] = [] + if dst.exists(): + shutil.rmtree(dst) + dst.mkdir(parents=True) + for entry in src.iterdir(): + if entry.is_dir(): + if entry.name in EXCLUDE_DIRS: + continue + sub_py = copy_tree_filtered(entry, dst / entry.name) + py_files.extend(sub_py) + else: + if entry.name in EXCLUDE_FILES: + continue + shutil.copy2(entry, dst / entry.name) + if entry.suffix == ".py": + py_files.append(dst / entry.name) + return py_files + + +PYPROJECT_TEMPLATE = """\ +[build-system] +requires = ["setuptools>=68.0", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "molecule-ai-workspace-runtime" +version = "{version}" +description = "Molecule AI workspace runtime — shared infrastructure for all agent adapters" +requires-python = ">=3.11" +license = {{text = "BSL-1.1"}} +readme = "README.md" +dependencies = [ + "a2a-sdk[http-server]>=1.0.0,<2.0", + "httpx>=0.27.0", + "uvicorn>=0.30.0", + "starlette>=0.38.0", + "websockets>=12.0", + "pyyaml>=6.0", + "langchain-core>=0.3.0", + "opentelemetry-api>=1.24.0", + "opentelemetry-sdk>=1.24.0", + "opentelemetry-exporter-otlp-proto-http>=1.24.0", + "temporalio>=1.7.0", +] + +[project.scripts] +molecule-runtime = "molecule_runtime.main:main_sync" + +[tool.setuptools.packages.find] +where = ["."] +include = ["molecule_runtime*"] + +[tool.setuptools.package-data] +"molecule_runtime" = ["py.typed"] +""" + + +README_TEMPLATE = """\ +# molecule-ai-workspace-runtime + +Shared workspace runtime for [Molecule AI](https://github.com/Molecule-AI/molecule-core) +agent adapters. Installed by every workspace template image +(`workspace-template-claude-code`, `-langgraph`, `-hermes`, etc.) to provide +A2A delegation, heartbeat, memory, plugin loading, and skill management. + +This package is **published from the molecule-core monorepo `workspace/` +directory** by the `publish-runtime` GitHub Actions workflow on every +`runtime-v*` tag push. **Do not edit this package directly** — edit +`workspace/` in the monorepo. + +See [`docs/workspace-runtime-package.md`](https://github.com/Molecule-AI/molecule-core/blob/main/docs/workspace-runtime-package.md) +for the publish flow and architecture. +""" + + +def main() -> int: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--version", required=True, help="Package version, e.g. 0.1.6") + parser.add_argument("--out", required=True, type=Path, help="Build output directory (will be wiped)") + parser.add_argument("--source", type=Path, default=Path(__file__).resolve().parent.parent / "workspace", + help="Path to monorepo workspace/ directory (default: ../workspace from this script)") + args = parser.parse_args() + + src = args.source.resolve() + out = args.out.resolve() + if not src.is_dir(): + print(f"error: source not a directory: {src}", file=sys.stderr) + return 2 + + pkg_dir = out / "molecule_runtime" + print(f"[build] source: {src}") + print(f"[build] output: {out}") + print(f"[build] package: {pkg_dir}") + + if out.exists(): + shutil.rmtree(out) + out.mkdir(parents=True) + + py_files = copy_tree_filtered(src, pkg_dir) + print(f"[build] copied {len(py_files)} .py files") + + # Ensure top-level package marker exists. workspace/ doesn't have one + # (it's not a package in monorepo), but the published artifact must. 
+ init = pkg_dir / "__init__.py" + if not init.exists(): + init.write_text('"""Molecule AI workspace runtime."""\n') + + # Touch py.typed so type-checkers in adapter consumers see the package + # as typed. Empty file is the convention. + (pkg_dir / "py.typed").touch() + + # Rewrite imports in every .py file we copied + the new __init__.py. + regex = build_import_rewriter() + rewrites = 0 + for f in [*py_files, init]: + original = f.read_text() + rewritten = rewrite_imports(original, regex) + if rewritten != original: + f.write_text(rewritten) + rewrites += 1 + print(f"[build] rewrote imports in {rewrites} files") + + # Emit pyproject.toml + README at build root. + (out / "pyproject.toml").write_text(PYPROJECT_TEMPLATE.format(version=args.version)) + (out / "README.md").write_text(README_TEMPLATE) + + print(f"[build] done. To publish:") + print(f" cd {out}") + print(f" python -m build") + print(f" python -m twine upload dist/*") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scripts/refresh-workspace-images.sh b/scripts/refresh-workspace-images.sh new file mode 100755 index 00000000..ec9ea0ba --- /dev/null +++ b/scripts/refresh-workspace-images.sh @@ -0,0 +1,95 @@ +#!/usr/bin/env bash +# refresh-workspace-images.sh — pull the latest workspace template images +# from GHCR and recreate any running ws-* containers against the new digest. +# +# This is the local-dev / single-host equivalent of step 5 of the runtime CD +# chain (see docs/workspace-runtime-package.md). On a SaaS deployment the +# host's deploy pipeline does the pull on every release; this script is +# what to run on a local docker-compose host after a runtime release lands. +# +# Usage: +# bash scripts/refresh-workspace-images.sh # pull all 8 + recreate running ws-* +# bash scripts/refresh-workspace-images.sh --runtime claude-code # pull just one template +# bash scripts/refresh-workspace-images.sh --no-recreate # pull only, leave containers +# +# Behavior: +# - Always pulls fresh; docker is a no-op if local matches remote, so +# repeated runs are cheap. +# - Recreate is "kill + remove + let the next canvas interaction re- +# provision" — simpler than `docker stop / docker run` because the +# platform owns the run flags. Workspaces re-register on next probe. +# - If a container is mid-conversation, the kill cancels in-flight work. +# Run during a quiet window OR add --no-recreate and recreate manually +# via canvas Restart buttons. + +set -euo pipefail + +GREEN='\033[0;32m' +YELLOW='\033[0;33m' +RED='\033[0;31m' +NC='\033[0m' +log() { echo -e "${GREEN}[refresh]${NC} $1" >&2; } +warn() { echo -e "${YELLOW}[refresh]${NC} $1" >&2; } +err() { echo -e "${RED}[refresh]${NC} $1" >&2; } + +ALL_RUNTIMES=(claude-code langgraph crewai autogen deepagents hermes gemini-cli openclaw) +RUNTIMES=("${ALL_RUNTIMES[@]}") +RECREATE=true + +while [ $# -gt 0 ]; do + case "$1" in + --runtime) RUNTIMES=("$2"); shift 2;; + --no-recreate) RECREATE=false; shift;; + -h|--help) sed -n '2,30p' "$0"; exit 0;; + *) err "unknown arg: $1"; exit 2;; + esac +done + +# 1. Pull fresh tags. Soft-fail per runtime — one missing image (e.g., a +# template that hasn't been published yet) shouldn't abort the others. 
+log "pulling latest images for: ${RUNTIMES[*]}" +PULLED=() +FAILED=() +for rt in "${RUNTIMES[@]}"; do + IMG="ghcr.io/molecule-ai/workspace-template-$rt:latest" + if docker pull "$IMG" >/dev/null 2>&1; then + log " ✓ $rt" + PULLED+=("$rt") + else + warn " ✗ $rt (pull failed — image may not exist or auth missing)" + FAILED+=("$rt") + fi +done + +if [ "$RECREATE" = "false" ]; then + log "skip-recreate set — leaving containers untouched" + log "done. pulled=${#PULLED[@]} failed=${#FAILED[@]}" + exit 0 +fi + +# 2. Find ws-* containers whose image is one of the runtimes we pulled. +# `docker inspect` exposes the image tag/digest each was created from. +log "scanning ws-* containers for stale images..." +TO_RECREATE=() +for cn in $(docker ps -a --filter "name=ws-" --format "{{.Names}}"); do + IMG=$(docker inspect "$cn" --format '{{.Config.Image}}' 2>/dev/null || echo "") + for rt in "${PULLED[@]}"; do + if [[ "$IMG" == *"workspace-template-$rt"* ]]; then + TO_RECREATE+=("$cn") + break + fi + done +done + +if [ "${#TO_RECREATE[@]}" -eq 0 ]; then + log "no running ws-* containers using a refreshed image — nothing to recreate" + exit 0 +fi + +# 3. Kill + remove. Canvas next-interaction will re-provision. +log "recreating ${#TO_RECREATE[@]} containers (canvas will re-provision on next interaction)" +for cn in "${TO_RECREATE[@]}"; do + docker rm -f "$cn" >/dev/null 2>&1 && log " removed $cn" || warn " failed to remove $cn" +done + +log "done. open the canvas and the workspaces will re-provision against the new image." diff --git a/workspace-server/internal/handlers/admin_workspace_images.go b/workspace-server/internal/handlers/admin_workspace_images.go new file mode 100644 index 00000000..147bf8ad --- /dev/null +++ b/workspace-server/internal/handlers/admin_workspace_images.go @@ -0,0 +1,227 @@ +package handlers + +import ( + "context" + "encoding/base64" + "encoding/json" + "fmt" + "io" + "log" + "net/http" + "os" + "strings" + "time" + + "github.com/docker/docker/api/types/container" + "github.com/docker/docker/api/types/filters" + dockerimage "github.com/docker/docker/api/types/image" + dockerclient "github.com/docker/docker/client" + "github.com/gin-gonic/gin" + + "github.com/Molecule-AI/molecule-monorepo/platform/internal/provisioner" +) + +// AdminWorkspaceImagesHandler serves POST /admin/workspace-images/refresh — the +// production-side end of the runtime CD chain. Operators (or post-publish +// automation) hit this to (1) pull the latest workspace template images from +// GHCR via the Docker SDK and (2) recreate any running ws-* containers so +// they adopt the new image. Without this, a freshly-published runtime sits +// in the registry but containers keep running the old image until the next +// manual restart. +// +// On a SaaS deployment the deploy pipeline already pulls on every release, +// so the pull step is a no-op there; the recreate step is still the way to +// make running workspaces adopt the new image without a full host restart. +// +// POST /admin/workspace-images/refresh +// +// ?runtime=claude-code (optional; default = all 8 templates) +// &recreate=true|false (default true; false = pull only) +// +// Returns JSON {pulled: [...], failed: [...], recreated: [...]} +type AdminWorkspaceImagesHandler struct { + docker *dockerclient.Client +} + +func NewAdminWorkspaceImagesHandler(docker *dockerclient.Client) *AdminWorkspaceImagesHandler { + return &AdminWorkspaceImagesHandler{docker: docker} +} + +// allRuntimes is the canonical list mirroring docs/workspace-runtime-package.md. 
+// Update both when a new template is added. +var allRuntimes = []string{ + "claude-code", "langgraph", "crewai", "autogen", + "deepagents", "hermes", "gemini-cli", "openclaw", +} + +type refreshResult struct { + Pulled []string `json:"pulled"` + Failed []string `json:"failed"` + Recreated []string `json:"recreated"` +} + +// ghcrAuthHeader returns the base64-encoded JSON auth payload Docker's +// ImagePull expects in PullOptions.RegistryAuth, or empty string when no +// GHCR_USER/GHCR_TOKEN env is set (lets public images pull through). +// +// The Docker SDK doesn't read ~/.docker/config.json — every authenticated +// pull needs an explicit RegistryAuth string. Format per the Docker +// engine API: {"username":"…","password":"…","serveraddress":"ghcr.io"} +// → base64-encoded JSON with no trailing padding stripped (engine handles +// either form). +func ghcrAuthHeader() string { + user := strings.TrimSpace(os.Getenv("GHCR_USER")) + token := strings.TrimSpace(os.Getenv("GHCR_TOKEN")) + if user == "" || token == "" { + return "" + } + payload := map[string]string{ + "username": user, + "password": token, + "serveraddress": "ghcr.io", + } + js, err := json.Marshal(payload) + if err != nil { + // Should be unreachable for a static map[string]string. Log so a + // future contributor adding a non-marshallable field notices. + log.Printf("workspace-images: failed to marshal GHCR auth: %v", err) + return "" + } + return base64.URLEncoding.EncodeToString(js) +} + +func (h *AdminWorkspaceImagesHandler) Refresh(c *gin.Context) { + runtimes := allRuntimes + if r := c.Query("runtime"); r != "" { + // Accept a single runtime; reject anything not in the canonical list + // so a typo doesn't silently no-op. + found := false + for _, known := range allRuntimes { + if known == r { + found = true + break + } + } + if !found { + c.JSON(http.StatusBadRequest, gin.H{ + "error": fmt.Sprintf("unknown runtime: %s", r), + "known_runtimes": allRuntimes, + }) + return + } + runtimes = []string{r} + } + recreate := c.DefaultQuery("recreate", "true") == "true" + + res := refreshResult{Pulled: []string{}, Failed: []string{}, Recreated: []string{}} + auth := ghcrAuthHeader() + + // 1. Pull each template image via the Docker SDK. Soft-fail per-runtime + // so one missing image (e.g. unpublished template) doesn't abort + // the others. Each pull's progress stream is drained to completion + // — the engine treats early-close as "abandon", leaving partial + // layers around with no reference. + pullCtx, cancel := context.WithTimeout(c.Request.Context(), 5*time.Minute) + defer cancel() + for _, rt := range runtimes { + image := fmt.Sprintf("ghcr.io/molecule-ai/workspace-template-%s:latest", rt) + opts := dockerimage.PullOptions{Platform: provisioner.DefaultImagePlatform()} + if auth != "" { + opts.RegistryAuth = auth + } + rc, err := h.docker.ImagePull(pullCtx, image, opts) + if err != nil { + log.Printf("workspace-images/refresh: pull %s failed: %v", rt, err) + res.Failed = append(res.Failed, rt) + continue + } + // Drain to completion. We discard progress payload because no + // caller renders it; the platform log already records pulled/failed + // per runtime. If a future caller wants live progress, decode the + // JSON-line stream into events here. 
+ if _, err := io.Copy(io.Discard, rc); err != nil { + rc.Close() + log.Printf("workspace-images/refresh: drain %s failed: %v", rt, err) + res.Failed = append(res.Failed, rt) + continue + } + rc.Close() + res.Pulled = append(res.Pulled, rt) + } + + if !recreate { + c.JSON(http.StatusOK, res) + return + } + + // 2. Find ws-* containers running an image we just pulled. Recreate + // them — kill+remove and let the platform's normal provisioning + // flow re-create on next canvas interaction. + listCtx, listCancel := context.WithTimeout(c.Request.Context(), 30*time.Second) + defer listCancel() + containers, err := h.docker.ContainerList(listCtx, container.ListOptions{ + All: true, + Filters: filters.NewArgs(filters.Arg("name", "ws-")), + }) + if err != nil { + log.Printf("workspace-images/refresh: container list failed: %v", err) + c.JSON(http.StatusInternalServerError, gin.H{"error": "container list failed", "partial_result": res}) + return + } + + pulledSet := map[string]struct{}{} + for _, rt := range res.Pulled { + pulledSet[rt] = struct{}{} + } + for _, ctr := range containers { + // ContainerList's ctr.Image is the *resolved digest* (sha256:…), + // not the human-readable tag. Use ContainerInspect to get the + // original Config.Image (e.g. "ghcr.io/molecule-ai/workspace- + // template-claude-code:latest") so we can match against the + // pulled-runtime set. The cost is one extra round-trip per + // ws-* container — there are at most 8 typically, so this is + // well below any UX threshold. + inspectCtx, inspectCancel := context.WithTimeout(c.Request.Context(), 10*time.Second) + full, err := h.docker.ContainerInspect(inspectCtx, ctr.ID) + inspectCancel() + if err != nil { + log.Printf("workspace-images/refresh: inspect %s failed: %v", ctr.ID[:12], err) + continue + } + imageRef := "" + if full.Config != nil { + imageRef = full.Config.Image + } + matched := "" + for rt := range pulledSet { + if strings.Contains(imageRef, "workspace-template-"+rt) { + matched = rt + break + } + } + if matched == "" { + continue + } + name := strings.TrimPrefix(ctr.Names[0], "/") + // Remove with force — the workspace will re-provision on the next + // canvas interaction. This drops in-flight conversations on the + // removed container; document via the response so callers can + // schedule the refresh during a quiet window. 
+ rmCtx, rmCancel := context.WithTimeout(c.Request.Context(), 30*time.Second) + err = h.docker.ContainerRemove(rmCtx, ctr.ID, container.RemoveOptions{Force: true}) + rmCancel() + if err != nil { + log.Printf("workspace-images/refresh: remove %s failed: %v", name, err) + continue + } + res.Recreated = append(res.Recreated, name) + } + + authStatus := "no GHCR auth (public images only)" + if auth != "" { + authStatus = "GHCR_USER/GHCR_TOKEN auth" + } + log.Printf("workspace-images/refresh: pulled=%d failed=%d recreated=%d (%s)", + len(res.Pulled), len(res.Failed), len(res.Recreated), authStatus) + c.JSON(http.StatusOK, res) +} diff --git a/workspace-server/internal/handlers/admin_workspace_images_test.go b/workspace-server/internal/handlers/admin_workspace_images_test.go new file mode 100644 index 00000000..26e61f95 --- /dev/null +++ b/workspace-server/internal/handlers/admin_workspace_images_test.go @@ -0,0 +1,73 @@ +package handlers + +import ( + "encoding/base64" + "encoding/json" + "testing" +) + +func TestGHCRAuthHeader_NoEnvReturnsEmpty(t *testing.T) { + t.Setenv("GHCR_USER", "") + t.Setenv("GHCR_TOKEN", "") + if got := ghcrAuthHeader(); got != "" { + t.Errorf("expected empty (no auth → public-only), got %q", got) + } +} + +func TestGHCRAuthHeader_PartialEnvReturnsEmpty(t *testing.T) { + // Both must be set — defensive against half-configured env. + t.Setenv("GHCR_USER", "alice") + t.Setenv("GHCR_TOKEN", "") + if got := ghcrAuthHeader(); got != "" { + t.Errorf("user-only env should disable auth, got %q", got) + } + t.Setenv("GHCR_USER", "") + t.Setenv("GHCR_TOKEN", "fake-tok-xxx") + if got := ghcrAuthHeader(); got != "" { + t.Errorf("token-only env should disable auth, got %q", got) + } +} + +func TestGHCRAuthHeader_EncodesDockerEnginePayload(t *testing.T) { + t.Setenv("GHCR_USER", "alice") + t.Setenv("GHCR_TOKEN", "fake-tok-value") + got := ghcrAuthHeader() + if got == "" { + t.Fatal("expected non-empty auth header") + } + raw, err := base64.URLEncoding.DecodeString(got) + if err != nil { + t.Fatalf("auth header is not valid base64-url: %v", err) + } + var payload map[string]string + if err := json.Unmarshal(raw, &payload); err != nil { + t.Fatalf("decoded auth is not valid JSON: %v (raw=%s)", err, raw) + } + if payload["username"] != "alice" { + t.Errorf("username: got %q, want alice", payload["username"]) + } + if payload["password"] != "fake-tok-value" { + t.Errorf("password: got %q, want fake-tok-value", payload["password"]) + } + if payload["serveraddress"] != "ghcr.io" { + t.Errorf("serveraddress: got %q, want ghcr.io", payload["serveraddress"]) + } +} + +func TestGHCRAuthHeader_TrimsWhitespace(t *testing.T) { + // .env lines often have trailing newlines or accidental spaces. Without + // trimming, a stray space would produce an auth payload the engine + // rejects with a confusing 401. 
+ t.Setenv("GHCR_USER", " alice ") + t.Setenv("GHCR_TOKEN", "\tfake-tok-value\n") + got := ghcrAuthHeader() + raw, _ := base64.URLEncoding.DecodeString(got) + var payload map[string]string + _ = json.Unmarshal(raw, &payload) + if payload["username"] != "alice" { + t.Errorf("username not trimmed: got %q", payload["username"]) + } + if payload["password"] != "fake-tok-value" { + t.Errorf("password not trimmed: got %q", payload["password"]) + } +} diff --git a/workspace-server/internal/provisioner/provisioner.go b/workspace-server/internal/provisioner/provisioner.go index a57a4cb6..a3cd37d2 100644 --- a/workspace-server/internal/provisioner/provisioner.go +++ b/workspace-server/internal/provisioner/provisioner.go @@ -1082,6 +1082,13 @@ func pullImageAndDrain(ctx context.Context, cli dockerImageClient, ref, platform // // Tracked in issue #1875; remove this fallback once the template repos // publish multi-arch manifests. +// DefaultImagePlatform is the exported alias used by the admin +// workspace-images handler so its ImagePull picks the same platform as +// the provisioner's. Avoids duplicating the Apple-Silicon-needs-amd64 +// logic and keeps both call sites in sync if Docker manifest support +// changes (e.g., when the templates start shipping multi-arch). +func DefaultImagePlatform() string { return defaultImagePlatform() } + func defaultImagePlatform() string { if v, ok := os.LookupEnv("MOLECULE_IMAGE_PLATFORM"); ok { return v diff --git a/workspace-server/internal/router/router.go b/workspace-server/internal/router/router.go index 6242067a..b337ef32 100644 --- a/workspace-server/internal/router/router.go +++ b/workspace-server/internal/router/router.go @@ -402,6 +402,17 @@ func Setup(hub *ws.Hub, broadcaster *events.Broadcaster, prov *provisioner.Provi r.POST("/admin/a2a-queue/drop-stale", middleware.AdminAuth(db.DB), qH.DropStale) } + // Admin — workspace template image refresh. Pulls latest images from GHCR + // and recreates running ws-* containers so they adopt the new image. + // Final step of the runtime CD chain — see docs/workspace-runtime-package.md. + // Operators (or post-publish automation) hit this after a runtime release. + // Reuses the provisioner's Docker client; no-op when prov is nil + // (test / non-Docker deploy). + if prov != nil { + imgH := handlers.NewAdminWorkspaceImagesHandler(prov.DockerClient()) + r.POST("/admin/workspace-images/refresh", middleware.AdminAuth(db.DB), imgH.Refresh) + } + // Admin — test token minting (issue #6). Hidden in production via TestTokensEnabled(). // NOT behind AdminAuth — this is the bootstrap endpoint E2E tests and // fresh installs use to obtain their first admin bearer. Adding AdminAuth From 762d3b8b2cf97f60f8c70f01f685a36527cbc2c5 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Sun, 26 Apr 2026 10:32:33 -0700 Subject: [PATCH 34/34] test(ssrf): pin dev-mode RFC-1918 allow contract (follow-up to #2103) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR #2103 widened the SSRF saasMode branch to also relax RFC-1918 + ULA under MOLECULE_ENV=development (so the docker-compose dev pattern stops rejecting workspace registrations on 172.18.x.x bridge IPs). The existing TestIsSafeURL_DevMode_StillBlocksOtherRanges covered the security floor (metadata / TEST-NET / CGNAT stay blocked), but no test asserted the positive side — that 10.x / 172.x / 192.168.x / fd00:: ARE now allowed under dev mode. 
Without this test, a future refactor that quietly drops the `|| devModeAllowsLoopback()` from isPrivateOrMetadataIP wouldn't trip any assertion, and the docker-compose dev loop would silently re-break. Adds TestIsSafeURL_DevMode_AllowsRFC1918 — table of 4 URLs covering the three RFC-1918 IPv4 ranges + IPv6 ULA fd00::/8. Sets MOLECULE_DEPLOY_MODE=self-hosted explicitly so the test exercises the devMode branch, not a SaaS-mode pass. Closes the Optional finding I left on PR #2103. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../internal/handlers/ssrf_test.go | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/workspace-server/internal/handlers/ssrf_test.go b/workspace-server/internal/handlers/ssrf_test.go index 37b2b358..8e246ebc 100644 --- a/workspace-server/internal/handlers/ssrf_test.go +++ b/workspace-server/internal/handlers/ssrf_test.go @@ -387,6 +387,37 @@ func TestIsSafeURL_SaaSMode_StillBlocksMetadataEtAl(t *testing.T) { } } +// TestIsSafeURL_DevMode_AllowsRFC1918 pins the dev-mode RFC-1918 + ULA +// relaxation that #2103 widened. The dev-host docker-compose pattern +// puts the platform + workspaces on the same docker bridge (172.18.0.0/16), +// so workspace registration via 172.18.x.x must NOT be rejected in dev. +// SaaS already allowed this; dev mode now matches via +// `saas := saasMode() || devModeAllowsLoopback()` in isPrivateOrMetadataIP. +// +// Without this test, a future refactor that quietly drops the +// `|| devModeAllowsLoopback()` from line 130 wouldn't trip any test — +// the existing `TestIsSafeURL_DevMode_StillBlocksOtherRanges` only +// pins the security floor (metadata / TEST-NET / CGNAT), not the +// behavior change. +func TestIsSafeURL_DevMode_AllowsRFC1918(t *testing.T) { + // Make sure saasMode() returns false so the test exercises the + // devModeAllowsLoopback() branch specifically — not a SaaS-mode pass. + t.Setenv("MOLECULE_DEPLOY_MODE", "self-hosted") + t.Setenv("MOLECULE_ORG_ID", "") + t.Setenv("MOLECULE_ENV", "development") + + for _, url := range []string{ + "http://10.1.2.3/agent", + "http://172.18.0.42:8000/a2a", // the docker-compose case from the issue + "http://192.168.1.100/agent", + "http://[fd00::1]/agent", // IPv6 ULA fd00::/8 also relaxed + } { + if err := isSafeURL(url); err != nil { + t.Errorf("isSafeURL(%q) in dev mode: got %v, want nil", url, err) + } + } +} + // TestIsSafeURL_StrictMode_BlocksRFC1918 is the strict-mode counterpart to // TestIsSafeURL_SaaSMode_AllowsRFC1918. In self-hosted / single-container // deployments there is no legitimate reason to reach RFC-1918 agents, so the