fix(queue): re-fetch PR head before merge to detect stale SHA

If CI updates the PR head between the initial status check and the merge call, the queue might try to merge an outdated head. Add a pre-merge PR re-fetch that bails out if the head changed, letting the next tick re-evaluate with the current head. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
fix(queue): handle Gitea empty-body 200 on merge endpoint
2026-05-17 04:06:22 +00:00 · 2026-05-17 04:05:37 +00:00 · 2026-05-17 04:03:11 +00:00 · 2026-05-17 02:47:52 +00:00 · 2026-05-16 22:48:49 +00:00 · 2026-05-16 21:09:20 +00:00
24 changed files with 837 additions and 91 deletions
@@ -23,6 +23,7 @@ import dataclasses
 import json
 import os
 import sys
+import time
 import urllib.error
 import urllib.parse
 import urllib.request
@@ -326,6 +327,43 @@ def update_pull(pr_number: int, *, dry_run: bool) -> None:
    )


+def wait_for_ci(
+    head_sha: str,
+    contexts: list[str],
+    *,
+    max_wait_seconds: int = 300,
+    poll_interval: int = 15,
+) -> bool:
+    """Poll CI statuses for head_sha until all required contexts are terminal.
+
+    Returns True if all contexts reached 'success', False if timeout expired
+    (some still pending or failed).
+
+    Background: after a queue-triggered PR update, CI re-runs on the new head.
+    The queue must not update again until CI completes — otherwise the
+    update-then-wait loop keeps the PR in a perpetually-updating state where
+    CI never finishes on any single head.
+    """
+    deadline = time.time() + max_wait_seconds
+    while time.time() < deadline:
+        time.sleep(poll_interval)
+        try:
+            pr_status = get_combined_status(head_sha)
+        except Exception as exc:
+            sys.stderr.write(f"::warning::wait_for_ci: status fetch failed: {exc}\n")
+            continue
+        latest = latest_statuses_by_context(pr_status.get("statuses") or [])
+        ok, bad = required_contexts_green(latest, contexts)
+        if ok:
+            sys.stderr.write(f"::notice::wait_for_ci: all contexts green after {int(time.time() - (deadline - max_wait_seconds))}s\n")
+            return True
+        # Log progress
+        pending = [f"{c}={latest.get(c, {}).get('status', 'missing')}" for c in contexts if latest.get(c, {}).get('status') != 'success']
+        sys.stderr.write(f"::notice::wait_for_ci: still waiting ({int(deadline - time.time())}s left): {', '.join(pending[:3])}\n")
+    sys.stderr.write(f"::warning::wait_for_ci: timeout after {max_wait_seconds}s; proceeding with merge check\n")
+    return False
+
+
 def merge_pull(pr_number: int, *, dry_run: bool) -> None:
    payload = {
        "Do": "merge",
@@ -338,7 +376,24 @@ def merge_pull(pr_number: int, *, dry_run: bool) -> None:
    print(f"::notice::merging PR #{pr_number}")
    if dry_run:
        return
-    api("POST", f"/repos/{OWNER}/{NAME}/pulls/{pr_number}/merge", body=payload, expect_json=False)
+    # Gitea's merge endpoint returns HTTP 200 with an empty body on success.
+    # The generic api() wrapper raises ApiError on non-2xx, so a 200 with an
+    # empty body reaches the json.loads() path and raises JSONDecodeError,
+    # which api() re-raises as ApiError — making the queue think the merge
+    # failed when it actually succeeded.  Work around this by catching the
+    # expected JSONDecodeError here and treating it as success.
+    try:
+        api("POST", f"/repos/{OWNER}/{NAME}/pulls/{pr_number}/merge", body=payload, expect_json=False)
+    except ApiError as exc:
+        # Surface non-merge errors (5xx server errors, 403 forbidden, etc.)
+        if "merge" in str(exc).lower() or "405" in str(exc) or "409" in str(exc):
+            # 405 = PR not mergeable (already merged or CI still running by
+            #    the time we got here — the PR will be re-checked next tick)
+            # 409 = merge conflict detected at merge time
+            # In both cases the PR stays open and the next tick re-evaluates.
+            sys.stderr.write(f"::warning::merge call returned: {exc}\n")
+        else:
+            raise


 def process_once(*, dry_run: bool = False) -> int:
@@ -390,6 +445,32 @@ def process_once(*, dry_run: bool = False) -> int:
    print(f"::notice::PR #{pr_number} decision={decision.action}: {decision.reason}")
    if decision.action == "update":
        update_pull(pr_number, dry_run=dry_run)
+        # After an update, CI re-runs on the new head. If we check statuses
+        # immediately we see pending (CI not started yet on the new head), so
+        # the next tick updates again — CI never completes on any single head.
+        # Fix: re-fetch the PR to get the new head SHA, then poll CI for up
+        # to 5 min until all required contexts reach terminal state.  If CI
+        # finishes in time, proceed to merge on the same tick.
+        if not dry_run:
+            updated_pr = get_pull(pr_number)
+            new_head = updated_pr.get("head", {}).get("sha", "")
+            if new_head and new_head != head_sha:
+                sys.stderr.write(f"::notice::PR #{pr_number}: update created new head {new_head[:8]}; waiting for CI...\n")
+                waited = wait_for_ci(new_head, contexts, max_wait_seconds=300, poll_interval=15)
+                if waited:
+                    # CI completed — re-fetch main to confirm it hasn't moved,
+                    # then merge immediately without another update cycle.
+                    current_main_sha = get_branch_head(WATCH_BRANCH)
+                    if current_main_sha != main_sha:
+                        sys.stderr.write(f"::notice::PR #{pr_number}: main moved {main_sha[:8]} -> {current_main_sha[:8]}; deferring\n")
+                        return 0
+                    sys.stderr.write(f"::notice::PR #{pr_number}: CI complete; merging now\n")
+                    merge_pull(pr_number, dry_run=dry_run)
+                    return 0
+                else:
+                    sys.stderr.write(f"::warning::PR #{pr_number}: CI did not finish within 5 min; will retry next tick\n")
+            else:
+                sys.stderr.write(f"::notice::PR #{pr_number}: update did not change head SHA; will retry\n")
        post_comment(
            pr_number,
            (
@@ -400,6 +481,13 @@ def process_once(*, dry_run: bool = False) -> int:
        )
        return 0
    if decision.ready:
+        # Re-fetch PR to confirm head hasn't changed since we last checked
+        # (CI may have updated the head while we were evaluating).
+        current_pr = get_pull(pr_number)
+        current_head = current_pr.get("head", {}).get("sha", "")
+        if current_head != head_sha:
+            print(f"::notice::PR #{pr_number} head changed {head_sha[:8]} -> {current_head[:8]}; re-evaluating")
+            return 0
        latest_main_sha = get_branch_head(WATCH_BRANCH)
        if latest_main_sha != main_sha:
            print(
@@ -68,7 +68,7 @@ import sys
 import urllib.error
 import urllib.parse
 import urllib.request
-from typing import Any
+from typing import Any, Callable


 # ---------------------------------------------------------------------------
@@ -110,7 +110,7 @@ def normalize_slug(raw: str, numeric_aliases: dict[int, str] | None = None) -> s
 # for /sop-revoke (RFC#351 open question 4 — reason is captured but not
 # yet validated; future iteration may require a min-length).
 _DIRECTIVE_RE = re.compile(
-    r"^[ \t]*/(sop-ack|sop-revoke)[ \t]+([A-Za-z0-9_\- ]+?)(?:[ \t]+(.*))?[ \t]*$",
+    r"^[ \t]*/(sop-ack|sop-revoke|sop-n/a)[ \t]+([A-Za-z0-9_\- ]+?)(?:[ \t]+(.*))?[ \t]*$",
    re.MULTILINE,
 )

@@ -118,19 +118,21 @@ _DIRECTIVE_RE = re.compile(
 def parse_directives(
    comment_body: str,
    numeric_aliases: dict[int, str],
-) -> tuple[list[tuple[str, str, str]], list]:
-    """Extract /sop-ack and /sop-revoke directives from a comment body.
+) -> tuple[list[tuple[str, str, str]], list[tuple[str, str, str]]]:
+    """Extract /sop-ack, /sop-revoke, and /sop-n/a directives from a comment body.

-    Returns (directives, na_directives) where:
-      directives is a list of (kind, canonical_slug, note) tuples
-        kind is "sop-ack" or "sop-revoke"
-        canonical_slug is the normalized form (or "" if unparseable)
-        note is the trailing free-text (may be "")
-      na_directives is reserved for future N/A handling (always [] for now)
+    Returns (directives, na_directives) where each is a list of
+    (kind, canonical_slug, note) tuples:
+      kind is "sop-ack", "sop-revoke", or "sop-n/a"
+      canonical_slug is the normalized form (or "" if unparseable)
+      note is the trailing free-text (may be "")
+    The two lists are kept separate so call sites can unpack them
+    directly (e.g. directives, na_directives = parse_directives(...)).
    """
-    out: list[tuple[str, str, str]] = []
+    directives: list[tuple[str, str, str]] = []
+    na_directives: list[tuple[str, str, str]] = []
    if not comment_body:
-        return out, []
+        return directives, na_directives
    for m in _DIRECTIVE_RE.finditer(comment_body):
        kind = m.group(1)
        raw_slug = (m.group(2) or "").strip()
@@ -160,8 +162,12 @@ def parse_directives(
        note_from_group = (m.group(3) or "").strip()
        # If we collapsed multi-word slug into kebab and there's a
        # trailing-text group too, append it.
-        out.append((kind, canonical, note_from_group))
-    return out, []
+        entry = (kind, canonical, note_from_group)
+        if kind == "sop-n/a":
+            na_directives.append(entry)
+        else:
+            directives.append(entry)
+    return directives, na_directives


 # ---------------------------------------------------------------------------
@@ -174,8 +180,8 @@ def section_marker_present(body: str, marker: str) -> bool:
    on a non-empty line (i.e. the author actually filled it in).

    We require the marker substring AND non-whitespace content on the
-    same line OR within the next line — this prevents trivially-empty
-    checklists like:
+    same line OR within the next non-blank line — this prevents
+    trivially-empty checklists like:

        ## SOP-Checklist
        - [ ] **Comprehensive testing performed**:
@@ -184,9 +190,18 @@ def section_marker_present(body: str, marker: str) -> bool:
    from auto-passing the section-present check. The peer-ack is still
    required, but answering with empty content is captured as a soft
    finding via the section-present test alone.
+
+    NOTE: we scan forward through blank lines (the markdown-header pattern
+    is ## Header\\n\\ncontent) so that a header + blank-line + content
+    structure still satisfies the check. The backward checkbox fallback
+    catches inline markers without a preceding checkbox (mc#1099).
    """
    if not body or not marker:
        return False
+    # Strip trailing whitespace so the blank-line scan below can find
+    # content that appears on the very last line of the body (without
+    # being misled by a trailing \n or spaces).
+    body = body.rstrip()
    body_lower = body.lower()
    marker_lower = marker.lower()
    idx = body_lower.find(marker_lower)
@@ -202,13 +217,44 @@ def section_marker_present(body: str, marker: str) -> bool:
    stripped = re.sub(r"[\s\*:\-\[\]]+", "", line)
    if stripped:
        return True
-    # Fall through: check the NEXT line (multi-line answers).
-    next_line_end = body.find("\n", line_end + 1)
-    if next_line_end < 0:
-        next_line_end = len(body)
-    next_line = body[line_end + 1:next_line_end]
-    stripped_next = re.sub(r"[\s\*:\-\[\]]+", "", next_line)
-    return bool(stripped_next)
+    # Fall through: scan forward, skipping blank-only lines, until we find
+    # non-empty content or run out of body.  Handles:
+    #   ## Header          ← marker line (empty after marker)
+    #                      ← blank line (skipped)
+    #   - actual content   ← found
+    pos = line_end
+    while True:
+        # Skip the current newline and any additional newlines (blank lines).
+        while pos < len(body) and body[pos] == "\n":
+            pos += 1
+        if pos >= len(body):
+            break
+        line_end = body.find("\n", pos)
+        if line_end < 0:
+            line_end = len(body)
+        line = body[pos:line_end]
+        stripped = re.sub(r"[\s\*:\-\[\]]+", "", line)
+        if stripped:
+            return True
+        pos = line_end
+    # Last resort: the marker may appear mid-sentence (e.g.
+    # **Memory/saved-feedback consulted**: No applicable...).
+    # Search backward within the CURRENT LINE only (not preceding lines)
+    # to find a checkbox on the same line before the marker text.
+    # mc#1099 follow-up: memory-consulted detection was failing because
+    # the checkbox was on the same line before the inline marker.
+    _CHECKBOX_RE = re.compile(r"- \[[ x\]]|<input", re.IGNORECASE)
+    line_start = body.rfind("\n", 0, idx) + 1  # 0 if no newline before idx
+    before = body[line_start:idx]
+    m = _CHECKBOX_RE.search(before)
+    if not m:
+        return False
+    # Require meaningful content between the checkbox and the marker text
+    # (markdown formatting like ** or * must also be stripped).
+    # If only whitespace/markdown chars remain, the checkbox line is empty.
+    between = before[m.end() :]
+    stripped_between = re.sub(r"[\s\*:#\[\]_\-]+", "", between)
+    return bool(stripped_between)


 # ---------------------------------------------------------------------------
@@ -251,8 +297,7 @@ def compute_ack_state(
        user = (c.get("user") or {}).get("login", "")
        if not user:
            continue
-        directives, _na = parse_directives(body, numeric_aliases)
-        for kind, slug, _note in directives:
+        for kind, slug, _note in parse_directives(body, numeric_aliases)[0]:
            if not slug:
                unparseable_per_user[user] = unparseable_per_user.get(user, 0) + 1
                continue
@@ -304,6 +349,63 @@ def compute_ack_state(
    }


+# ---------------------------------------------------------------------------
+# N/A-gate evaluation
+# ---------------------------------------------------------------------------
+
+
+def compute_na_state(
+    comments: list[dict[str, Any]],
+    author: str,
+    na_gates: dict[str, Any],
+    probe: Callable[[str, list[str]], list[str]],
+) -> dict[str, dict[str, Any]]:
+    """Evaluate which N/A gates have a valid declaration from a team member.
+
+    Returns dict[gate_name, dict] where each dict has:
+      declared: bool — at least one valid non-author team-member declared N/A
+      decl_ackers: list[str] — usernames who declared this gate N/A
+      rejected: dict with keys:
+        not_in_team: list[str] — users who tried but aren't in required teams
+    """
+    # Build per-user latest N/A directive (most-recent wins per RFC#324).
+    latest_na: dict[str, tuple[str, str]] = {}  # user → (gate, note)
+    for c in comments:
+        body = c.get("body", "") or ""
+        user = (c.get("user") or {}).get("login", "")
+        if not user:
+            continue
+        for kind, gate, note in parse_directives(body, {})[1]:
+            # [1] = na_directives only
+            if gate in na_gates:
+                latest_na[user] = (gate, note)
+
+    result: dict[str, dict[str, Any]] = {}
+    for gate, gate_cfg in na_gates.items():
+        result[gate] = {
+            "declared": False,
+            "decl_ackers": [],
+            "rejected": {"not_in_team": []},
+        }
+        decl_ackers: list[str] = []
+        not_in_team: list[str] = []
+        for user, (g, _note) in latest_na.items():
+            if g != gate:
+                continue
+            if user == author:
+                continue  # authors cannot self-declare N/A
+            approved = probe(gate, [user])
+            if approved:
+                decl_ackers.append(user)
+            else:
+                not_in_team.append(user)
+        result[gate]["declared"] = bool(decl_ackers)
+        result[gate]["decl_ackers"] = decl_ackers
+        result[gate]["rejected"]["not_in_team"] = not_in_team
+
+    return result
+
+
 # ---------------------------------------------------------------------------
 # Gitea API client
 # ---------------------------------------------------------------------------
@@ -698,6 +800,7 @@ def main(argv: list[str] | None = None) -> int:
    cfg = load_config(args.config)
    items: list[dict[str, Any]] = cfg["items"]
    items_by_slug = {it["slug"]: it for it in items}
+    na_gates: dict[str, Any] = cfg.get("n/a_gates", {})
    numeric_aliases = {
        int(it["numeric_alias"]): it["slug"] for it in items if it.get("numeric_alias")
    }
@@ -818,6 +921,46 @@ def main(argv: list[str] | None = None) -> int:
        description=description, target_url=target_url,
    )
    print(f"::notice::status posted: {args.status_context} → {state}")
+
+    # --- N/A gate status (RFC#324 §N/A follow-up) ---
+    # Post a separate status so review-check.sh can discover N/A declarations
+    # and waive the Gitea-approve requirement for that gate.
+    na_state: dict[str, dict[str, Any]] = {}
+    if na_gates:
+        na_state = compute_na_state(comments, author, na_gates, probe)
+
+        na_descs: list[str] = []
+        for gate, s in na_state.items():
+            if s["declared"]:
+                na_descs.append(gate)
+            decl = s["decl_ackers"]
+            rej = s["rejected"]["not_in_team"]
+            if decl:
+                print(f"::notice::  [N/A OK] {gate} — declared by {','.join(decl)}")
+            if rej:
+                print(
+                    f"::notice::  [N/A REJ] {gate} — not-in-team: {','.join(rej)}",
+                    file=sys.stderr,
+                )
+
+        na_desc = ", ".join(sorted(na_descs)) if na_descs else "(none)"
+        na_status_state = "success" if na_descs else "pending"
+        # review-check.sh reads the description to discover which gates are N/A.
+        # Include the gate names so it can grep for them.
+        na_description = f"N/A: {na_desc}" if na_descs else "N/A: (none)"
+
+        if not args.dry_run:
+            client.post_status(
+                args.owner, args.repo, head_sha,
+                state=na_status_state,
+                context="sop-checklist / na-declarations (pull_request)",
+                description=na_description,
+                target_url=target_url,
+            )
+            print(
+                f"::notice::na-declarations status → {na_status_state}: {na_description}"
+            )
+
    # By default exit 0 — the POSTed status IS the gate, NOT the job
    # conclusion. If the job exits 1 BP will see TWO failure signals
    # (one from the job's auto-status, one from our POST), making the
@@ -551,3 +551,55 @@ class TestEndToEndAckFlow(unittest.TestCase):

 if __name__ == "__main__":
    unittest.main(verbosity=2)
+
+
+# ---------------------------------------------------------------------------
+# compute_na_state
+# ---------------------------------------------------------------------------
+
+
+class TestComputeNaState(unittest.TestCase):
+    """Tests for /sop-n/a directive evaluation."""
+
+    def test_no_na_declarations(self):
+        cfg = sop.load_config(CONFIG_PATH)
+        na_gates = cfg.get("n/a_gates", {})
+        comments = []
+        na_state = sop.compute_na_state(comments, "alice", na_gates, lambda *_: [])
+        self.assertFalse(na_state["qa-review"]["declared"])
+        self.assertFalse(na_state["security-review"]["declared"])
+
+    def test_na_declared_by_authorized_user(self):
+        cfg = sop.load_config(CONFIG_PATH)
+        na_gates = cfg.get("n/a_gates", {})
+        comments = [_comment("bob", "/sop-n/a qa-review N/A: pure tooling change")]
+        na_state = sop.compute_na_state(comments, "alice", na_gates, lambda g, u: u)
+        self.assertTrue(na_state["qa-review"]["declared"])
+        self.assertEqual(na_state["qa-review"]["decl_ackers"], ["bob"])
+
+    def test_na_declared_by_unauthorized_user_rejected(self):
+        cfg = sop.load_config(CONFIG_PATH)
+        na_gates = cfg.get("n/a_gates", {})
+        comments = [_comment("mallory", "/sop-n/a qa-review N/A: not real team")]
+        na_state = sop.compute_na_state(comments, "alice", na_gates, lambda g, u: [])
+        self.assertFalse(na_state["qa-review"]["declared"])
+        self.assertEqual(na_state["qa-review"]["rejected"]["not_in_team"], ["mallory"])
+
+    def test_author_cannot_self_declare_na(self):
+        cfg = sop.load_config(CONFIG_PATH)
+        na_gates = cfg.get("n/a_gates", {})
+        comments = [_comment("alice", "/sop-n/a qa-review N/A: I am the author")]
+        na_state = sop.compute_na_state(comments, "alice", na_gates, lambda g, u: u)
+        self.assertFalse(na_state["qa-review"]["declared"])
+
+    def test_parse_directives_separates_na_from_ack(self):
+        directives, na_directives = sop.parse_directives(
+            "/sop-ack comprehensive-testing\n/sop-n/a qa-review N/A: no surface",
+            {},
+        )
+        self.assertEqual(len(directives), 1)
+        self.assertEqual(directives[0][0], "sop-ack")
+        self.assertEqual(len(na_directives), 1)
+        self.assertEqual(na_directives[0][0], "sop-n/a")
+        self.assertEqual(na_directives[0][1], "qa-review")
+        self.assertIn("no surface", na_directives[0][2])
@@ -32,6 +32,12 @@ on:
  # iterating all open PRs when PR_NUMBER is empty.
  workflow_dispatch:

+# Cancel stale runs so the 8-runner pool stays available for PR jobs.
+# Per-SHA group ensures push and cron runs at different SHAs don't cancel each other.
+concurrency:
+  group: gate-check-v3-${{ github.event.pull_request.head.sha || github.sha }}
+  cancel-in-progress: true
+
 permissions:
  # read: contents — for checkout (base ref, not PR head for security)
  # read: pull-requests — for reading PR info via API
@@ -49,13 +49,17 @@ jobs:
  # bp-exempt: post-merge image publication side effect; CI / all-required gates source changes.
  build-and-push:
    name: Build & push canvas image
-    # REVERTED (infra/revert-docker-runner-label): `runs-on: ubuntu-latest` restored.
-    # The `docker` label is not registered on any act_runner. `runs-on: [ubuntu-latest, docker]`
-    # causes jobs to queue indefinitely with zero eligible runners — strictly worse than the
-    # pre-#599 coin-flip (50% success rate). Once the `docker` label is registered on
-    # ≥2 runners, re-apply the fix from #599 (infra/docker-runner-label).
-    # See issue #576 + infra-lead pulse ~00:30Z.
-    runs-on: ubuntu-latest
+    # Dedicated publish/release lane (internal#462 / #394 / #399). Ship
+    # path (on: push:main, canvas/**) — reserved capacity so a merged
+    # canvas fix's image build never FIFO-queues behind PR required-CI.
+    # The `publish` label resolves ONLY to the molecule-runner-publish-*
+    # sub-pool (config.publish.yaml). HARD DEPENDENCY: this MUST land
+    # AFTER the publish-lane runners are registered/advertising `publish`
+    # — the earlier #599 `docker` label attempt queued indefinitely with
+    # zero eligible runners precisely because the label was targeted
+    # before any runner advertised it (see #576). The lane is registered
+    # in this rollout (internal#462) so the precondition holds.
+    runs-on: publish
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
    # mc#774: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
@@ -66,7 +66,10 @@ concurrency:

 jobs:
  publish:
-    runs-on: ubuntu-latest
+    # Dedicated publish/release lane (internal#462 / #394 / #399). Ship
+    # path (on: push tag runtime-v*) — reserved capacity, never FIFO
+    # behind PR-CI. `publish` resolves only to molecule-runner-publish-*.
+    runs-on: publish
    outputs:
      version: ${{ steps.version.outputs.version }}
      wheel_sha256: ${{ steps.wheel_hash.outputs.wheel_sha256 }}
@@ -166,7 +169,9 @@ jobs:

  cascade:
    needs: publish
-    runs-on: ubuntu-latest
+    # Publish/release lane (internal#462) — downstream of the runtime
+    # publish ship job; keep it on the reserved lane too.
+    runs-on: publish
    steps:
      - name: Wait for PyPI to propagate the new version
        env:
@@ -54,7 +54,14 @@ env:

 jobs:
  build-and-push:
-    runs-on: ubuntu-latest
+    # Dedicated publish/release lane (internal#462 / #394 / #399). This
+    # is a post-merge ship job (on: push:main) — it must NOT FIFO-compete
+    # with PR required-CI on the shared pool (PR#1350's prod image build
+    # was delayed ~25min this way). The `publish` label resolves ONLY to
+    # the reserved molecule-runner-publish-* sub-pool (config.publish.yaml,
+    # OUTSIDE the managed 1..20 range) so a merged fix's image build
+    # starts immediately while PR-CI keeps the general pool.
+    runs-on: publish
    steps:
      - name: Checkout
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
@@ -181,7 +188,9 @@ jobs:
    name: Production auto-deploy
    needs: build-and-push
    if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }}
-    runs-on: ubuntu-latest
+    # Publish/release lane (internal#462) — production deploy of a merged
+    # fix; reserved capacity, never queued behind PR-CI.
+    runs-on: publish
    timeout-minutes: 75
    env:
      CP_URL: ${{ vars.PROD_CP_URL || 'https://api.moleculesai.app' }}
@@ -68,7 +68,10 @@ jobs:
  # bp-exempt: production redeploy is a side-effect workflow, not a merge gate.
  redeploy:
    if: ${{ github.event_name == 'workflow_dispatch' }}
-    runs-on: ubuntu-latest
+    # Dedicated publish/release lane (internal#462 / #394 / #399).
+    # Production tenant redeploy — a deploy action, reserved capacity so
+    # it never queues behind PR-CI. `publish` -> molecule-runner-publish-*.
+    runs-on: publish
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
    # mc#774: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
@@ -75,7 +75,10 @@ env:
 jobs:
  # bp-exempt: post-merge staging redeploy side effect; CI / all-required gates source changes.
  redeploy:
-    runs-on: ubuntu-latest
+    # Dedicated publish/release lane (internal#462 / #394 / #399).
+    # Post-merge staging redeploy — a deploy action, reserved capacity.
+    # `publish` -> molecule-runner-publish-* sub-pool.
+    runs-on: publish
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
    # mc#774: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
@@ -44,6 +44,12 @@ on:
      - ".github/scripts/lint_secret_pattern_drift.py"
      - ".githooks/pre-commit"

+# Cancel stale runs to keep the 8-runner pool available for PR jobs.
+# Per-SHA group ensures push and scheduled runs at different SHAs don't cancel each other.
+concurrency:
+  group: secret-pattern-drift-${{ github.event.pull_request.head.sha || github.sha }}
+  cancel-in-progress: true
+
 env:
  GITHUB_SERVER_URL: https://git.moleculesai.app

@@ -22,6 +22,11 @@ on:
    - cron: '17 4 * * 1'  # Mondays at 04:17 UTC
  workflow_dispatch:

+# Cancel stale runs to keep the 8-runner pool available for PR jobs.
+concurrency:
+  group: weekly-platform-go-${{ github.event.pull_request.head.sha || github.sha }}
+  cancel-in-progress: true
+
 permissions:
  contents: read
  statuses: write
@@ -30,10 +30,7 @@
    {"name": "openclaw", "repo": "molecule-ai/molecule-ai-workspace-template-openclaw", "ref": "main"},
    {"name": "codex", "repo": "molecule-ai/molecule-ai-workspace-template-codex", "ref": "main"},
    {"name": "langgraph", "repo": "molecule-ai/molecule-ai-workspace-template-langgraph", "ref": "main"},
-    {"name": "crewai", "repo": "molecule-ai/molecule-ai-workspace-template-crewai", "ref": "main"},
-    {"name": "autogen", "repo": "molecule-ai/molecule-ai-workspace-template-autogen", "ref": "main"},
-    {"name": "deepagents", "repo": "molecule-ai/molecule-ai-workspace-template-deepagents", "ref": "main"},
-    {"name": "gemini-cli", "repo": "molecule-ai/molecule-ai-workspace-template-gemini-cli", "ref": "main"}
+    {"name": "autogen", "repo": "molecule-ai/molecule-ai-workspace-template-autogen", "ref": "main"}
  ],
  "org_templates": [
    {"name": "molecule-dev", "repo": "molecule-ai/molecule-ai-org-template-molecule-dev", "ref": "main"},
@@ -0,0 +1,160 @@
+package handlers
+
+// Regression coverage for the POLL-mode arm of the canvas user-message
+// data-loss bug (internal#470 sibling — tracked on internal#471).
+//
+// Bug (reported 2026-05-16 by CTO Hongming): "in canvas i sometimes lose
+// my own message when i exit chat". The push-mode arm was fixed by
+// #1347 (persistUserMessageAtIngest — a SYNCHRONOUS, before-dispatch,
+// context.WithoutCancel INSERT). #1347's framing asserted "poll-mode
+// workspaces were never affected — logA2AReceiveQueued already persists
+// at ingest". That assertion is OVERSTATED.
+//
+// Hongming's tenant (slug `hongming`, org 2c940477-...) has 4 workspaces,
+// ALL runtime=external with empty URL → ALL delivery_mode=poll (proven
+// empirically: a benign A2A probe returns the synthetic
+// {"delivery_mode":"poll","status":"queued"} envelope for every one).
+// So his reported loss is the POLL path, NOT the push path #1347 fixes.
+//
+// Root cause (poll arm): the poll-mode short-circuit (a2a_proxy.go ~402)
+// calls logA2AReceiveQueued and then IMMEDIATELY returns the synthetic
+// 200 {status:"queued"} to the canvas. But logA2AReceiveQueued's durable
+// INSERT runs inside h.goAsync(...) — a DETACHED goroutine with NO
+// happens-before barrier against the HTTP response. The canvas sees 200
+// ("message accepted") while the activity_logs row may not yet be — and,
+// on a workspace-server restart / deploy / OOM / EC2 hibernation between
+// the 200 and the goroutine's commit, NEVER will be — durable. There is
+// also no fallback (unlike push-mode's legacy-INSERT fallback): a
+// swallowed LogActivity error loses the message with only a log line.
+// Chat-history reads activity_logs (postgres_store.go:165-187); a missing
+// row = message gone on reopen. That is exactly Hongming's symptom.
+//
+// Fix (parity with push-mode): the poll-mode ingest persist of the
+// canvas user message must be SYNCHRONOUS — committed before the queued
+// 200 is returned — on a context.WithoutCancel derived context, so a
+// client disconnect on chat-exit and a post-response restart cannot lose
+// it. Behavior is never worse than today (best-effort; a persist error
+// still returns queued).
+//
+// TEST DESIGN NOTE: sqlmock.ExpectationsWereMet() hangs indefinitely if
+// the expected query never fires. We use a select+default+time.After
+// pattern so the test FAILS fast (not hangs) when the production code
+// regresses to async (the INSERT never fires before handler returns),
+// while still returning promptly when all expectations are met. The
+// insertDelay is kept small (50ms) to minimise suite-level timing
+// impact under -race detection, where mock delays are amplified by
+// the instrumenter's goroutine overhead.
+
+import (
+	"bytes"
+	"encoding/json"
+	"net/http"
+	"net/http/httptest"
+	"testing"
+	"time"
+
+	"github.com/DATA-DOG/go-sqlmock"
+	"github.com/gin-gonic/gin"
+)
+
+// TestProxyA2A_PollMode_PersistsUserMessageSynchronouslyBeforeQueuedResponse
+// is the defining contract: for a poll-mode workspace, the canvas user
+// message MUST be durably INSERTed into activity_logs BEFORE the synthetic
+// queued 200 is returned to the client — with NO reliance on a detached
+// async goroutine completing later.
+//
+// The test proves the ordering by making the INSERT block briefly and
+// asserting the handler does NOT return until the INSERT has completed.
+// Pre-fix (INSERT in h.goAsync, response returned immediately) the
+// handler returns ~instantly while the INSERT is still pending in the
+// goroutine → the elapsed time is far below the injected INSERT delay and
+// ExpectationsWereMet() is racy/unmet at return. Post-fix (synchronous
+// persist before the queued response) the handler return is gated on the
+// INSERT, so elapsed >= the injected delay and the expectation is met
+// deterministically at return WITHOUT any waitAsyncForTest()/sleep.
+func TestProxyA2A_PollMode_PersistsUserMessageSynchronouslyBeforeQueuedResponse(t *testing.T) {
+	mock := setupTestDB(t)
+	setupTestRedis(t)
+	broadcaster := newTestBroadcaster()
+	handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir())
+
+	const wsID = "ws-poll-sync-persist"
+	// Keep delay small: -race detection amplifies mock delays significantly.
+	// A 50ms delay is sufficient to prove synchronous blocking (~50× the
+	// normal INSERT latency) without bloating the full ./... suite runtime.
+	const insertDelay = 50 * time.Millisecond
+
+	expectBudgetCheck(mock, wsID)
+
+	// lookupDeliveryMode → poll, triggering the short-circuit.
+	mock.ExpectQuery("SELECT delivery_mode FROM workspaces WHERE id").
+		WithArgs(wsID).
+		WillReturnRows(sqlmock.NewRows([]string{"delivery_mode"}).AddRow("poll"))
+
+	// workspace-name lookup inside logA2AReceiveQueued.
+	mock.ExpectQuery(`SELECT name FROM workspaces WHERE id`).
+		WithArgs(wsID).
+		WillReturnRows(sqlmock.NewRows([]string{"name"}).AddRow("Poll WS"))
+
+	// The durable user-message write. We delay it so a synchronous
+	// persist visibly gates the handler return; a detached-goroutine
+	// persist (pre-fix) does not. The fix must keep using
+	// context.WithoutCancel so this write survives a chat-exit cancel.
+	mock.ExpectExec("INSERT INTO activity_logs").
+		WillDelayFor(insertDelay).
+		WillReturnResult(sqlmock.NewResult(0, 1))
+
+	w := httptest.NewRecorder()
+	c, _ := gin.CreateTestContext(w)
+	c.Params = gin.Params{{Key: "id", Value: wsID}}
+
+	// callerID == "" (no X-Workspace-ID) → this is a canvas_user message,
+	// exactly Hongming's case.
+	body := `{"jsonrpc":"2.0","id":"poll-canvas-1","method":"message/send","params":{"message":{"role":"user","parts":[{"text":"my own message"}]}}}`
+	c.Request = httptest.NewRequest("POST", "/workspaces/"+wsID+"/a2a", bytes.NewBufferString(body))
+	c.Request.Header.Set("Content-Type", "application/json")
+
+	start := time.Now()
+	handler.ProxyA2A(c)
+	elapsed := time.Since(start)
+
+	// Defining assertion #1: the handler must not have returned the
+	// queued response before the durable INSERT committed. Pre-fix this
+	// fails (elapsed ≈ 0, INSERT still racing in goAsync).
+	if elapsed < insertDelay {
+		t.Fatalf("poll-mode queued response returned in %v, before the %v user-message INSERT — "+
+			"the message is not durable when the client/process goes away (DATA LOSS). "+
+			"Persist must be synchronous before the queued 200.", elapsed, insertDelay)
+	}
+
+	// Defining assertion #2: the durable write actually happened by the
+	// time the handler returned. ExpectionsWereMet() hangs indefinitely if
+	// the mock never fires (e.g. production code regressed to async),
+	// so we check it in a goroutine with a hard 2s timeout — fails fast
+	// (no CI hang) on regression while returning promptly on success.
+	expectDone := make(chan error, 1)
+	go func() { expectDone <- mock.ExpectationsWereMet() }()
+	select {
+	case err := <-expectDone:
+		if err != nil {
+			t.Fatalf("user-message INSERT was not durable at handler return (unmet sqlmock expectations): %v", err)
+		}
+	case <-time.After(2 * time.Second):
+		t.Fatalf("ExpectationsWereMet() hung for >2s — INSERT mock never fired. " +
+			"Likely cause: production code regressed logA2AReceiveQueued to goAsync " +
+			"(INSERT fires after handler returns, not before).")
+	}
+
+	// Sanity: still the correct poll-mode envelope + status.
+	if w.Code != http.StatusOK {
+		t.Fatalf("expected 200 (queued), got %d: %s", w.Code, w.Body.String())
+	}
+	var resp map[string]interface{}
+	if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
+		t.Fatalf("response is not valid JSON: %v", err)
+	}
+	if resp["status"] != "queued" || resp["delivery_mode"] != "poll" {
+		t.Errorf("poll envelope changed: got status=%v delivery_mode=%v, want queued/poll",
+			resp["status"], resp["delivery_mode"])
+	}
+}
@@ -504,25 +504,49 @@ func lookupDeliveryMode(ctx context.Context, workspaceID string) string {
 // reads in PR 3 — that's how a poll-mode workspace receives inbound A2A
 // without a public URL.
 func (h *WorkspaceHandler) logA2AReceiveQueued(ctx context.Context, workspaceID, callerID string, body []byte, a2aMethod string) {
+	// DATA-LOSS FIX (internal#471 — poll-mode sibling of #1347/internal#470):
+	// this is the ONLY durable write of a poll-mode inbound message,
+	// including a canvas_user message (callerID == "") typed in the canvas
+	// chat. It MUST be SYNCHRONOUS and complete BEFORE the caller returns
+	// the synthetic {status:"queued"} 200 — otherwise the canvas sees the
+	// send acknowledged while the activity_logs row is still racing in a
+	// detached goroutine, and a workspace-server restart / deploy / OOM /
+	// EC2 hibernation between the 200 and the goroutine's commit loses the
+	// user's message permanently (chat-history reads activity_logs, so a
+	// missing row = message gone on reopen). Hongming's tenant is entirely
+	// poll-mode (4 external workspaces, no URL — verified empirically), so
+	// his reported loss is THIS path; #1347 (push-mode, persists AFTER the
+	// poll short-circuit) structurally cannot cover it.
+	//
+	// Mirrors persistUserMessageAtIngest's discipline:
+	//   - context.WithoutCancel: a client disconnect on chat-exit (which
+	//     cancels the inbound request ctx) MUST NOT abort this write.
+	//   - SYNCHRONOUS (no goAsync): the row must be durable before the
+	//     queued 200 is returned to the caller.
+	//   - Best-effort: LogActivity already logs+swallows INSERT errors, so
+	//     a hiccup never blocks or fails the user's send (behavior for
+	//     that one request is never worse than the pre-fix async path).
+	// The post-commit broadcast still fires inside LogActivity; a missed
+	// WebSocket event is not data loss (the durable row is the truth the
+	// canvas re-reads on reopen).
+	insCtx, cancel := context.WithTimeout(context.WithoutCancel(ctx), 30*time.Second)
+	defer cancel()
+
 	var wsName string
-	db.DB.QueryRowContext(ctx, `SELECT name FROM workspaces WHERE id = $1`, workspaceID).Scan(&wsName)
+	db.DB.QueryRowContext(insCtx, `SELECT name FROM workspaces WHERE id = $1`, workspaceID).Scan(&wsName)
 	if wsName == "" {
 		wsName = workspaceID
 	}
 	summary := a2aMethod + " → " + wsName + " (queued for poll)"
-	h.goAsync(func() {
-		logCtx, cancel := context.WithTimeout(context.WithoutCancel(ctx), 30*time.Second)
-		defer cancel()
-		LogActivity(logCtx, h.broadcaster, ActivityParams{
-			WorkspaceID:  workspaceID,
-			ActivityType: "a2a_receive",
-			SourceID:     nilIfEmpty(callerID),
-			TargetID:     &workspaceID,
-			Method:       &a2aMethod,
-			Summary:      &summary,
-			RequestBody:  json.RawMessage(body),
-			Status:       "ok",
-		})
+	LogActivity(insCtx, h.broadcaster, ActivityParams{
+		WorkspaceID:  workspaceID,
+		ActivityType: "a2a_receive",
+		SourceID:     nilIfEmpty(callerID),
+		TargetID:     &workspaceID,
+		Method:       &a2aMethod,
+		Summary:      &summary,
+		RequestBody:  json.RawMessage(body),
+		Status:       "ok",
 	})
 }

@@ -44,8 +44,8 @@ func NewWorkspaceImageService(docker *dockerclient.Client) *WorkspaceImageServic
 // AllRuntimes is the canonical list mirroring docs/workspace-runtime-package.md.
 // Update both when a new template is added.
 var AllRuntimes = []string{
-	"claude-code", "langgraph", "crewai", "autogen",
-	"deepagents", "hermes", "gemini-cli", "openclaw",
+	"claude-code", "langgraph", "autogen",
+	"hermes", "openclaw",
 }

 // RefreshResult is the per-call outcome surfaced to HTTP callers AND logged
@@ -23,8 +23,8 @@ package models
 //   - claude-code: "sonnet" — Anthropic's CLI accepts the short
 //     name and resolves it via the operator's anthropic-oauth or
 //     ANTHROPIC_API_KEY chain.
-//   - everything else (hermes, langgraph, crewai, autogen, deepagents,
-//     codex, openclaw, gemini-cli, external, ""): a fully-qualified
+//   - everything else (hermes, langgraph, autogen, codex, openclaw,
+//     external, ""): a fully-qualified
 //     vendor:model slug that the universal MODEL_PROVIDER chain in
 //     molecule-core PR #247 can route via per-vendor required_env.
 //
@@ -21,12 +21,9 @@ func TestDefaultModel(t *testing.T) {
 		// as a generic "unknown" failure.
 		{"hermes", "anthropic:claude-opus-4-7"},
 		{"langgraph", "anthropic:claude-opus-4-7"},
-		{"crewai", "anthropic:claude-opus-4-7"},
 		{"autogen", "anthropic:claude-opus-4-7"},
-		{"deepagents", "anthropic:claude-opus-4-7"},
 		{"codex", "anthropic:claude-opus-4-7"},
 		{"openclaw", "anthropic:claude-opus-4-7"},
-		{"gemini-cli", "anthropic:claude-opus-4-7"},
 		{"external", "anthropic:claude-opus-4-7"},

 		// Unknown / empty — fall through to universal default rather
@@ -190,7 +190,7 @@ func TestEnsureLocalImage_RepoNotFound(t *testing.T) {
 	opts.HTTPClient = srv.Client()
 	opts.remoteHeadSha = nil // exercise real HTTP path

-	_, err := ensureLocalImageWithOpts(context.Background(), "crewai", opts)
+	_, err := ensureLocalImageWithOpts(context.Background(), "hermes", opts)
 	if err == nil {
 		t.Fatalf("expected error, got nil")
 	}
@@ -35,6 +35,19 @@ import (
 // drift-risk #6.
 var ErrNoBackend = errors.New("provisioner: no backend configured (zero-valued receiver)")

+// ErrUnresolvableRuntime is returned by selectImage when a workspace
+// names a runtime that has no resolvable image (not in RuntimeImages and
+// no operator-pinned cfg.Image). RFC internal#483 + security review 4269:
+// previously such a request silently fell through to DefaultImage
+// (langgraph) — a user asking for crewai would get a langgraph container
+// with no signal. The CTO standing directive
+// (feedback_platform_must_hardgate_base_contract) is fail-closed: a
+// named-but-unresolvable runtime must reject with a structured,
+// runtime-naming error so the existing provision-failed notify/log path
+// surfaces it, NOT silently degrade. The genuinely-unspecified (empty)
+// runtime is still a distinct, legitimate path that keeps DefaultImage.
+var ErrUnresolvableRuntime = errors.New("provisioner: requested runtime has no resolvable image")
+
 // RuntimeImages maps runtime names to their Docker image refs.
 // Each standalone template repo publishes its image via the reusable
 // publish-template-image workflow in molecule-ci on every main merge.
@@ -104,20 +117,33 @@ type WorkspaceConfig struct {
 // selectImage resolves the final Docker image ref for a workspace. The handler
 // layer is the source of truth — if it set cfg.Image (the digest-pinned form
 // from runtime_image_pins, #2272), honor that. Otherwise fall back to the
-// runtime→tag lookup in RuntimeImages (legacy `:latest` behavior). When the
-// runtime isn't recognized either, fall back to DefaultImage so Start() still
-// has something to hand Docker — surfacing a "No such image" later is more
-// actionable than a silent "" panic in ContainerCreate.
-func selectImage(cfg WorkspaceConfig) string {
+// runtime→tag lookup in RuntimeImages (legacy `:latest` behavior).
+//
+// Fail-closed contract (RFC internal#483 / security review 4269 /
+// feedback_platform_must_hardgate_base_contract): if the workspace NAMES a
+// runtime that resolves to no image (not in RuntimeImages, no pinned
+// cfg.Image), reject with ErrUnresolvableRuntime instead of silently
+// substituting DefaultImage. Pre-fix, removing crewai/deepagents/gemini-cli
+// from the catalog left those create requests silently provisioning a
+// langgraph container — the user asked for crewai and got langgraph with no
+// signal. The error propagates through Start → markProvisionFailed, which
+// already broadcasts WorkspaceProvisionFailed and records the message.
+//
+// The genuinely-unspecified runtime (empty cfg.Runtime, e.g. an org template
+// that doesn't pin one) is an intended distinct path and still resolves to
+// DefaultImage — only a NAMED-but-unresolvable runtime is rejected.
+func selectImage(cfg WorkspaceConfig) (string, error) {
 	if cfg.Image != "" {
-		return cfg.Image
+		return cfg.Image, nil
 	}
 	if cfg.Runtime != "" {
 		if img, ok := RuntimeImages[cfg.Runtime]; ok {
-			return img
+			return img, nil
 		}
+		return "", fmt.Errorf("%w: runtime %q (known runtimes: %v)",
+			ErrUnresolvableRuntime, cfg.Runtime, knownRuntimes)
 	}
-	return DefaultImage
+	return DefaultImage, nil
 }

 // Workspace-access constants for #65. Matches the CHECK constraint on
@@ -336,7 +362,15 @@ func (p *Provisioner) Start(ctx context.Context, cfg WorkspaceConfig) (string, e

 	env := buildContainerEnv(cfg)

-	image := selectImage(cfg)
+	image, imgErr := selectImage(cfg)
+	if imgErr != nil {
+		// Fail-closed: a named-but-unresolvable runtime must not silently
+		// become DefaultImage (RFC internal#483 / review 4269). The caller's
+		// error path (markProvisionFailed) broadcasts the failure + records
+		// the message so the canvas surfaces it.
+		log.Printf("Provisioner: refusing to start %s: %v", cfg.WorkspaceID, imgErr)
+		return "", imgErr
+	}

 	// Local-build mode (issue #63 / Task #194): when MOLECULE_IMAGE_REGISTRY
 	// is unset, the OSS contributor path skips the registry pull entirely
@@ -513,7 +513,10 @@ func TestWorkspaceConfig_ResetClaudeSessionFieldPresent(t *testing.T) {
 // we lose the "one bad publish doesn't break every workspace" guarantee.
 func TestSelectImage_PrefersExplicitImage(t *testing.T) {
 	pinned := "ghcr.io/molecule-ai/workspace-template-claude-code@sha256:3d6761a97ed07d7d33cfc19a8fbab81175d9d9179618d493dbc00c5f7ef076a3"
-	got := selectImage(WorkspaceConfig{Runtime: "claude-code", Image: pinned})
+	got, err := selectImage(WorkspaceConfig{Runtime: "claude-code", Image: pinned})
+	if err != nil {
+		t.Fatalf("selectImage with cfg.Image=pinned: unexpected error %v", err)
+	}
 	if got != pinned {
 		t.Errorf("selectImage with cfg.Image=pinned: got %q, want %q", got, pinned)
 	}
@@ -523,28 +526,46 @@ func TestSelectImage_PrefersExplicitImage(t *testing.T) {
 // pin lookup deliberately bypassed via WORKSPACE_IMAGE_LOCAL_OVERRIDE).
 // selectImage must use the legacy runtime→:latest map.
 func TestSelectImage_FallsBackToRuntimeMap(t *testing.T) {
-	got := selectImage(WorkspaceConfig{Runtime: "claude-code", Image: ""})
+	got, err := selectImage(WorkspaceConfig{Runtime: "claude-code", Image: ""})
+	if err != nil {
+		t.Fatalf("selectImage with empty Image: unexpected error %v", err)
+	}
 	want := RuntimeImages["claude-code"]
 	if got != want {
 		t.Errorf("selectImage with empty Image: got %q, want %q", got, want)
 	}
 }

-// TestSelectImage_UnknownRuntimeFallsBackToDefault preserves today's
-// behavior — an unrecognized runtime resolves to DefaultImage rather than
-// "" so ContainerCreate gets a usable arg and surfaces a meaningful
-// "No such image" error if the default itself is missing.
-func TestSelectImage_UnknownRuntimeFallsBackToDefault(t *testing.T) {
-	got := selectImage(WorkspaceConfig{Runtime: "no-such-runtime"})
-	if got != DefaultImage {
-		t.Errorf("selectImage with unknown runtime: got %q, want DefaultImage %q", got, DefaultImage)
+// TestSelectImage_NamedUnresolvableRuntimeRejects pins the fail-closed
+// contract (RFC internal#483 / security review 4269 /
+// feedback_platform_must_hardgate_base_contract): a NAMED runtime with no
+// resolvable image must reject with ErrUnresolvableRuntime, NOT silently
+// substitute DefaultImage. Pre-fix this returned langgraph — a user asking
+// for a removed runtime (crewai/deepagents/gemini-cli) silently got a
+// langgraph container. "crewai" is the concrete regression from the
+// security finding.
+func TestSelectImage_NamedUnresolvableRuntimeRejects(t *testing.T) {
+	for _, rt := range []string{"no-such-runtime", "crewai", "deepagents", "gemini-cli"} {
+		got, err := selectImage(WorkspaceConfig{Runtime: rt})
+		if !errors.Is(err, ErrUnresolvableRuntime) {
+			t.Errorf("selectImage(%q): got err %v, want ErrUnresolvableRuntime", rt, err)
+		}
+		if got != "" {
+			t.Errorf("selectImage(%q): got image %q, want \"\" on reject", rt, got)
+		}
+		if err != nil && !strings.Contains(err.Error(), rt) {
+			t.Errorf("selectImage(%q): error must name the offending runtime, got %v", rt, err)
+		}
 	}
 }

 // TestSelectImage_EmptyRuntimeFallsBackToDefault: same invariant for the
 // no-runtime-supplied path (legacy callers / older handler code).
 func TestSelectImage_EmptyRuntimeFallsBackToDefault(t *testing.T) {
-	got := selectImage(WorkspaceConfig{})
+	got, err := selectImage(WorkspaceConfig{})
+	if err != nil {
+		t.Fatalf("selectImage with zero cfg: unexpected error %v (empty runtime is a legitimate DefaultImage path)", err)
+	}
 	if got != DefaultImage {
 		t.Errorf("selectImage with zero cfg: got %q, want DefaultImage %q", got, DefaultImage)
 	}
@@ -808,7 +829,7 @@ func TestIsImageNotFoundErr(t *testing.T) {
 		{"nil", nil, false},
 		{"moby no such image", fmtErr(`Error response from daemon: No such image: workspace-template:openclaw`), true},
 		{"no such image lowercase", fmtErr(`error: no such image: foo:bar`), true},
-		{"image not found", fmtErr(`Error: image "workspace-template:crewai" not found`), true},
+		{"image not found", fmtErr(`Error: image "workspace-template:hermes" not found`), true},
 		{"generic not found without image", fmtErr(`container not found`), false},
 		{"unrelated error", fmtErr(`connection refused`), false},
 		{"permission denied", fmtErr(`permission denied`), false},
@@ -21,9 +21,6 @@ var knownRuntimes = []string{
 	"autogen",
 	"claude-code",
 	"codex",
-	"crewai",
-	"deepagents",
-	"gemini-cli",
 	"hermes",
 	"langgraph",
 	"openclaw",
@@ -53,8 +53,8 @@ func TestRuntimeImage_AllKnownRuntimes(t *testing.T) {
 		}
 	}
 	// Pin the count so adding a runtime requires explicit test acknowledgement.
-	if len(knownRuntimes) != 9 {
-		t.Errorf("knownRuntimes length = %d, want 9 (autogen, claude-code, codex, crewai, deepagents, gemini-cli, hermes, langgraph, openclaw)", len(knownRuntimes))
+	if len(knownRuntimes) != 6 {
+		t.Errorf("knownRuntimes length = %d, want 6 (autogen, claude-code, codex, hermes, langgraph, openclaw)", len(knownRuntimes))
 	}
 }

@@ -431,6 +431,43 @@ def _is_self_notify_row(row: dict[str, Any]) -> bool:
    return source_id is None or source_id == ""


+def _is_self_echo_row(row: dict[str, Any], workspace_id: str) -> bool:
+    """Return True if ``row`` is a self-originated a2a_receive row.
+
+    Internal #469: when a workspace delegates to a target that never picks
+    up the task, ``tool_delegate_task`` calls ``report_activity`` which
+    POSTs to the platform with source_id set to the *sender's* workspace
+    UUID (mandated by spoof-defense in workspace-server's a2a_proxy). The
+    activity API exposes that row under type=a2a_receive, so the inbox
+    poller re-fetches it. Without this guard the row is surfaced as
+    kind='peer_agent' with the workspace's own identity as peer_id —
+    the workspace sees its own delegation-failure echoed back as if a
+    peer had delegated to it.
+
+    The guard mirrors the existing _is_self_notify_row pattern: both
+    skip rows that would otherwise create spurious inbound signal. The
+    long-term fix (making the platform write a distinct activity_type
+    for agent-outbound rows) is tracked separately; this guard stays
+    because it only excludes rows the agent never wants.
+
+    ``workspace_id`` must be non-empty — an empty-string workspace_id
+    (single-workspace legacy path) can never match a UUID source_id, so
+    the predicate is always False there, which is safe.
+
+    RFC #2829 PR-2 note: rows with method="delegate_result" are excluded
+    from the self-echo guard even when source_id matches our workspace_id.
+    The platform may write a delegation-result row with source_id set to
+    our workspace_id (e.g. a self-delegation or edge case in the platform's
+    result-writing path). Such rows must reach the inbox so that
+    message_from_activity can surface them as peer_agent inbound and the
+    runtime receives the delegation result. Silently filtering them as
+    self-echo would break delegation result delivery.
+    """
+    if not workspace_id:
+        return False
+    return row.get("source_id") == workspace_id and row.get("method") != "delegate_result"
+
+
 def message_from_activity(row: dict[str, Any]) -> InboxMessage:
    """Convert one /activity row into an InboxMessage.

@@ -623,6 +660,16 @@ def _poll_once(
            # the same self-notify on every iteration.
            last_id = str(row.get("id", "")) or last_id
            continue
+        if _is_self_echo_row(row, workspace_id):
+            # Internal #469: tool_delegate_task writes its own a2a_receive
+            # row with source_id = this workspace's UUID (spoof-defense).
+            # The poll fetches it back as kind='peer_agent', making the
+            # workspace echo its own delegation-failure as an inbound from
+            # a phantom peer. Skip it — the real delegation-result path
+            # (delegate_result push) is separate and unaffected. Cursor
+            # still advances so the next poll doesn't re-seen this row.
+            last_id = str(row.get("id", "")) or last_id
+            continue
        message = message_from_activity(row)
        if not message.activity_id:
            continue
@@ -495,6 +495,151 @@ def test_poll_once_skips_self_notify_rows(state: inbox.InboxState):
    assert [m.activity_id for m in queue] == ["act-real"]


+# ---------------------------------------------------------------------------
+# _is_self_echo_row — internal #469 fix
+# ---------------------------------------------------------------------------
+#
+# When a workspace delegates to a target that never picks up the task,
+# tool_delegate_task calls report_activity("a2a_receive", ...) which POSTs
+# to the platform with source_id set to the *sender's* workspace UUID
+# (spoof-defense). The activity API returns that row under type=a2a_receive
+# on the next poll, so message_from_activity sets peer_id = workspace's own
+# UUID — the workspace sees its own delegation-failure as an inbound from
+# a phantom peer. _is_self_echo_row guards against this.
+#
+# Internal #469 was live-reproduced on hongming.moleculesai.app 2026-05-16.
+
+
+def test_is_self_echo_row_true_when_source_id_matches_workspace():
+    row = {"source_id": "ws-abc123", "method": "a2a_receive"}
+    assert inbox._is_self_echo_row(row, "ws-abc123") is True
+
+
+def test_is_self_echo_row_false_when_source_id_differs():
+    """A real peer agent (different workspace_id) must NOT be filtered."""
+    row = {"source_id": "ws-peer", "method": "a2a_receive"}
+    assert inbox._is_self_echo_row(row, "ws-1") is False
+
+
+def test_is_self_echo_row_false_when_source_id_is_none():
+    """Canvas-user inbound has no source_id — never an echo."""
+    row = {"source_id": None, "method": "a2a_receive"}
+    assert inbox._is_self_echo_row(row, "ws-1") is False
+
+
+def test_is_self_echo_row_false_when_workspace_id_is_empty():
+    """Single-workspace legacy path with empty workspace_id cannot
+    match a UUID source_id — predicate is always False, which is safe."""
+    row = {"source_id": "ws-abc123", "method": "a2a_receive"}
+    assert inbox._is_self_echo_row(row, "") is False
+
+
+def test_is_self_echo_row_false_when_source_id_key_absent():
+    row = {"method": "a2a_receive"}
+    assert inbox._is_self_echo_row(row, "ws-1") is False
+
+
+def test_is_self_echo_row_false_for_delegate_result():
+    """RFC #2829 PR-2 regression pin: a row with source_id matching our
+    workspace_id but method=delegate_result must NOT be filtered as a
+    self-echo. The platform may write a delegation-result row with our
+    workspace_id as source_id; such rows must reach the inbox so the
+    runtime receives the delegation result. Silently filtering them would
+    break delegate_result delivery."""
+    row = {"source_id": "ws-1", "method": "delegate_result"}
+    assert inbox._is_self_echo_row(row, "ws-1") is False
+
+
+def test_poll_once_skips_self_echo_rows(state: inbox.InboxState):
+    """Internal #469 regression pin: a row with source_id matching our
+    workspace_id must NOT land in the inbox queue — it is our own
+    delegation-report echoing back, not a real peer inbound."""
+    rows = [
+        {
+            "id": "act-real-peer",
+            "source_id": "ws-peer",
+            "method": "a2a_receive",
+            "summary": None,
+            "request_body": {"parts": [{"type": "text", "text": "real peer inbound"}]},
+            "created_at": "2026-04-30T22:00:00Z",
+        },
+        {
+            "id": "act-self-echo",
+            "source_id": "ws-1",
+            "method": "a2a_receive",
+            "summary": "task result: target timed out",
+            "request_body": None,
+            "created_at": "2026-04-30T22:00:01Z",
+        },
+    ]
+    resp = _make_response(200, rows)
+    p, _ = _patch_httpx(resp)
+    with p:
+        n = inbox._poll_once(state, "http://platform", "ws-1", {})
+
+    # Only the real peer inbound counted; self-echo silently dropped.
+    assert n == 1
+    queue = state.peek(10)
+    assert [m.activity_id for m in queue] == ["act-real-peer"]
+    assert queue[0].peer_id == "ws-peer"
+
+
+def test_poll_once_advances_cursor_past_self_echo(state: inbox.InboxState):
+    """Cursor must advance past self-echo rows even though we don't
+    enqueue them. Otherwise the next poll re-fetches the same self-echo
+    on every iteration, wasting requests and blocking real inbound."""
+    state.save_cursor("act-old")
+    rows = [
+        {
+            "id": "act-self-echo",
+            "source_id": "ws-1",
+            "method": "a2a_receive",
+            "summary": "task result: timeout",
+            "request_body": None,
+            "created_at": "2026-04-30T22:00:00Z",
+        },
+    ]
+    resp = _make_response(200, rows)
+    p, _ = _patch_httpx(resp)
+    with p:
+        n = inbox._poll_once(state, "http://platform", "ws-1", {})
+
+    assert n == 0
+    assert state.peek(10) == []
+    # Cursor must move past the skipped row so we don't re-poll it.
+    assert state.load_cursor() == "act-self-echo"
+
+
+def test_poll_once_self_echo_does_not_fire_notification(state: inbox.InboxState):
+    """The notification callback (channel push to Claude Code etc.)
+    must not fire for self-echo rows. Same rationale as self-notify:
+    push-capable hosts would see the echo loop on the push channel."""
+    rows = [
+        {
+            "id": "act-self-echo",
+            "source_id": "ws-1",
+            "method": "a2a_receive",
+            "summary": "task result: timeout",
+            "request_body": None,
+            "created_at": "2026-04-30T22:00:00Z",
+        },
+    ]
+    received: list[dict] = []
+    inbox.set_notification_callback(received.append)
+    try:
+        resp = _make_response(200, rows)
+        p, _ = _patch_httpx(resp)
+        with p:
+            inbox._poll_once(state, "http://platform", "ws-1", {})
+    finally:
+        inbox.set_notification_callback(None)
+
+    assert received == [], (
+        "self-echo rows must not surface as MCP notifications — "
+        "doing so re-creates the echo loop on push-capable hosts"
+    )
+
+
 def test_poll_once_advances_cursor_past_self_notify(state: inbox.InboxState):
    """Cursor must advance past self-notify rows even though we don't
    enqueue them. Otherwise the next poll re-fetches the same self-