test(e2e): keyless required-lane coverage for mock runtime + terminal/webhooks/budget/checkpoints/audit/traces/session-search/rescue/billing-mode/resume/hibernate + wire orphaned secrets-dispatch

Closes coverage-audit gaps for CI-coverable, keyless feature endpoints that had NO e2e assertion in the required `E2E API Smoke Test` lane. New: tests/e2e/test_keyless_feature_contracts_e2e.sh — a self-contained, hermetic script (runtime=external fixture, NO LLM key) asserting the real HTTP contract + a meaningful failure mode for each endpoint: * GET /workspaces/:id/terminal/diagnose — 200 report / 401 no-auth (the /terminal WS-upgrade sibling that is HTTP-assertable keyless) * POST /webhooks/:type (public) — 200 ignored / 400 bad-json / 404 unknown * GET /workspaces/:id/budget + PATCH — periods view / set+persist / 400 / 401 * /workspaces/:id/checkpoints* — upsert→latest→list→delete→404 / 400 / 401 * GET /workspaces/:id/audit — total0+chain_valid null / 400 bad-from / 401 * GET /workspaces/:id/traces — 200 [] without Langfuse / 401 * GET /workspaces/:id/session-search — q-filter hit / [] miss / 401 * GET /workspaces/:id/rescue — fail-closed 503 (no MOLECULE_ORG_ID) / 401 * GET/PUT /admin/workspaces/:id/llm-billing-mode — flip byok+readback / 400 ×3 * Lifecycle pause→resume + hibernate — transitions / 404 wrong-state / 401 Auth model mirrors wsauth_middleware.go: WorkspaceAuth is strict (401 without bearer once a token exists), AdminAuth accepts the platform ADMIN_TOKEN OR the workspace bearer (Tier-3) — so the script is green in BOTH the current no-ADMIN_TOKEN CI shape and the post-#2286 ADMIN_TOKEN shape (proven locally, 48/48 each). Mock-runtime A2A canned round-trip is left to #2286's mock arm (not duplicated). Does not touch e2e-api.yml admin-auth wiring or test_priority_runtimes runtime arms (#2286 owns those) — only adds run steps. Wire: tests/e2e/test_secrets_dispatch.sh was orphaned (no workflow ran it). Added as a required-lane step. It is hermetic (extracts + runs the SECRETS_JSON branch-order block in isolation; no platform/bearer/network), guarding the 2026-05-03 "wrong LLM-key shape wins" incident class. Proof: local PG+Redis+platform-server (CI shape), all three scripts GREEN in lane order under both auth shapes; bash -n + shellcheck clean. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Merge pull request 'feat(e2e): #2261 Gap1 live take-control e2e (acquire→WS upgrade→real frame)' (#2275 ) from feat/2261-gap1-takecontrol-e2e into main
2026-06-05 01:04:35 -07:00 · 2026-06-05 05:06:02 +00:00 · 2026-06-05 04:52:13 +00:00 · 2026-06-05 04:52:09 +00:00 · 2026-06-05 04:51:19 +00:00 · 2026-06-05 04:51:09 +00:00
112 changed files with 11177 additions and 554 deletions
@@ -8,7 +8,8 @@ pair diverges.
 Sources:
  A. `.gitea/workflows/ci.yml` jobs  (CI source — the actual job set)
  B. `status_check_contexts` in branch_protections (the merge gate)
-  C. `REQUIRED_CHECKS` env in audit-force-merge.yml (the audit env)
+  C. `REQUIRED_CHECKS_JSON` (preferred) or `REQUIRED_CHECKS` (legacy)
+     env in audit-force-merge.yml (the audit env)

 Three failure classes:
  F1  Job in (A) is not under the sentinel's `needs:` — sentinel
@@ -250,13 +251,21 @@ def sentinel_needs(ci_doc: dict) -> set[str]:
    return set(needs)


-def required_checks_env(audit_doc: dict) -> set[str]:
-    """Pull the REQUIRED_CHECKS env value from audit-force-merge.yml.
+def required_checks_env(audit_doc: dict, branch: str) -> set[str]:
+    """Pull the required-checks env value from audit-force-merge.yml.
+
    Walks the YAML AST per `feedback_behavior_based_ast_gates`: we do
-    NOT grep for `REQUIRED_CHECKS:` — that breaks under reformatting,
+    NOT grep for env keys — that breaks under reformatting,
    multi-job workflows, or a future move of the env to a different
-    step. Instead, look inside every job's every step's `env:` map."""
-    found: list[str] = []
+    step. Instead, look inside every job's every step's `env:` map.
+
+    Supports two variants:
+      - REQUIRED_CHECKS_JSON (preferred): JSON dict keyed by branch name.
+        We extract the array for the target branch.
+      - REQUIRED_CHECKS (legacy): newline-separated list of context names.
+    """
+    found_json: list[str] = []
+    found_legacy: list[str] = []
    jobs = audit_doc.get("jobs", {})
    if not isinstance(jobs, dict):
        sys.stderr.write(f"::warning::{AUDIT_WORKFLOW_PATH} has no jobs: mapping\n")
@@ -268,27 +277,67 @@ def required_checks_env(audit_doc: dict) -> set[str]:
            if not isinstance(step, dict):
                continue
            step_env = step.get("env") or {}
-            if isinstance(step_env, dict) and "REQUIRED_CHECKS" in step_env:
-                v = step_env["REQUIRED_CHECKS"]
-                if isinstance(v, str):
-                    found.append(v)
-    if not found:
-        sys.stderr.write(
-            f"::error::REQUIRED_CHECKS env not found in any step of "
-            f"{AUDIT_WORKFLOW_PATH}\n"
-        )
-        sys.exit(3)
-    if len(found) > 1:
-        # Defensive: refuse to guess which one is canonical.
-        sys.stderr.write(
-            f"::error::REQUIRED_CHECKS env present in {len(found)} steps; ambiguous\n"
-        )
-        sys.exit(3)
-    raw = found[0]
-    # YAML block-scalars (`|`) leave a trailing newline + blanks; trim
-    # consistently with audit-force-merge.sh's parser so both sides
-    # produce identical sets.
-    return {line.strip() for line in raw.splitlines() if line.strip()}
+            if isinstance(step_env, dict):
+                if "REQUIRED_CHECKS_JSON" in step_env:
+                    v = step_env["REQUIRED_CHECKS_JSON"]
+                    if isinstance(v, str):
+                        found_json.append(v)
+                if "REQUIRED_CHECKS" in step_env:
+                    v = step_env["REQUIRED_CHECKS"]
+                    if isinstance(v, str):
+                        found_legacy.append(v)
+
+    # JSON variant takes precedence.
+    if found_json:
+        if len(found_json) > 1:
+            sys.stderr.write(
+                f"::error::REQUIRED_CHECKS_JSON env present in {len(found_json)} steps; ambiguous\n"
+            )
+            sys.exit(3)
+        try:
+            parsed = json.loads(found_json[0])
+        except json.JSONDecodeError as e:
+            sys.stderr.write(
+                f"::error::REQUIRED_CHECKS_JSON is not valid JSON: {e}\n"
+            )
+            sys.exit(3)
+        if not isinstance(parsed, dict):
+            sys.stderr.write(
+                f"::error::REQUIRED_CHECKS_JSON parsed to {type(parsed).__name__}, expected dict\n"
+            )
+            sys.exit(3)
+        branch_checks = parsed.get(branch)
+        if branch_checks is None:
+            sys.stderr.write(
+                f"::error::REQUIRED_CHECKS_JSON has no entry for branch '{branch}'\n"
+            )
+            sys.exit(3)
+        if not isinstance(branch_checks, list):
+            sys.stderr.write(
+                f"::error::REQUIRED_CHECKS_JSON['{branch}'] is {type(branch_checks).__name__}, expected list\n"
+            )
+            sys.exit(3)
+        return {str(item).strip() for item in branch_checks if str(item).strip()}
+
+    # Legacy variant fallback.
+    if found_legacy:
+        if len(found_legacy) > 1:
+            # Defensive: refuse to guess which one is canonical.
+            sys.stderr.write(
+                f"::error::REQUIRED_CHECKS env present in {len(found_legacy)} steps; ambiguous\n"
+            )
+            sys.exit(3)
+        raw = found_legacy[0]
+        # YAML block-scalars (`|`) leave a trailing newline + blanks; trim
+        # consistently with audit-force-merge.sh's parser so both sides
+        # produce identical sets.
+        return {line.strip() for line in raw.splitlines() if line.strip()}
+
+    sys.stderr.write(
+        f"::error::Neither REQUIRED_CHECKS_JSON nor REQUIRED_CHECKS env found in any step of "
+        f"{AUDIT_WORKFLOW_PATH}\n"
+    )
+    sys.exit(3)


 # --------------------------------------------------------------------------
@@ -330,7 +379,7 @@ def detect_drift(branch: str) -> tuple[list[str], dict]:
    jobs = ci_job_names(ci_doc)
    jobs_all = ci_jobs_all(ci_doc)
    needs = sentinel_needs(ci_doc)
-    env_set = required_checks_env(audit_doc)
+    env_set = required_checks_env(audit_doc, branch)

    # Protection
    # api() raises ApiError on non-2xx. Transient 5xx should fail loud.
@@ -524,7 +573,7 @@ def render_body(branch: str, findings: list[str], debug: dict) -> str:
            "- **F2**: rename the protection context to match an emitter, "
            "or remove it from `status_check_contexts` "
            "(PATCH `/api/v1/repos/{owner}/{repo}/branch_protections/{branch}`).",
-            "- **F3a / F3b**: bring `REQUIRED_CHECKS` env in "
+            "- **F3a / F3b**: bring `REQUIRED_CHECKS_JSON` (or `REQUIRED_CHECKS` legacy) env in "
            "`.gitea/workflows/audit-force-merge.yml` into set-equality with "
            "`status_check_contexts` (single PR, both files).",
            "",
@@ -26,6 +26,10 @@ PROFILES: dict[str, dict[str, str]] = {
        "handlers": (
            r"^workspace-server/internal/handlers/"
            r"|^workspace-server/internal/wsauth/"
+            # #2149: the scheduler real-PG integration tests run in this same
+            # workflow (they reuse its migrated Postgres), so changes to the
+            # scheduler package must trigger the job too.
+            r"|^workspace-server/internal/scheduler/"
            r"|^workspace-server/migrations/"
            r"|^\.gitea/workflows/handlers-postgres-integration\.yml$"
        ),
@@ -174,3 +178,4 @@ def main(argv: list[str]) -> int:

 if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))
+
@@ -466,12 +466,40 @@ def fetch_log(target_url: str) -> str | None:

 def grep_fail_markers(log_text: str) -> list[str]:
    """Return up to 5 sample matching lines for any FAIL_PATTERNS hit.
-    Empty list = clean log."""
+    Empty list = clean log.
+
+    Heuristic: skip lines where the marker appears inside script source
+    (e.g. ``echo "::error::..."`` in a ``::group::Run`` block) rather
+    than actual execution output. The Gitea Actions log prints the raw
+    script before executing it; ``echo "::error::"`` lines in that
+    display are false positives.
+    """
    matches: list[str] = []
+    in_run_group = False
+    group_depth = 0
    for line in log_text.splitlines():
+        stripped = line.strip()
+        # Track Gitea Actions group markers so we can skip the
+        # ``::group::Run`` script-source display blocks.
+        if stripped.startswith("::group::Run"):
+            in_run_group = True
+            group_depth = 1
+            continue
+        if stripped == "::endgroup::":
+            if in_run_group:
+                in_run_group = False
+                group_depth = 0
+            continue
+        if in_run_group:
+            continue
        for pat in FAIL_PATTERNS:
            if pat in line:
-                # Truncate to keep error output bounded.
+                # Additional false-positive guard: ``echo "::error::"``
+                # is script source, not a runtime error emission.
+                if pat == "::error::":
+                    prefix = line[: line.index(pat)].strip()
+                    if prefix.endswith('echo') or prefix.endswith("echo '") or prefix.endswith('echo "'):
+                        break
                matches.append(line.strip()[:240])
                break
        if len(matches) >= 5:
@@ -364,6 +364,71 @@ def _api_json_optional(url: str, token: str) -> tuple[int, dict | None]:
        return exc.code, None


+def current_branch_head(env: dict[str, str]) -> str | None:
+    """Return the SHA at the tip of the deploy branch (main) per Gitea, or None.
+
+    Used to detect a *superseded* deploy job (see `superseded_by`). Fail-safe:
+    any read error / missing token returns None so the caller treats the job as
+    NOT superseded and the strict /buildinfo verify still runs. We never let an
+    unreadable head silently green a deploy.
+    """
+
+    token = env.get("GITEA_TOKEN", "").strip()
+    if not token:
+        return None
+    host = env.get("GITEA_HOST", "git.moleculesai.app")
+    repo = env.get("GITHUB_REPOSITORY", "molecule-ai/molecule-core")
+    # Deploy lane is on: push:main; the branch is always main here, but read it
+    # from the ref name when present so a future branch rename doesn't break us.
+    branch = env.get("GITHUB_REF_NAME", "").strip() or "main"
+    url = f"https://{host}/api/v1/repos/{repo}/branches/{quote(branch, safe='')}"
+    status, body = _api_json_optional(url, token)
+    if status != 200 or not isinstance(body, dict):
+        return None
+    commit = body.get("commit")
+    if isinstance(commit, dict):
+        head = commit.get("id") or commit.get("sha")
+        if isinstance(head, str) and head.strip():
+            return head.strip()
+    return None
+
+
+def superseded_by(env: dict[str, str]) -> str | None:
+    """Return the newer head SHA if THIS deploy job has been superseded, else None.
+
+    This workflow runs with no `concurrency:` (intentional — Gitea 1.22.6 cancels
+    queued runs, which is unacceptable for a prod deploy). When two main pushes
+    land close together, BOTH deploy-production jobs run. The newer push rolls the
+    fleet forward first; the OLDER job's strict /buildinfo verify then sees tenants
+    on the NEWER SHA and false-reds with "$slug is stale" — even though the fleet
+    is AHEAD, not behind. Git SHAs aren't ordered, so the verify can't tell ahead
+    from behind on its own (and /buildinfo exposes only git_sha, no build time).
+
+    Resolve it at the source of truth for ordering — the branch ref: if main's
+    current head is a DIFFERENT SHA than the one this job is deploying, a newer
+    commit has landed and this job is superseded; the newest job's verify is the
+    authoritative one. We return that head SHA so the caller can log it and exit
+    success early, skipping the strict-equality verify for this stale job.
+
+    Fail-safe: returns None (NOT superseded) when the head can't be read or equals
+    our SHA, so a genuinely-behind tenant under the LATEST deploy job still fails
+    the strict verify loudly. This never suppresses a real-stale signal — it only
+    excuses a job that is no longer the latest from asserting exact equality.
+    """
+
+    sha = env.get("GITHUB_SHA", "").strip()
+    if not sha:
+        return None
+    head = current_branch_head(env)
+    if not head:
+        return None
+    # SHA lengths can differ (short vs full); compare on the shorter prefix.
+    n = min(len(head), len(sha))
+    if head[:n].lower() == sha[:n].lower():
+        return None
+    return head
+
+
 def live_disable_flag(env: dict[str, str]) -> str:
    """Return a live disable value from Gitea variables when readable.

@@ -442,6 +507,14 @@ def main() -> int:
    sub.add_parser("plan", help="print production deploy plan as JSON")
    sub.add_parser("assert-enabled", help="fail if production deploy is currently disabled")
    sub.add_parser("wait-ci", help="block until required CI context is green")
+    sub.add_parser(
+        "check-superseded",
+        help=(
+            "exit 0 if a newer commit has landed on the deploy branch (this job "
+            "is superseded; prints the newer head SHA), exit 10 if this job is "
+            "still the latest"
+        ),
+    )
    rollout_parser = sub.add_parser("rollout", help="execute canary-first scoped production rollout")
    rollout_parser.add_argument("--plan", required=True, help="path to prod-auto-deploy plan JSON")
    rollout_parser.add_argument("--response", required=True, help="path to write aggregate response JSON")
@@ -457,6 +530,16 @@ def main() -> int:
        if args.command == "wait-ci":
            wait_for_ci_context(dict(os.environ))
            return 0
+        if args.command == "check-superseded":
+            newer = superseded_by(dict(os.environ))
+            if newer:
+                print(newer)
+                return 0
+            # Exit 10 (not 0, not 1): "this job is still the latest". The
+            # workflow treats only exit 0 as superseded; 10 means proceed to
+            # the strict verify. A non-zero code here is informational, not a
+            # failure — the workflow step swallows it.
+            return 10
        if args.command == "rollout":
            rollout_from_plan_file(args.plan, args.response, dict(os.environ))
            return 0
@@ -1228,10 +1228,13 @@ def main(argv: list[str] | None = None) -> int:
                )

        na_desc = ", ".join(sorted(na_descs)) if na_descs else "(none)"
-        na_status_state = "success" if na_descs else "pending"
+        # internal#818: na-declarations is an informational context, not a merge
+        # gate. An empty declaration list is a terminal success state — pending
+        # here poisons the PR combined status.
+        na_status_state = "success"
        # review-check.sh reads the description to discover which gates are N/A.
        # Include the gate names so it can grep for them.
-        na_description = f"N/A: {na_desc}" if na_descs else "N/A: (none)"
+        na_description = f"N/A: {na_desc}"

        if not args.dry_run:
            client.post_status(
@@ -114,6 +114,19 @@ if [ -z "$WHOAMI" ]; then
 fi
 echo "::notice::token resolves to user: $WHOAMI"

+# 0.5 Read PR head SHA so we can reject stale approvals after head moves
+# (internal#816). Reviews carry the commit_id they were submitted against.
+HEAD_SHA=$(curl -sS -H "$AUTH" "${API}/repos/${OWNER}/${NAME}/pulls/${PR_NUMBER}" | jq -r '.head.sha // ""') || true
+if [ -z "$HEAD_SHA" ]; then
+  echo "::error::Failed to fetch PR head SHA — token may be invalid."
+  if [ "${SOP_FAIL_OPEN:-}" = "1" ]; then
+    echo "::warning::SOP_FAIL_OPEN=1 — exiting 0 so CI does not block."
+    exit 0
+  fi
+  exit 1
+fi
+debug "pr-head-sha=$HEAD_SHA"
+
 # 1. Read tier label. || true ensures set -euo pipefail does not abort the
 # script if curl or jq fails (e.g. 401 from empty token).
 LABELS=$(curl -sS -H "$AUTH" "${API}/repos/${OWNER}/${NAME}/issues/${PR_NUMBER}/labels" | jq -r '.[].name') || true
@@ -265,7 +278,7 @@ if [ $_REVIEWS_EXIT -ne 0 ] || [ -z "$REVIEWS" ]; then
  fi
  exit 1
 fi
-APPROVERS=$(echo "$REVIEWS" | jq -r '[.[] | select(.state=="APPROVED") | .user.login] | unique | .[]') || true
+APPROVERS=$(echo "$REVIEWS" | jq -r --arg head_sha "$HEAD_SHA" '[.[] | select(.state=="APPROVED" and .commit_id == $head_sha) | .user.login] | unique | .[]') || true
 if [ -z "$APPROVERS" ]; then
  echo "::error::No approving reviews on this PR. Set SOP_DEBUG=1 and re-run for diagnostics."
  exit 1
@@ -1,4 +1,5 @@
 import importlib.util
+import json
 import sys
 from pathlib import Path
 from unittest.mock import patch
@@ -36,6 +37,76 @@ def _make_audit_doc(required_checks: list[str]) -> dict:
    }


+def _make_audit_doc_json(required_checks_json: dict) -> dict:
+    return {
+        "jobs": {
+            "audit": {
+                "steps": [
+                    {"env": {"REQUIRED_CHECKS_JSON": json.dumps(required_checks_json)}}
+                ]
+            }
+        }
+    }
+
+
+# ---------------------------------------------------------------------------
+# required_checks_env — dual-variant parsing
+# ---------------------------------------------------------------------------
+
+def test_required_checks_env_prefers_json_over_legacy():
+    doc = {
+        "jobs": {
+            "audit": {
+                "steps": [
+                    {
+                        "env": {
+                            "REQUIRED_CHECKS_JSON": json.dumps(
+                                {"main": ["ctx-a"], "staging": ["ctx-b"]}
+                            ),
+                            "REQUIRED_CHECKS": "ctx-legacy\nctx-old",
+                        }
+                    }
+                ]
+            }
+        }
+    }
+    assert drift.required_checks_env(doc, "main") == {"ctx-a"}
+    assert drift.required_checks_env(doc, "staging") == {"ctx-b"}
+
+
+def test_required_checks_env_falls_back_to_legacy():
+    doc = _make_audit_doc(["legacy-ctx"])
+    assert drift.required_checks_env(doc, "main") == {"legacy-ctx"}
+
+
+def test_required_checks_env_json_missing_branch_fails():
+    doc = _make_audit_doc_json({"staging": ["ctx-b"]})
+    try:
+        drift.required_checks_env(doc, "main")
+    except SystemExit as exc:
+        assert exc.code == 3
+    else:
+        raise AssertionError("expected SystemExit(3)")
+
+
+def test_required_checks_env_json_malformed_fails():
+    doc = {
+        "jobs": {
+            "audit": {
+                "steps": [
+                    {"env": {"REQUIRED_CHECKS_JSON": "not-json"}}
+                ]
+            }
+        }
+    }
+    try:
+        drift.required_checks_env(doc, "main")
+    except SystemExit as exc:
+        assert exc.code == 3
+    else:
+        raise AssertionError("expected SystemExit(3)")
+
+
 # ---------------------------------------------------------------------------
 # sentinel_needs
 # ---------------------------------------------------------------------------
@@ -0,0 +1,244 @@
+"""Live-fire regression test for #2159 — gate auto-fire runtime verification.
+
+Static tests (test_gate_review_auto_fire.py) validate that the workflow YAML
+is structurally correct. This test validates the *runtime* path: submitting an
+APPROVED review to a PR whose head contains the current gate workflows causes
+Gitea Actions to queue the qa-review + security-review workflows and POST the
+branch-protection-required (pull_request_target) contexts within a reasonable
+window.
+
+Skipped when Gitea API credentials are not available. Intended for:
+  - manual developer verification
+  - CI jobs provisioned with a service-account token
+
+Environment:
+  GITEA_HOST            — default: git.moleculesai.app
+  GITEA_TOKEN           — token with read:repository + write:issues (for review POST)
+  REPO                  — default: molecule-ai/molecule-core
+  LIVEFIRE_PR_NUMBER    — optional; if omitted the test tries to find a
+                          suitable open PR automatically, or skips.
+  LIVEFIRE_TIMEOUT_SEC  — default: 120
+"""
+
+import base64
+import json
+import os
+import re
+import time
+import urllib.error
+import urllib.request
+from pathlib import Path
+
+import pytest
+
+import yaml
+
+GITEA_HOST = os.environ.get("GITEA_HOST", "git.moleculesai.app")
+GITEA_TOKEN = os.environ.get("GITEA_TOKEN", "")
+REPO = os.environ.get("REPO", "molecule-ai/molecule-core")
+LIVEFIRE_PR_NUMBER = os.environ.get("LIVEFIRE_PR_NUMBER", "")
+LIVEFIRE_TIMEOUT_SEC = int(os.environ.get("LIVEFIRE_TIMEOUT_SEC", "120"))
+
+REQUIRED_CONTEXTS = [
+    "qa-review / approved (pull_request_target)",
+    "security-review / approved (pull_request_target)",
+]
+
+skip_no_token = pytest.mark.skipif(
+    not GITEA_TOKEN,
+    reason="GITEA_TOKEN not set — live-fire test requires API credentials",
+)
+
+
+def _api(method: str, path: str, body: dict | None = None) -> tuple[int, dict]:
+    url = f"https://{GITEA_HOST}/api/v1{path}"
+    headers = {
+        "Authorization": f"token {GITEA_TOKEN}",
+        "Content-Type": "application/json",
+    }
+    data = json.dumps(body).encode() if body else None
+    req = urllib.request.Request(url, data=data, headers=headers, method=method)
+    try:
+        with urllib.request.urlopen(req, timeout=30) as resp:
+            raw = resp.read()
+            code = resp.status
+    except urllib.error.HTTPError as exc:
+        raw = exc.read()
+        code = exc.code
+    payload = json.loads(raw) if raw else {}
+    return code, payload
+
+
+def _get_pr(number: int) -> dict:
+    code, pr = _api("GET", f"/repos/{REPO}/pulls/{number}")
+    if code != 200:
+        pytest.fail(f"GET /pulls/{number} returned HTTP {code}: {pr}")
+    return pr
+
+
+def _list_open_prs() -> list[dict]:
+    code, prs = _api("GET", f"/repos/{REPO}/pulls?state=open&limit=50")
+    if code != 200:
+        pytest.fail(f"GET /pulls?state=open returned HTTP {code}: {prs}")
+    return prs
+
+
+def _pr_has_trigger_in_head(pr: dict) -> bool:
+    """Return True if the PR head contains pull_request_review in both workflows."""
+    head_sha = pr["head"]["sha"]
+    for wf_name in ("qa-review.yml", "security-review.yml"):
+        path = f"/repos/{REPO}/contents/.gitea/workflows/{wf_name}?ref={head_sha}"
+        code, payload = _api("GET", path)
+        if code != 200:
+            return False
+        raw = base64.b64decode(payload.get("content", "")).decode("utf-8")
+        wf = yaml.safe_load(raw)
+        on = wf.get(True) or wf.get("on") or {}
+        if isinstance(on, str):
+            if on != "pull_request_review":
+                return False
+        elif "pull_request_review" not in on:
+            return False
+    return True
+
+
+def _find_suitable_pr() -> dict:
+    if LIVEFIRE_PR_NUMBER:
+        pr = _get_pr(int(LIVEFIRE_PR_NUMBER))
+        if pr.get("state") != "open":
+            pytest.skip(f"PR {LIVEFIRE_PR_NUMBER} is not open")
+        return pr
+
+    prs = _list_open_prs()
+    for pr in prs:
+        if _pr_has_trigger_in_head(pr):
+            return pr
+    pytest.skip("No open PR found whose head contains the pull_request_review trigger")
+
+
+def _submit_approved_review(pr_number: int) -> dict:
+    code, review = _api(
+        "POST",
+        f"/repos/{REPO}/pulls/{pr_number}/reviews",
+        {"body": "Live-fire test APPROVED review", "event": "APPROVED"},
+    )
+    # 200 = created, 422 = review already exists (idempotent enough for our purposes)
+    if code not in (200, 201, 422):
+        pytest.fail(f"POST /pulls/{pr_number}/reviews returned HTTP {code}")
+    return review
+
+
+def _get_status_snapshot(sha: str) -> dict[str, dict]:
+    """Return mapping context -> {id, updated_at, target_url} for required contexts."""
+    code, statuses = _api("GET", f"/repos/{REPO}/statuses/{sha}?limit=100")
+    if code != 200:
+        return {}
+    result: dict[str, dict] = {}
+    for st in statuses:
+        ctx = st.get("context", "")
+        if ctx in REQUIRED_CONTEXTS:
+            result[ctx] = {
+                "id": st.get("id"),
+                "updated_at": st.get("updated_at", st.get("created_at", "")),
+                "target_url": st.get("target_url"),
+            }
+    return result
+
+
+def _extract_run_id(target_url: str | None) -> str | None:
+    """Extract the Actions run_id from a status target_url."""
+    if not target_url:
+        return None
+    m = re.search(r"/actions/runs/(\d+)", target_url)
+    return m.group(1) if m else None
+
+
+def _poll_fresh_statuses(
+    sha: str,
+    prior_snapshot: dict[str, dict],
+    timeout_sec: int = LIVEFIRE_TIMEOUT_SEC,
+) -> dict[str, dict]:
+    """Poll until required contexts appear fresh (newer timestamp, id, or run)."""
+    deadline = time.monotonic() + timeout_sec
+    found: dict[str, dict] = {}
+    while time.monotonic() < deadline:
+        code, statuses = _api("GET", f"/repos/{REPO}/statuses/{sha}?limit=100")
+        if code == 200:
+            for st in statuses:
+                ctx = st.get("context", "")
+                if ctx in REQUIRED_CONTEXTS:
+                    updated_at = st.get("updated_at", st.get("created_at", ""))
+                    status_id = st.get("id")
+                    target_url = st.get("target_url")
+                    prior = prior_snapshot.get(ctx, {})
+                    # Fresh if timestamp changed, id changed, or target_url changed.
+                    is_fresh = (
+                        ctx not in prior_snapshot
+                        or updated_at != prior.get("updated_at", "")
+                        or status_id != prior.get("id")
+                        or target_url != prior.get("target_url")
+                    )
+                    if is_fresh:
+                        found[ctx] = {
+                            "state": st.get("state", st.get("status", "")),
+                            "updated_at": updated_at,
+                            "id": status_id,
+                            "target_url": target_url,
+                        }
+        if all(ctx in found for ctx in REQUIRED_CONTEXTS):
+            return found
+        time.sleep(5)
+    return found
+
+
+@skip_no_token
+class TestGateAutoFireLive:
+    def test_auto_fire_posts_required_contexts(self):
+        """Submit APPROVED review; assert BP-required contexts appear fresh within timeout."""
+        pr = _find_suitable_pr()
+        pr_number = pr["number"]
+        head_sha = pr["head"]["sha"]
+
+        # Capture pre-existing status snapshot so we can prove FRESH contexts
+        # were posted after the review submission (not stale from a prior run).
+        prior_snapshot = _get_status_snapshot(head_sha)
+        prior_run_ids = {
+            _extract_run_id(s["target_url"])
+            for s in prior_snapshot.values()
+            if _extract_run_id(s["target_url"])
+        }
+
+        review = _submit_approved_review(pr_number)
+
+        found = _poll_fresh_statuses(head_sha, prior_snapshot)
+
+        missing = [ctx for ctx in REQUIRED_CONTEXTS if ctx not in found]
+        if missing:
+            pytest.fail(
+                f"After {LIVEFIRE_TIMEOUT_SEC}s, fresh contexts still missing: {missing}. "
+                f"Found: {found}. Prior snapshot: {prior_snapshot}. "
+                f"PR #{pr_number} head={head_sha}. "
+                f"This indicates the pull_request_review trigger did not fire at runtime."
+            )
+
+        # The contexts appeared fresh — that's the proof of auto-fire.
+        # We do NOT assert success vs failure; the evaluator decides that.
+        # The point of #2159 is that the workflows QUEUE and POST at all.
+        for ctx, info in found.items():
+            state = info["state"]
+            assert state in ("pending", "success", "failure"), (
+                f"Unexpected state {state!r} for {ctx}"
+            )
+
+            # CR2 Finding 1: prove a NEW workflow run was triggered, not just
+            # an in-place status update. Gitea 1.22.6 lacks REST /actions/runs/*
+            # endpoints, so we use the run_id embedded in the status target_url
+            # as a proxy for distinct run_id.
+            run_id = _extract_run_id(info.get("target_url"))
+            if run_id and run_id in prior_run_ids:
+                pytest.fail(
+                    f"Context {ctx!r} has target_url run_id {run_id} which existed "
+                    f"BEFORE the review was submitted. This means the status was "
+                    f"updated in-place by an existing run, not by a new workflow "
+                    f"run triggered from the pull_request_review event."
+                )
@@ -0,0 +1,145 @@
+"""Stale-head diagnostic test for #2159.
+
+Deterministically reports whether a PR's HEAD contains the pull_request_review
+trigger in qa-review.yml and security-review.yml. If the trigger is absent,
+auto-fire on APPROVED review is impossible for that PR.
+
+This is used as a self-diagnostic for future stale-PR situations (PRs opened
+before #2157 merged, or branches cut from old bases).
+
+Environment:
+  GITEA_HOST  — default: git.moleculesai.app
+  GITEA_TOKEN — token with read:repository scope (optional; falls back to local files)
+  REPO        — default: molecule-ai/molecule-core
+  PR_NUMBER   — required when running against a real PR
+"""
+
+import base64
+import json
+import os
+import urllib.error
+import urllib.request
+from pathlib import Path
+
+import pytest
+
+import yaml
+
+GITEA_HOST = os.environ.get("GITEA_HOST", "git.moleculesai.app")
+GITEA_TOKEN = os.environ.get("GITEA_TOKEN", "")
+REPO = os.environ.get("REPO", "molecule-ai/molecule-core")
+PR_NUMBER = os.environ.get("PR_NUMBER", "")
+
+ROOT = Path(__file__).resolve().parents[2]
+
+
+def _api(method: str, path: str) -> tuple[int, dict]:
+    url = f"https://{GITEA_HOST}/api/v1{path}"
+    headers = {"Authorization": f"token {GITEA_TOKEN}"}
+    req = urllib.request.Request(url, headers=headers, method=method)
+    try:
+        with urllib.request.urlopen(req, timeout=30) as resp:
+            return resp.status, json.loads(resp.read())
+    except urllib.error.HTTPError as exc:
+        body = exc.read()
+        return exc.code, json.loads(body) if body else {}
+
+
+def _fetch_workflow_from_ref(workflow_name: str, ref: str) -> dict:
+    path = f"/repos/{REPO}/contents/.gitea/workflows/{workflow_name}?ref={ref}"
+    code, payload = _api("GET", path)
+    if code != 200:
+        pytest.fail(
+            f"GET {path} returned HTTP {code}: {payload}. "
+            f"Cannot determine whether PR head contains the trigger."
+        )
+    raw = base64.b64decode(payload.get("content", "")).decode("utf-8")
+    return yaml.safe_load(raw)
+
+
+def _fetch_workflow_local(workflow_name: str) -> dict:
+    p = ROOT / "workflows" / workflow_name
+    if not p.exists():
+        pytest.fail(f"Local workflow file not found: {p}")
+    return yaml.safe_load(p.read_text())
+
+
+def _has_pull_request_review_trigger(wf: dict) -> bool:
+    on = wf.get(True) or wf.get("on") or {}
+    if isinstance(on, list):
+        return "pull_request_review" in on
+    if isinstance(on, dict):
+        return "pull_request_review" in on
+    if isinstance(on, str):
+        return on == "pull_request_review"
+    return False
+
+
+def _diagnose_pr(pr_number: int) -> dict[str, bool]:
+    code, pr = _api("GET", f"/repos/{REPO}/pulls/{pr_number}")
+    if code != 200:
+        pytest.fail(f"GET /pulls/{pr_number} returned HTTP {code}: {pr}")
+
+    head_ref = pr["head"]["ref"]
+    head_sha = pr["head"]["sha"]
+
+    results: dict[str, bool] = {}
+    for wf_name in ("qa-review.yml", "security-review.yml"):
+        wf = _fetch_workflow_from_ref(wf_name, head_sha)
+        results[wf_name] = _has_pull_request_review_trigger(wf)
+
+    return {
+        "pr_number": pr_number,
+        "head_ref": head_ref,
+        "head_sha": head_sha,
+        "triggers": results,
+        "auto_fire_possible": all(results.values()),
+    }
+
+
+def _diagnose_local() -> dict[str, bool]:
+    results: dict[str, bool] = {}
+    for wf_name in ("qa-review.yml", "security-review.yml"):
+        wf = _fetch_workflow_local(wf_name)
+        results[wf_name] = _has_pull_request_review_trigger(wf)
+    return {
+        "pr_number": None,
+        "head_ref": "local-checkout",
+        "head_sha": None,
+        "triggers": results,
+        "auto_fire_possible": all(results.values()),
+    }
+
+
+class TestStaleHeadDiagnostic:
+    """Test deterministically reports 'auto-fire impossible for this PR' when
+    the PR head lacks the pull_request_review trigger.
+    """
+
+    def test_local_checkout_has_pull_request_review_trigger(self):
+        """Local files (the ones in this checkout) must contain the trigger.
+
+        This is the baseline: if the checkout itself is stale, every PR cut
+        from it will also be stale.
+        """
+        diag = _diagnose_local()
+        missing = [n for n, ok in diag["triggers"].items() if not ok]
+        if missing:
+            pytest.fail(
+                f"Local checkout is missing pull_request_review trigger in: {missing}. "
+                f"This branch cannot produce PRs that auto-fire."
+            )
+
+    @pytest.mark.skipif(not GITEA_TOKEN, reason="GITEA_TOKEN not set")
+    @pytest.mark.skipif(not PR_NUMBER, reason="PR_NUMBER not set")
+    def test_pr_head_has_pull_request_review_trigger(self):
+        """When PR_NUMBER is given, assert the PR head contains the trigger."""
+        diag = _diagnose_pr(int(PR_NUMBER))
+        if not diag["auto_fire_possible"]:
+            missing = [n for n, ok in diag["triggers"].items() if not ok]
+            pytest.fail(
+                f"Auto-fire impossible for PR #{diag['pr_number']}. "
+                f"Head ref={diag['head_ref']} sha={diag['head_sha']}. "
+                f"Missing trigger in: {missing}. "
+                f"This PR needs /qa-recheck + /security-recheck fallback, or a rebase onto current main."
+            )
@@ -486,3 +486,129 @@ def test_scoped_rollout_dry_run_does_not_assert_coverage():
        sleep=lambda _s: None,
    )
    assert aggregate["ok"] is True
+
+
+# --- Superseded-deploy guard (false-stale fix) -----------------------------
+#
+# Scenario this fixes: no `concurrency:` on the prod-deploy workflow means two
+# close main pushes run BOTH deploy-production jobs. eb31bcf (Fix A) and 286338
+# (Fix C) merge back-to-back; the 286338 job rolls the fleet to staging-2863380
+# first; the OLDER eb31bcf job's strict verify then sees tenants on 2863380 and
+# false-reds "stale" though the fleet is AHEAD. superseded_by detects that main's
+# head is no longer eb31bcf and lets the older job succeed without weakening the
+# behind-tenant signal for whichever job IS the latest.
+
+
+def test_superseded_by_returns_newer_head_when_main_moved_ahead(monkeypatch):
+    # eb31bcf job: main head is now 2863380 -> superseded, return the newer head.
+    monkeypatch.setattr(prod, "current_branch_head", lambda _env: "2863380fullhash")
+    newer = prod.superseded_by({"GITHUB_SHA": "eb31bcffullhash"})
+    assert newer == "2863380fullhash"
+
+
+def test_superseded_by_none_when_this_job_is_still_head(monkeypatch):
+    # 2863380 job (the latest): head == our SHA -> NOT superseded -> strict verify
+    # runs, so a genuinely-behind tenant still fails loudly.
+    monkeypatch.setattr(prod, "current_branch_head", lambda _env: "2863380fullhash")
+    assert prod.superseded_by({"GITHUB_SHA": "2863380fullhash"}) is None
+
+
+def test_superseded_by_matches_on_short_vs_full_sha_prefix(monkeypatch):
+    # GITHUB_SHA is full; Gitea may return a different-length id. Equal prefixes
+    # must NOT count as superseded (avoid false-skipping the real latest job).
+    monkeypatch.setattr(prod, "current_branch_head", lambda _env: "2863380")
+    assert prod.superseded_by({"GITHUB_SHA": "2863380fullhash"}) is None
+    monkeypatch.setattr(prod, "current_branch_head", lambda _env: "2863380FULLHASH")
+    assert prod.superseded_by({"GITHUB_SHA": "2863380fullhash"}) is None
+
+
+def test_superseded_by_fail_safe_returns_none_when_head_unreadable(monkeypatch):
+    # Fail-safe: unreadable head (no token / API error) must NOT be treated as
+    # superseded, so the strict verify still runs and never silently greens.
+    monkeypatch.setattr(prod, "current_branch_head", lambda _env: None)
+    assert prod.superseded_by({"GITHUB_SHA": "eb31bcffullhash"}) is None
+
+
+def test_superseded_by_none_without_github_sha(monkeypatch):
+    monkeypatch.setattr(prod, "current_branch_head", lambda _env: "2863380fullhash")
+    assert prod.superseded_by({}) is None
+
+
+def test_current_branch_head_parses_gitea_branch_commit_id(monkeypatch):
+    captured = {}
+
+    def fake_optional(url, _token):
+        captured["url"] = url
+        return 200, {"name": "main", "commit": {"id": "2863380fullhash"}}
+
+    monkeypatch.setattr(prod, "_api_json_optional", fake_optional)
+    head = prod.current_branch_head(
+        {"GITEA_TOKEN": "secret", "GITHUB_REPOSITORY": "molecule-ai/molecule-core"}
+    )
+    assert head == "2863380fullhash"
+    assert captured["url"].endswith("/repos/molecule-ai/molecule-core/branches/main")
+
+
+def test_current_branch_head_uses_ref_name_branch(monkeypatch):
+    captured = {}
+
+    def fake_optional(url, _token):
+        captured["url"] = url
+        return 200, {"commit": {"sha": "deadbeef"}}
+
+    monkeypatch.setattr(prod, "_api_json_optional", fake_optional)
+    head = prod.current_branch_head(
+        {"GITEA_TOKEN": "secret", "GITHUB_REF_NAME": "release"}
+    )
+    assert head == "deadbeef"
+    assert captured["url"].endswith("/branches/release")
+
+
+def test_current_branch_head_none_without_token():
+    assert prod.current_branch_head({}) is None
+
+
+def test_current_branch_head_none_on_non_200(monkeypatch):
+    monkeypatch.setattr(prod, "_api_json_optional", lambda _u, _t: (500, None))
+    assert prod.current_branch_head({"GITEA_TOKEN": "secret"}) is None
+
+
+# --- #2213: superseded check must fire BEFORE production side effects ----------
+#
+# Real incident shape: two main pushes land ~2 min apart. The OLDER deploy job
+# (GITHUB_SHA=7a72516, target staging-7a72516) started LATE — main head was
+# already 7f25373. The #2194 guard only protected the *verify* step, so the
+# older job still:
+#   1. rolled the canary (hongming) BACKWARD to staging-7a72516 (the #2213 red,
+#      seen as the newer job's verify reading hongming on the old SHA), then
+#   2. promoted :latest backward to the older image,
+# before finally skipping verify. The workflow now calls this same superseded
+# check BEFORE the redeploy + promote steps and gates both off when it fires.
+# These tests pin the contract that check-superseded relies on for the exact
+# incident shape.
+
+
+def test_superseded_by_fires_for_older_job_when_newer_already_head(monkeypatch):
+    # Older job (7a72516) re-checks the head just before rollout and finds the
+    # newer merge (7f25373) already owns main -> superseded -> skip side effects.
+    monkeypatch.setattr(
+        prod, "current_branch_head", lambda _env: "7f25373309eca54a36f08c371ff783c3a47c3f8d"
+    )
+    newer = prod.superseded_by(
+        {"GITHUB_SHA": "7a72516f7e7ba1a710c4f393fef08be8d22e1866"}
+    )
+    assert newer == "7f25373309eca54a36f08c371ff783c3a47c3f8d"
+
+
+def test_superseded_by_none_for_latest_job_so_it_still_rolls(monkeypatch):
+    # The newer job (7f25373) IS the head -> NOT superseded -> it proceeds to
+    # roll the fleet and verify, so a genuinely-behind tenant still fails loud.
+    monkeypatch.setattr(
+        prod, "current_branch_head", lambda _env: "7f25373309eca54a36f08c371ff783c3a47c3f8d"
+    )
+    assert (
+        prod.superseded_by(
+            {"GITHUB_SHA": "7f25373309eca54a36f08c371ff783c3a47c3f8d"}
+        )
+        is None
+    )
@@ -1299,3 +1299,108 @@ class TestGetCIStatus(unittest.TestCase):
        self.assertEqual(
            sop.get_ci_status(client, "o", "r", "sha1"), "unknown"
        )
+
+
+# ---------------------------------------------------------------------------
+# internal#818 — na-declarations status must be terminal success
+# ---------------------------------------------------------------------------
+
+
+class TestNaDeclarationsStatusTerminal(unittest.TestCase):
+    """Regression for internal#818: the na-declarations context is
+    informational, not a merge gate.  An empty N/A declaration list must
+    post `success` (not `pending`) so it does not poison the PR combined
+    status."""
+
+    def _run_with_fake_client(self, fake_client_class):
+        """Swap GiteaClient temporarily and invoke main() with a fake token."""
+        orig_client = sop.GiteaClient
+        orig_token = os.environ.get("GITEA_TOKEN")
+        try:
+            sop.GiteaClient = fake_client_class
+            os.environ["GITEA_TOKEN"] = "fake-token"
+            return sop.main([
+                "--owner", "o", "--repo", "r", "--pr", "1",
+                "--config", CONFIG_PATH,
+                "--gitea-host", "git.example.com",
+            ])
+        finally:
+            sop.GiteaClient = orig_client
+            if orig_token is None:
+                os.environ.pop("GITEA_TOKEN", None)
+            else:
+                os.environ["GITEA_TOKEN"] = orig_token
+
+    def test_empty_na_descriptions_posts_success(self):
+        posted = []
+
+        class FakeClient(sop.GiteaClient):
+            def get_pr(self, owner, repo, pr):
+                return {
+                    "state": "open",
+                    "user": {"login": "alice"},
+                    "head": {"sha": "abc123"},
+                    "labels": [],
+                }
+
+            def get_issue_comments(self, owner, repo, issue, max_comments=None):
+                return []
+
+            def resolve_team_id(self, org, team_name):
+                return None
+
+            def is_team_member(self, team_id, login):
+                return False
+
+            def post_status(self, owner, repo, sha, state, context,
+                            description, target_url=""):
+                posted.append({
+                    "state": state,
+                    "context": context,
+                    "description": description,
+                })
+
+        rc = self._run_with_fake_client(FakeClient)
+        self.assertEqual(rc, 0)
+        na_posts = [p for p in posted if "na-declarations" in p["context"]]
+        self.assertEqual(len(na_posts), 1, f"expected one na-declarations post, got {posted}")
+        self.assertEqual(na_posts[0]["state"], "success")
+        self.assertEqual(na_posts[0]["description"], "N/A: (none)")
+
+    def test_populated_na_descriptions_posts_success(self):
+        posted = []
+
+        class FakeClient(sop.GiteaClient):
+            def get_pr(self, owner, repo, pr):
+                return {
+                    "state": "open",
+                    "user": {"login": "alice"},
+                    "head": {"sha": "abc123"},
+                    "labels": [],
+                }
+
+            def get_issue_comments(self, owner, repo, issue, max_comments=None):
+                return [
+                    {"user": {"login": "bob"}, "body": "/sop-n/a qa-review N/A: docs-only"},
+                ]
+
+            def resolve_team_id(self, org, team_name):
+                return 1
+
+            def is_team_member(self, team_id, login):
+                return True
+
+            def post_status(self, owner, repo, sha, state, context,
+                            description, target_url=""):
+                posted.append({
+                    "state": state,
+                    "context": context,
+                    "description": description,
+                })
+
+        rc = self._run_with_fake_client(FakeClient)
+        self.assertEqual(rc, 0)
+        na_posts = [p for p in posted if "na-declarations" in p["context"]]
+        self.assertEqual(len(na_posts), 1)
+        self.assertEqual(na_posts[0]["state"], "success")
+        self.assertIn("qa-review", na_posts[0]["description"])
@@ -0,0 +1,66 @@
+#!/usr/bin/env bash
+# Regression test for internal#816 — sop-tier-check must ignore APPROVED
+# reviews that were submitted against an old PR head SHA.
+#
+# Bug: the script collected approvers with
+#   jq '[.[] | select(.state=="APPROVED") | .user.login]'
+# without filtering on .commit_id == HEAD_SHA. After a PR head moved,
+# stale approvals looked valid to the tier gate.
+#
+# Fix: the jq filter now includes
+#   select(.state=="APPROVED" and .commit_id == $head_sha)
+# where $head_sha is the current PR head fetched from the API.
+
+set -euo pipefail
+
+# jq may not be on PATH in all environments (e.g. dev containers).
+PATH="/tmp/bin:$PATH"
+command -v jq >/dev/null 2>&1 || { echo "::error::jq required but not found"; exit 1; }
+
+PASS=0
+FAIL=0
+
+assert_eq() {
+  local label="$1"
+  local expected="$2"
+  local got="$3"
+  if [ "$expected" = "$got" ]; then
+    echo "  PASS  $label"
+    PASS=$((PASS + 1))
+  else
+    echo "  FAIL  $label"
+    echo "        expected: <$expected>"
+    echo "        got:      <$got>"
+    FAIL=$((FAIL + 1))
+  fi
+}
+
+# Sample reviews matching the shape from Gitea API
+REVIEWS_JSON='[
+  {"state":"APPROVED","commit_id":"abc123","user":{"login":"bob"}},
+  {"state":"APPROVED","commit_id":"old456","user":{"login":"alice"}},
+  {"state":"COMMENT","commit_id":"abc123","user":{"login":"carol"}},
+  {"state":"APPROVED","commit_id":"abc123","user":{"login":"dave"}},
+  {"state":"REQUEST_CHANGES","commit_id":"abc123","user":{"login":"eve"}}
+]'
+
+echo "test: jq filter keeps only APPROVED on current head"
+GOT=$(echo "$REVIEWS_JSON" | jq -r --arg head_sha "abc123" \
+  '[.[] | select(.state=="APPROVED" and .commit_id == $head_sha) | .user.login] | unique | .[]')
+assert_eq "current-head approvers" "bob dave" "$(echo "$GOT" | tr '\n' ' ' | sed 's/ $//')"
+
+echo "test: jq filter with all-stale reviews yields empty"
+GOT=$(echo "$REVIEWS_JSON" | jq -r --arg head_sha "new789" \
+  '[.[] | select(.state=="APPROVED" and .commit_id == $head_sha) | .user.login] | unique | .[]')
+assert_eq "all-stale yields empty" "" "$GOT"
+
+echo "test: jq filter handles null commit_id gracefully"
+NULL_JSON='[{"state":"APPROVED","commit_id":null,"user":{"login":"mallory"}}]'
+GOT=$(echo "$NULL_JSON" | jq -r --arg head_sha "abc123" \
+  '[.[] | select(.state=="APPROVED" and .commit_id == $head_sha) | .user.login] | unique | .[]')
+assert_eq "null commit_id excluded" "" "$GOT"
+
+echo
+echo "------"
+echo "PASS=$PASS FAIL=$FAIL"
+[ "$FAIL" -eq 0 ]
@@ -96,6 +96,7 @@ env:
  GITHUB_SERVER_URL: https://git.moleculesai.app

 jobs:
+  # bp-exempt: advisory arm64 pilot, non-gating by design (internal#418).
  fast-checks:
    name: fast-checks
    # AND-set: only the Mac arm64 runner advertises macos-self-hosted.
@@ -25,10 +25,9 @@
 #      sufficient for `actions/checkout` against this same repo.
 #
 #   4. Docs — no docs/scripts reference github.com URLs that need swapping.
-#      The canvas-deploy-reminder step writes a `ghcr.io/...` image
-#      reference into the step summary text — that's documentation prose
-#      pointing at the ECR-mirrored canvas image and stays unchanged for
-#      this port (a separate cleanup if ghcr→ECR sweep is in scope).
+#      The canvas-deploy-status step (core#2226, formerly canvas-deploy-reminder)
+#      writes the canvas ordered-deploy status into the step summary; it points
+#      at the ECR canvas image and the publish workflow, no ghcr.io prose.
 #
 # Cross-links:
 #   - RFC: internal#219 (CI/CD hard-gate hardening)
@@ -365,6 +364,14 @@ jobs:
          # check missed. If a refactor weakens the gate to a shape check,
          # this step goes red on every PR.
          bash tests/e2e/test_completion_assert_unit.sh
+          # harden/e2e-staging-saas-failclosed: fail-direction proof for the
+          # E2E_REQUIRE_LIVE fail-closed-on-skip guard in
+          # test_staging_full_saas.sh. Offline (no LLM/network/provisioning):
+          # asserts the guard exits 5 when a live lifecycle did NOT run and
+          # passes when all milestones fired — so a refactor that lets the
+          # staging gate report green without a real provision→online→A2A
+          # cycle goes red on every PR.
+          bash tests/e2e/test_require_live_guard_unit.sh

      - if: ${{ needs.changes.outputs.scripts == 'true' }}
        name: Test ECR promote-tenant-image script (mock-driven, no live infra)
@@ -389,61 +396,61 @@ jobs:

  # mc#959 root-fix (sre)

-  canvas-deploy-reminder:
-    name: Canvas Deploy Reminder
+  canvas-deploy-status:
+    # core#2226: replaces the old advisory "Canvas Deploy Reminder". The canvas
+    # image now has a real ORDERED auto-deploy (publish-canvas-image.yml:
+    # build → push :staging-<sha> → wait green main CI → promote :latest by
+    # digest), and docker-compose pins via CANVAS_IMAGE_TAG. There is no longer
+    # a manual "go run docker compose pull by hand" step to remind operators
+    # about — so this job just records, on a canvas-touching main push, that the
+    # ordered deploy is handling it (and where to watch), instead of prescribing
+    # a manual action that determinism made obsolete.
+    name: Canvas Deploy Status
    runs-on: docker-host
-    # mc#1982 root-fix: added job-level `if:` so ci-required-drift.py's
-    # ci_job_names() detects this as github.ref-gated and skips it from F1.
-    # The step-level exit 0 handles the "not main push" case; the job-level
-    # `if:` makes the gating explicit so the drift script sees it.
-    # Runs on both main and staging pushes; step exits 0 when not applicable.
+    # Job-level `if:` so ci-required-drift.py's ci_job_names() detects this as
+    # github.ref-gated and skips it from the required-context F1 set (mc#1982).
+    # Step-level exit 0 handles the "not a canvas main push" case.
    if: ${{ github.ref == 'refs/heads/main' || github.ref == 'refs/heads/staging' }}
    needs: [changes, canvas-build]
    steps:
-      - name: Write deploy reminder to step summary
+      - name: Record canvas ordered-deploy status
        env:
          COMMIT_SHA: ${{ github.sha }}
          CANVAS_CHANGED: ${{ needs.changes.outputs.canvas }}
          EVENT_NAME: ${{ github.event_name }}
          REF_NAME: ${{ github.ref }}
-          # github.server_url resolves via the workflow-level env override
-          # to the Gitea instance, so the RUN_URL points at the Gitea run
-          # page (not github.com). See feedback_act_runner_github_server_url.
-          RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+          # github.server_url resolves via the workflow-level env override to the
+          # Gitea instance, so RUN_URL points at the Gitea run page (not github.com).
+          RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions
        run: |
          set -euo pipefail
          if [ "$CANVAS_CHANGED" != "true" ] || [ "$EVENT_NAME" != "push" ] || [ "$REF_NAME" != "refs/heads/main" ]; then
-            echo "Canvas deploy reminder not applicable for event=$EVENT_NAME ref=$REF_NAME canvas_changed=$CANVAS_CHANGED."
+            echo "Canvas deploy status not applicable for event=$EVENT_NAME ref=$REF_NAME canvas_changed=$CANVAS_CHANGED."
            exit 0
          fi

          # Write body to a temp file — avoids backtick escaping in shell.
-          cat > /tmp/deploy-reminder.md << 'BODY'
-          ## Canvas build passed — deploy required
+          cat > /tmp/deploy-status.md << 'BODY'
+          ## Canvas ordered deploy in progress — no manual action required

-          The `publish-canvas-image` workflow is now building a fresh Docker image
-          (`ghcr.io/molecule-ai/canvas:latest`) in the background.
+          This canvas-touching main push triggers `publish-canvas-image`, which now
+          runs an ORDERED, CI-gated deploy (core#2226) — the same shape as the
+          platform's deploy-production:

-          Once it completes (~3–5 min), apply on the host machine with:
-          ```bash
-          cd <runner-workspace>
-          git pull origin main
-          docker compose pull canvas && docker compose up -d canvas
-          ```
+          1. Build → push `molecule-ai/canvas:staging-<sha>` + `:staging-latest`.
+          2. Wait for green main CI on this SHA.
+          3. Promote `:latest` to the verified `:staging-<sha>` by digest.

-          If you need to rebuild from local source instead (e.g. testing unreleased
-          changes or a new `NEXT_PUBLIC_*` URL), use:
-          ```bash
-          docker compose build canvas && docker compose up -d canvas
-          ```
+          Tenants/hosts pin via `CANVAS_IMAGE_TAG` (default `latest` = the last
+          CI-green build), so a deploy is reproducible — no hand-run
+          `docker compose pull` needed. Watch the run in the canvas publish workflow.
          BODY
-          printf '\n> Posted automatically by CI · commit `%s` · [build log](%s)\n' \
-            "$COMMIT_SHA" "$RUN_URL" >> /tmp/deploy-reminder.md
+          printf '\n> Posted automatically by CI · commit `%s` · [publish workflow](%s)\n' \
+            "$COMMIT_SHA" "$RUN_URL" >> /tmp/deploy-status.md

-          # Gitea has no commit-comments API; write to GITHUB_STEP_SUMMARY,
-          # which both GitHub Actions and Gitea Actions render as the
-          # workflow run's summary page. (#75 / PR-D)
-          cat /tmp/deploy-reminder.md >> "$GITHUB_STEP_SUMMARY"
+          # Gitea has no commit-comments API; write to GITHUB_STEP_SUMMARY, which
+          # both GitHub and Gitea Actions render as the run's summary page.
+          cat /tmp/deploy-status.md >> "$GITHUB_STEP_SUMMARY"

  # Python Lint & Test — required check, always runs.
  # Runtime Python moved to molecule-ai-workspace-runtime. Keep this context as
@@ -123,8 +123,9 @@ jobs:
    # integration). See internal#512 for the class defect.
    runs-on: docker-host
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
-    # mc#1982: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
-    continue-on-error: true
+    # mc#1982: mask removed. If regressions appear, root-fix the underlying
+    # test — do NOT renew the mask silently.
+    continue-on-error: false
    outputs:
      api: ${{ steps.decide.outputs.api }}
    steps:
@@ -160,8 +161,9 @@ jobs:
    # detect-changes for the full rationale.
    runs-on: docker-host
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
-    # mc#1982: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
-    continue-on-error: true
+    # mc#1982: mask removed. If regressions appear, root-fix the underlying
+    # test — do NOT renew the mask silently.
+    continue-on-error: false
    timeout-minutes: 15
    env:
      # Unique per-run container names so concurrent runs on the host-
@@ -325,19 +327,57 @@ jobs:
          # start-redis steps point at this run's per-run host ports.
          ./platform-server > platform.log 2>&1 &
          echo $! > platform.pid
-      - name: Wait for /health
+      - name: Wait for /health (with migration completion gate)
+        # Issue #2205: 30 one-second probes is insufficient when the migration
+        # chain is still running; /health can flip true before migrations
+        # finish, so subsequent steps that touch the DB fail. Hybrid fix:
+        # bump timeout to 300s AND gate exit on the same workspaces-table
+        # existence check the downstream "Assert migrations applied" uses.
        if: needs.detect-changes.outputs.api == 'true'
        run: |
-          for i in $(seq 1 30); do
+          # Readiness signal: the platform binds /health only AFTER the full
+          # migration chain has been applied on cold start (it prints
+          # "Platform starting on :PORT" at that point). So a 200 from /health
+          # is the real "migrations done + server listening" signal.
+          #
+          # The migration chain grows every release, so a fixed ~30s budget is
+          # brittle by construction (it WILL be exceeded as migrations accrue).
+          # Use a generous wall-clock budget that comfortably exceeds
+          # cold-start + full-migration time, polling fast. This is robust to a
+          # growing chain WITHOUT masking a genuinely dead platform: if the
+          # background platform-server process has exited (e.g. a broken
+          # migration crashed it), we stop and fail loudly at once instead of
+          # waiting out the whole budget.
+          #
+          # Issue #2205: /health can flip true before migrations finish on a
+          # growing chain, so we gate exit on the workspaces-table existence
+          # check the downstream "Assert migrations applied" uses.
+          DEADLINE_SECS=300          # cold-start + full migration chain headroom
+          PLATFORM_PID="$(cat workspace-server/platform.pid 2>/dev/null || true)"
+          start=$(date +%s)
+          while :; do
            if curl -sf "$BASE/health" > /dev/null; then
-              echo "Platform up after ${i}s"
-              exit 0
+              tables=$(docker exec "$PG_CONTAINER" psql -U dev -d molecule -tAc \
+                "SELECT count(*) FROM information_schema.tables WHERE table_schema='public' AND table_name='workspaces'" 2>/dev/null || echo "0")
+              if [ "$tables" = "1" ]; then
+                echo "Platform healthy + migrations applied after $(( $(date +%s) - start ))s"
+                exit 0
+              fi
+            fi
+            # Fast-fail: if the platform process died, /health will never come.
+            if [ -n "$PLATFORM_PID" ] && ! kill -0 "$PLATFORM_PID" 2>/dev/null; then
+              echo "::error::platform-server (pid ${PLATFORM_PID}) exited before /health became reachable — see log below"
+              cat workspace-server/platform.log || true
+              exit 1
+            fi
+            if [ "$(( $(date +%s) - start ))" -ge "$DEADLINE_SECS" ]; then
+              echo "::error::Platform did not become healthy with migrations applied within ${DEADLINE_SECS}s — see log below"
+              cat workspace-server/platform.log || true
+              exit 1
            fi
            sleep 1
          done
-          echo "::error::Platform did not become healthy in 30s"
-          cat workspace-server/platform.log || true
-          exit 1
+
      - name: Assert migrations applied
        if: needs.detect-changes.outputs.api == 'true'
        run: |
@@ -354,6 +394,21 @@ jobs:
      - name: Run E2E API tests
        if: needs.detect-changes.outputs.api == 'true'
        run: bash tests/e2e/test_api.sh
+      - name: Run keyless feature-contract E2E (terminal-diagnose / webhooks / budget / checkpoints / audit / traces / session-search / rescue / llm-billing-mode / resume / hibernate)
+        # Keyless required-lane coverage for feature endpoints that ship without
+        # an LLM key (runtime=external fixture). Each asserts the real HTTP
+        # contract + a meaningful failure mode (401/400/fail-closed) so a
+        # regression goes RED, not silently green. The mock-runtime A2A canned
+        # round-trip is covered by the priority-runtimes `mock` arm, not here.
+        if: needs.detect-changes.outputs.api == 'true'
+        run: bash tests/e2e/test_keyless_feature_contracts_e2e.sh
+      - name: Run secrets-dispatch contract test (keyless SECRETS_JSON branch order)
+        # Previously orphaned (no workflow referenced it). Hermetic unit-style
+        # contract over test_staging_full_saas.sh's LLM-key branch precedence —
+        # needs no platform, no bearer, no network. Guards the 2026-05-03
+        # "wrong key shape wins" incident class.
+        if: needs.detect-changes.outputs.api == 'true'
+        run: bash tests/e2e/test_secrets_dispatch.sh
      - name: Run notify-with-attachments E2E
        if: needs.detect-changes.outputs.api == 'true'
        run: bash tests/e2e/test_notify_attachments_e2e.sh
@@ -113,6 +113,28 @@ jobs:
    runs-on: docker-host
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
    # mc#1982: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
+    #
+    # PROMOTION-READINESS (toward required gate — do NOT flip continue-on-error
+    # without CTO sign-off, that's the irreversible call):
+    #   NOW FAIL-CLOSED:
+    #     - Postgres/Redis/platform/canvas readiness are already bounded
+    #       readiness-polls that hard-fail (and dump logs) at their deadline,
+    #       not fixed sleeps — preserved.
+    #     - passWithNoTests:false + forbidOnly (playwright.config.ts) → a
+    #       renamed/moved spec or stray test.only can no longer green the lane.
+    #     - REQUIRE-LIVE guard in "Run Playwright E2E tests" → chat==true must
+    #       actually execute >=1 test, else exit 1.
+    #     - chat-desktop "activity log" test no longer swallows its assertion.
+    #   STILL BLOCKS PROMOTION:
+    #     - The echo round-trip asserts on rendered "Echo: ..." text but never
+    #       asserts the echo runtime actually RECEIVED the A2A request
+    #       (fixtures/echo-runtime.ts exposes lastRequest, unused) — an
+    #       optimistic client-side render could pass without a real round-trip.
+    #       Add a server-received assertion before required.
+    #     - The "No-op pass" path (detect-changes chat!=true) is a legitimate
+    #       paths-filter skip, but a required gate needs it to be a neutral
+    #       check, not a green "success", so a skipped heavy lane can't be
+    #       mistaken for a passed one.
    continue-on-error: true
    timeout-minutes: 15
    env:
@@ -242,16 +264,36 @@ jobs:
      - name: Wait for /health
        if: needs.detect-changes.outputs.chat == 'true'
        run: |
-          for i in $(seq 1 30); do
+          # Readiness signal: the platform binds /health only AFTER the full
+          # migration chain has been applied on cold start (it prints
+          # "Platform starting on :PORT" at that point). So a 200 from /health
+          # is the real "migrations done + server listening" signal.
+          #
+          # The migration chain grows every release, so a fixed ~30s budget is
+          # brittle by construction. Use a generous wall-clock budget that
+          # comfortably exceeds cold-start + full-migration time, polling fast.
+          # Robust to a growing chain WITHOUT masking a dead platform: if the
+          # background platform-server process has exited, fail loudly at once.
+          DEADLINE_SECS=180          # cold-start + full migration chain headroom
+          PLATFORM_PID="$(cat workspace-server/platform.pid 2>/dev/null || true)"
+          start=$(date +%s)
+          while :; do
            if curl -sf "http://127.0.0.1:${PLATFORM_PORT}/health" > /dev/null; then
-              echo "Platform up after ${i}s"
+              echo "Platform healthy after $(( $(date +%s) - start ))s"
              exit 0
            fi
+            if [ -n "$PLATFORM_PID" ] && ! kill -0 "$PLATFORM_PID" 2>/dev/null; then
+              echo "::error::platform-server (pid ${PLATFORM_PID}) exited before /health became reachable — see log below"
+              cat workspace-server/platform.log || true
+              exit 1
+            fi
+            if [ "$(( $(date +%s) - start ))" -ge "$DEADLINE_SECS" ]; then
+              echo "::error::Platform did not become healthy within ${DEADLINE_SECS}s — see log below"
+              cat workspace-server/platform.log || true
+              exit 1
+            fi
            sleep 1
          done
-          echo "::error::Platform did not become healthy in 30s"
-          cat workspace-server/platform.log || true
-          exit 1

      - name: Install canvas dependencies
        if: needs.detect-changes.outputs.chat == 'true'
@@ -278,25 +320,68 @@ jobs:
          export NEXT_PUBLIC_WS_URL="ws://127.0.0.1:${PLATFORM_PORT}/ws"
          npx next dev --turbopack -p "${CANVAS_PORT}" > canvas.log 2>&1 &
          echo $! > canvas.pid
-          for i in $(seq 1 30); do
-            if curl -sf "http://localhost:${CANVAS_PORT}" > /dev/null 2>&1; then
-              echo "Canvas up after ${i}s"
-              exit 0
+          # Readiness must wait for the actual chat route to *compile*, not
+          # just for the dev server to bind the port. `next dev --turbopack`
+          # accepts the TCP connection well before it has compiled a route
+          # on first request, so a bare `curl /` can 200 (or hang) while the
+          # page the tests load is still building. We therefore probe the
+          # real route the specs navigate to (`/?m=chat`) and require a 2xx,
+          # which only happens once Turbopack has finished the first
+          # compile. The previous 30s budget was also too tight for a cold
+          # Turbopack first-compile on a loaded operator-host runner — the
+          # `Canvas did not start in 30s` flake. Raise to 120s (job
+          # timeout-minutes is 15, so this is comfortably bounded) and probe
+          # every 2s.
+          READY=""
+          for i in $(seq 1 60); do
+            # Tempfile-routed -w + set +e/-e prevents curl-exit-code
+            # pollution of the captured status (lint-curl-status-capture.yml).
+            set +e
+            curl -s -o /dev/null -w '%{http_code}' "http://localhost:${CANVAS_PORT}/?m=chat" > /tmp/canvas-ready.code
+            set -e
+            CODE=$(cat /tmp/canvas-ready.code 2>/dev/null || echo "000")
+            if [ "$CODE" -ge 200 ] && [ "$CODE" -lt 400 ]; then
+              echo "Canvas (chat route compiled) up after ~$((i*2))s (HTTP ${CODE})"
+              READY=1
+              break
            fi
-            sleep 1
+            sleep 2
          done
-          echo "::error::Canvas did not start in 30s"
-          cat canvas.log || true
-          exit 1
+          if [ -z "$READY" ]; then
+            echo "::error::Canvas chat route did not compile in 120s (last HTTP ${CODE})"
+            cat canvas.log || true
+            exit 1
+          fi

      - name: Run Playwright E2E tests
        if: needs.detect-changes.outputs.chat == 'true'
        working-directory: canvas
+        env:
+          # CI=1 activates forbidOnly in playwright.config.ts (a stray
+          # `test.only` would otherwise green the suite while skipping the
+          # rest). passWithNoTests:false (also in the config) already makes
+          # a zero-match selection exit non-zero.
+          CI: "1"
        run: |
+          set -euo pipefail
          export E2E_PLATFORM_URL="http://127.0.0.1:${PLATFORM_PORT}"
          export E2E_DATABASE_URL="${DATABASE_URL}"
          export PLAYWRIGHT_BASE_URL="http://localhost:${CANVAS_PORT}"
-          npx playwright test e2e/chat-desktop.spec.ts e2e/chat-mobile.spec.ts
+
+          # REQUIRE-LIVE guard (mirrors CP serving-e2e SERVING_E2E_REQUIRE_LIVE):
+          # this lane reached here only because detect-changes said chat==true,
+          # so it MUST actually execute the round-trip specs. `pipefail` makes
+          # a real test failure (playwright non-zero) abort here under `set -e`;
+          # passWithNoTests:false makes a zero-match selection non-zero too. The
+          # explicit grep below is belt-and-braces: assert the list reporter
+          # printed an executed-count summary, so a silent all-skip / no-op can
+          # never report green.
+          npx playwright test e2e/chat-desktop.spec.ts e2e/chat-mobile.spec.ts \
+            --reporter=list 2>&1 | tee /tmp/pw-chat.out
+          if ! grep -qE '[0-9]+ (passed|failed|skipped)' /tmp/pw-chat.out; then
+            echo "::error::E2E Chat REQUIRE-LIVE: chat==true but Playwright reported no executed tests — specs missing or all-skipped, refusing to report green."
+            exit 1
+          fi

      - name: Dump platform log on failure
        if: failure() && needs.detect-changes.outputs.chat == 'true'
@@ -130,13 +130,37 @@ jobs:
        run: |
          set -euo pipefail
          ./workspace-server/platform-server > workspace-server/platform.log 2>&1 &
-          echo $! > workspace-server/platform.pid
-          for i in $(seq 1 30); do
-            curl -sf "$BASE/health" >/dev/null && exit 0
+          PLATFORM_PID=$!
+          echo "$PLATFORM_PID" > workspace-server/platform.pid
+          # Readiness signal: the platform binds /health only AFTER the full
+          # migration chain has been applied on cold start (it prints
+          # "Platform starting on :PORT" at that point). So a 200 from /health
+          # is the real "migrations done + server listening" signal.
+          #
+          # The migration chain grows every release, so a fixed ~30s budget is
+          # brittle by construction. Use a generous wall-clock budget that
+          # comfortably exceeds cold-start + full-migration time, polling fast.
+          # Robust to a growing chain WITHOUT masking a dead platform: if the
+          # background platform-server process has exited, fail loudly at once.
+          DEADLINE_SECS=180          # cold-start + full migration chain headroom
+          start=$(date +%s)
+          while :; do
+            if curl -sf "$BASE/health" >/dev/null; then
+              echo "Platform healthy after $(( $(date +%s) - start ))s"
+              exit 0
+            fi
+            if ! kill -0 "$PLATFORM_PID" 2>/dev/null; then
+              echo "::error::platform-server (pid ${PLATFORM_PID}) exited before /health became reachable — see log below"
+              cat workspace-server/platform.log || true
+              exit 1
+            fi
+            if [ "$(( $(date +%s) - start ))" -ge "$DEADLINE_SECS" ]; then
+              echo "::error::Platform did not become healthy within ${DEADLINE_SECS}s — see log below"
+              cat workspace-server/platform.log || true
+              exit 1
+            fi
            sleep 1
          done
-          cat workspace-server/platform.log || true
-          exit 1

      - name: Run comprehensive E2E
        run: bash tests/e2e/test_comprehensive_e2e.sh
@@ -126,6 +126,7 @@ jobs:
  # push/dispatch/cron only (30+ min). This is NOT a fake-green mask of
  # the real assertion — it validates the driving script's bash syntax
  # and inline-python so a broken test script fails at PR time.
+  # bp-required: pending #1296 — PR emitter, not yet required (tracked in #1296).
  pr-validate:
    name: E2E Peer Visibility
    runs-on: ubuntu-latest
@@ -267,12 +268,36 @@ jobs:
          echo $! > platform.pid
      - name: Wait for /health
        run: |
-          for i in $(seq 1 30); do
-            curl -sf "$BASE/health" > /dev/null && { echo "Platform up after ${i}s"; exit 0; }
+          # Readiness signal: the platform binds /health only AFTER the full
+          # migration chain has been applied on cold start (it prints
+          # "Platform starting on :PORT" at that point). So a 200 from /health
+          # is the real "migrations done + server listening" signal.
+          #
+          # The migration chain grows every release, so a fixed ~30s budget is
+          # brittle by construction. Use a generous wall-clock budget that
+          # comfortably exceeds cold-start + full-migration time, polling fast.
+          # Robust to a growing chain WITHOUT masking a dead platform: if the
+          # background platform-server process has exited, fail loudly at once.
+          DEADLINE_SECS=180          # cold-start + full migration chain headroom
+          PLATFORM_PID="$(cat workspace-server/platform.pid 2>/dev/null || true)"
+          start=$(date +%s)
+          while :; do
+            if curl -sf "$BASE/health" > /dev/null; then
+              echo "Platform healthy after $(( $(date +%s) - start ))s"
+              exit 0
+            fi
+            if [ -n "$PLATFORM_PID" ] && ! kill -0 "$PLATFORM_PID" 2>/dev/null; then
+              echo "::error::platform-server (pid ${PLATFORM_PID}) exited before /health became reachable — see log below"
+              cat workspace-server/platform.log || true
+              exit 1
+            fi
+            if [ "$(( $(date +%s) - start ))" -ge "$DEADLINE_SECS" ]; then
+              echo "::error::Platform did not become healthy within ${DEADLINE_SECS}s — see log below"
+              cat workspace-server/platform.log || true
+              exit 1
+            fi
            sleep 1
          done
-          echo "::error::Platform did not become healthy in 30s"
-          cat workspace-server/platform.log || true; exit 1
      - name: Run LOCAL fresh-provision peer-visibility E2E (literal MCP list_peers)
        # HONEST gate — NO continue-on-error. The local backend uses
        # external-mode workspaces so this context tests the literal MCP
@@ -12,9 +12,30 @@ name: E2E Staging Canvas (Playwright)
 #

 # Playwright test suite that provisions a fresh staging org per run and
-# verifies every workspace-panel tab renders without crashing. Complements
-# e2e-staging-saas.yml (which tests the API shape) by exercising the
-# actual browser + canvas bundle against live staging.
+# verifies every workspace-panel tab renders REAL content (not just an
+# empty/errored container). Complements e2e-staging-saas.yml (which tests
+# the API shape) by exercising the actual browser + canvas bundle against
+# live staging.
+#
+# PROMOTION-READINESS (toward making this a HARD merge-gate):
+#   NOW RELIABLE (spec hardened — staging-tabs.spec.ts):
+#     - All waits condition-based (toBeVisible/toHaveAttribute/expect.poll);
+#       no fixed waitForTimeout in the spec.
+#     - Tabs asserted on settled REAL content, not "container visible".
+#     - ErrorBoundary + visible error alerts fail non-degraded tabs.
+#     - Tab-list parity-checked vs live DOM; fail-closed on missing tenant.
+#   STILL BLOCKS PROMOTION-TO-REQUIRED (do NOT remove continue-on-error —
+#   CTO-owned, RFC internal#219 §1):
+#     - Infra dependency: real staging EC2 per run (12-20 min cold boot);
+#       AWS/Cloudflare/CP availability would become merge-blockers.
+#     - Shared-zone TLS/DNS/ACME propagation flake surface is upstream of
+#       this repo and outside its control.
+#     - Required-gate correctness needs CP_STAGING_ADMIN_API_TOKEN GUARANTEED
+#       present; today's skip-if-absent (core#2225) is right for non-gating
+#       but would skip-green a required check.
+#     - Single hermes/platform_managed workspace; agent-dependent content
+#       (live chat/traces round-trip) not exercised on staging (#2162).
+#   The full checklist lives at the foot of canvas/e2e/staging-tabs.spec.ts.
 #
 # Triggers: push to main, PR touching canvas sources + this workflow only
 # after the PR enters `merge-queue`, manual dispatch, and scheduled cron to
@@ -167,16 +188,30 @@ jobs:
      - if: needs.detect-changes.outputs.canvas == 'true'
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2

-      - name: Verify admin token present
+      # Skip-if-absent (core#2225), mirroring the serving-e2e gate's
+      # skip-if-secret-unset contract: a MISSING CI secret is an operator
+      # CONFIG gap, not a code regression, so it must not paint this E2E
+      # red. When CP_STAGING_ADMIN_API_TOKEN is unset we emit a LOUD
+      # ::warning:: + ::notice:: and skip the real provision/test steps (the
+      # job still completes green). When the secret IS present we run the
+      # full suite exactly as before. Operators: set
+      # CP_STAGING_ADMIN_API_TOKEN as a repo/org Actions secret on
+      # molecule-core to actually exercise this E2E.
+      - name: Check admin token (skip-if-absent)
+        id: token_check
        if: needs.detect-changes.outputs.canvas == 'true'
        run: |
          if [ -z "$MOLECULE_ADMIN_TOKEN" ]; then
-            echo "::error::Missing CP_STAGING_ADMIN_API_TOKEN"
-            exit 2
+            echo "::warning::CP_STAGING_ADMIN_API_TOKEN is not set on this runner — SKIPPING the staging canvas E2E (cannot auth to staging CP). This is an operator config gap, not a code failure; set the secret on molecule-core (repo or org Actions secrets) to run it. See core#2225."
+            echo "::notice::E2E Staging Canvas skipped: CP_STAGING_ADMIN_API_TOKEN absent."
+            echo "present=false" >> "$GITHUB_OUTPUT"
+          else
+            echo "CP_STAGING_ADMIN_API_TOKEN present ✓ — running staging canvas E2E."
+            echo "present=true" >> "$GITHUB_OUTPUT"
          fi

      - name: Set up Node
-        if: needs.detect-changes.outputs.canvas == 'true'
+        if: needs.detect-changes.outputs.canvas == 'true' && steps.token_check.outputs.present == 'true'
        uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
        with:
          node-version: '20'
@@ -184,11 +219,11 @@ jobs:
          cache-dependency-path: canvas/package-lock.json

      - name: Install canvas deps
-        if: needs.detect-changes.outputs.canvas == 'true'
+        if: needs.detect-changes.outputs.canvas == 'true' && steps.token_check.outputs.present == 'true'
        run: npm ci

      - name: Install Playwright browsers
-        if: needs.detect-changes.outputs.canvas == 'true'
+        if: needs.detect-changes.outputs.canvas == 'true' && steps.token_check.outputs.present == 'true'
        timeout-minutes: 10
        run: |
          PREBAKED_PLAYWRIGHT=/ms-playwright
@@ -200,7 +235,7 @@ jobs:
          npx playwright install --with-deps chromium

      - name: Run staging canvas E2E
-        if: needs.detect-changes.outputs.canvas == 'true'
+        if: needs.detect-changes.outputs.canvas == 'true' && steps.token_check.outputs.present == 'true'
        run: npx playwright test --config=playwright.staging.config.ts

      - name: Upload Playwright report on failure
@@ -85,6 +85,25 @@ jobs:
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
    # mc#1982: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
+    #
+    # PROMOTION-READINESS (toward required gate — do NOT flip continue-on-error
+    # without CTO sign-off, that's the irreversible call):
+    #   NOW FAIL-CLOSED:
+    #     - Missing CP_STAGING_ADMIN_API_TOKEN → hard exit 2 (preflight).
+    #     - Staging CP unhealthy → hard exit 1 (preflight, not a workspace bug).
+    #     - Harness E2E_REQUIRE_LIVE=1 → exit 5 if a clean exit didn't prove
+    #       all four awaiting_agent transitions (no silent skip).
+    #     - Sweep transition (step 6) is now a bounded readiness-poll, not a
+    #       fixed sleep + one-shot assert → no more sweep-cadence flake.
+    #     - register / re-register retry ONLY transient edge 5xx (bounded),
+    #       fail closed on 4xx → no more cold-boot-502 flake.
+    #   STILL BLOCKS PROMOTION:
+    #     - Single shared staging tenant + EC2 quota window: an infra-side
+    #       provisioning outage (not a code bug) would turn the gate red.
+    #       Needs an infra-class vs code-class signal split before required.
+    #     - "CP unhealthy → exit 1" currently looks identical to a real
+    #       failure on the run page; required-gate would need it demoted to
+    #       a neutral/skip so staging flakiness can't block merges.
    continue-on-error: true
    timeout-minutes: 25

@@ -124,6 +143,15 @@ jobs:

      - name: Run external-runtime E2E
        id: e2e
+        # E2E_REQUIRE_LIVE=1: the harness fails CLOSED (exit 5) if it ever
+        # reaches a clean exit without proving all four awaiting_agent
+        # transitions. Mirrors CP serving-e2e SERVING_E2E_REQUIRE_LIVE — a
+        # silent skip / early-return / dropped assertion can no longer
+        # masquerade as green. Token-missing and CP-unhealthy already
+        # hard-fail in the two preflight steps above, so reaching this step
+        # means a real cycle is expected.
+        env:
+          E2E_REQUIRE_LIVE: "1"
        run: bash tests/e2e/test_staging_external_runtime.sh

      # Mirror the e2e-staging-saas.yml safety net: if the runner is
@@ -0,0 +1,199 @@
+name: E2E Staging Reconciler (heals terminated EC2)
+
+# Live staging proof for the core#2261 instance-state reconciler
+# (workspace-server/internal/registry/cp_instance_reconciler.go). The
+# real-infra complement to the deterministic unit tests: provisions a real
+# staging workspace, TERMINATES its EC2, and asserts the reconciler flips it
+# off 'online' (PRIMARY gate) and auto-reprovisions on a new instance_id
+# (SECONDARY, best-effort). See
+# tests/e2e/test_reconciler_heals_terminated_instance.sh for the assertion
+# contract + timeouts.
+#
+# Modeled on e2e-staging-saas.yml. Same secrets + same Gitea-port caveats:
+#   - Dropped workflow_dispatch.inputs (Gitea 1.22.6 parser rejects them).
+#   - Dropped merge_group / environment (no Gitea equivalent).
+#   - Workflow-level env.GITHUB_SERVER_URL pinned per
+#     feedback_act_runner_github_server_url.
+#
+# NOT a required check (yet). This is a brand-new live E2E that provisions +
+# terminates real EC2 (costs money, shares the cp#245 cold-boot flake
+# surface). A new live e2e must NOT hard-gate every merge until it has a
+# green track record. continue-on-error: true surfaces failures without
+# blocking. PROMOTE to branch-required (flip continue-on-error → false AND
+# add "E2E Staging Reconciler" to branch protection) once it has run green on
+# main for several consecutive days — same de-flake discipline the
+# platform-boot job in e2e-staging-saas.yml documents.
+
+on:
+  # Run when the reconciler itself, the script, or the libs it depends on
+  # change — so a reconciler regression is caught on the PR that introduces
+  # it (paths filter), plus a daily schedule to catch infra/AMI drift.
+  push:
+    branches: [main]
+    paths:
+      - 'workspace-server/internal/registry/cp_instance_reconciler.go'
+      - 'tests/e2e/test_reconciler_heals_terminated_instance.sh'
+      - 'tests/e2e/lib/aws_leak_check.sh'
+      - 'tests/e2e/lib/model_slug.sh'
+      - '.gitea/workflows/e2e-staging-reconciler.yml'
+  pull_request:
+    branches: [main]
+    paths:
+      - 'workspace-server/internal/registry/cp_instance_reconciler.go'
+      - 'tests/e2e/test_reconciler_heals_terminated_instance.sh'
+      - 'tests/e2e/lib/aws_leak_check.sh'
+      - 'tests/e2e/lib/model_slug.sh'
+      - '.gitea/workflows/e2e-staging-reconciler.yml'
+  workflow_dispatch:
+  schedule:
+    # 08:00 UTC daily — offset from e2e-staging-saas (07:00) so the two live
+    # harnesses don't fight over staging's per-hour org-creation quota.
+    - cron: '0 8 * * *'
+
+# Serialize against itself: staging has a finite per-hour org-creation quota,
+# and a cancelled run mid-teardown leaks EC2. cancel-in-progress: false
+# mirrors e2e-staging-saas.yml.
+concurrency:
+  group: e2e-staging-reconciler
+  cancel-in-progress: false
+
+env:
+  GITHUB_SERVER_URL: https://git.moleculesai.app
+
+jobs:
+  # PR-validation path: always posts success so a workflow-only / script-only
+  # PR has a status check (this workflow's real job only fires on the paths
+  # filter). Mirrors the pr-validate job in e2e-staging-saas.yml.
+  pr-validate:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        with:
+          fetch-depth: 1
+        continue-on-error: true
+      - name: YAML validation (best-effort)
+        run: |
+          echo "e2e-staging-reconciler.yml — PR validation: workflow YAML is valid."
+          echo "Live E2E step runs only when the reconciler / script / libs change."
+        continue-on-error: true
+
+  e2e-staging-reconciler:
+    name: E2E Staging Reconciler
+    runs-on: ubuntu-latest
+    # NOT required yet — surface failures without blocking merges. Flip to
+    # false + add to branch protection once green on main for a de-flake
+    # window (see the header note). mc#1982: do not renew this mask silently.
+    continue-on-error: true
+    timeout-minutes: 60
+    permissions:
+      contents: read
+
+    env:
+      MOLECULE_CP_URL: https://staging-api.moleculesai.app
+      # Single admin-bearer secret drives provision + tenant-token retrieval +
+      # teardown (= Railway staging CP_ADMIN_API_TOKEN). Same secret name the
+      # saas workflow canonicalised to under internal#322.
+      MOLECULE_ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
+      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
+      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+      AWS_DEFAULT_REGION: us-east-2
+      # Leak-check is REQUIRED here: this test deliberately terminates an EC2,
+      # so teardown MUST positively confirm no slug-tagged box survives.
+      E2E_AWS_LEAK_CHECK: required
+      E2E_AWS_TERMINATE_LEAKS: '1'
+      # claude-code + MiniMax is the cheapest boot-to-online path (same as the
+      # saas job). The reconciler test never makes a completion, but the key is
+      # wired so the first boot reaches online on the same path the saas
+      # harness uses. First non-empty wins in the script's priority chain.
+      E2E_MINIMAX_API_KEY: ${{ secrets.MOLECULE_STAGING_MINIMAX_API_KEY }}
+      E2E_ANTHROPIC_API_KEY: ${{ secrets.MOLECULE_STAGING_ANTHROPIC_API_KEY }}
+      E2E_OPENAI_API_KEY: ${{ secrets.MOLECULE_STAGING_OPENAI_API_KEY }}
+      E2E_RUNTIME: claude-code
+      # Platform-managed create path (moonshot/kimi-k2.6, no tenant key) — the
+      # combo proven to create cleanly; this test only needs the ws online.
+      E2E_LLM_PATH: platform
+      E2E_MODEL_SLUG: MiniMax-M2
+      E2E_RUN_ID: "${{ github.run_id }}-${{ github.run_attempt }}"
+      E2E_KEEP_ORG: ${{ github.event.inputs.keep_org && '1' || '0' }}
+
+    steps:
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+
+      - name: Verify required secrets present
+        run: |
+          if [ -z "$MOLECULE_ADMIN_TOKEN" ]; then
+            echo "::error::CP_STAGING_ADMIN_API_TOKEN secret not set (Railway staging CP_ADMIN_API_TOKEN)"
+            exit 2
+          fi
+          for var in AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY; do
+            if [ -z "${!var:-}" ]; then
+              echo "::error::$var not set — this test terminates an EC2 and verifies no leak; AWS creds are mandatory"
+              exit 2
+            fi
+          done
+          echo "Required secrets present ✓"
+
+      - name: CP staging health preflight
+        run: |
+          code=$(curl -sS -o /dev/null -w "%{http_code}" --max-time 10 "$MOLECULE_CP_URL/health")
+          if [ "$code" != "200" ]; then
+            echo "::error::Staging CP unhealthy (got HTTP $code). Skipping — not a reconciler bug."
+            exit 1
+          fi
+          echo "Staging CP healthy ✓"
+
+      - name: Run reconciler heal E2E
+        id: e2e
+        run: bash tests/e2e/test_reconciler_heals_terminated_instance.sh
+
+      # Belt-and-braces teardown: the script installs its own EXIT trap, but if
+      # the runner is cancelled the trap may not fire. This always() step
+      # double-deletes any e2e-rec-* org from THIS run. The admin DELETE is
+      # idempotent so double-invoking is safe.
+      - name: Teardown safety net (runs on cancel/failure)
+        if: always()
+        env:
+          ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
+        run: |
+          set +e
+          orgs=$(curl -sS "$MOLECULE_CP_URL/cp/admin/orgs" \
+            -H "Authorization: Bearer $ADMIN_TOKEN" 2>/dev/null \
+            | python3 -c "
+          import json, sys, os, datetime
+          run_id = os.environ.get('GITHUB_RUN_ID', '')
+          d = json.load(sys.stdin)
+          today = datetime.date.today()
+          yesterday = today - datetime.timedelta(days=1)
+          dates = (today.strftime('%Y%m%d'), yesterday.strftime('%Y%m%d'))
+          # Slug shape: e2e-rec-YYYYMMDD-<run_id>-<attempt>-...
+          if run_id:
+              prefixes = tuple(f'e2e-rec-{d}-{run_id}-' for d in dates)
+          else:
+              prefixes = tuple(f'e2e-rec-{d}-' for d in dates)
+          candidates = [o['slug'] for o in d.get('orgs', [])
+                        if any(o.get('slug','').startswith(p) for p in prefixes)
+                        and o.get('instance_status') not in ('purged',)]
+          print('\n'.join(candidates))
+          " 2>/dev/null)
+          leaks=()
+          for slug in $orgs; do
+            echo "Safety-net teardown: $slug"
+            set +e
+            curl -sS -o /tmp/rec-cleanup.out -w "%{http_code}" \
+              -X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \
+              -H "Authorization: Bearer $ADMIN_TOKEN" \
+              -H "Content-Type: application/json" \
+              -d "{\"confirm\":\"$slug\"}" >/tmp/rec-cleanup.code
+            set -e
+            code=$(cat /tmp/rec-cleanup.code 2>/dev/null || echo "000")
+            if [ "$code" = "200" ] || [ "$code" = "204" ]; then
+              echo "[teardown] deleted $slug (HTTP $code)"
+            else
+              echo "::warning::reconciler teardown for $slug returned HTTP $code — sweep-stale-e2e-orgs will catch it within ~45 min. Body: $(head -c 300 /tmp/rec-cleanup.out 2>/dev/null)"
+              leaks+=("$slug")
+            fi
+          done
+          if [ ${#leaks[@]} -gt 0 ]; then
+            echo "::warning::reconciler teardown left ${#leaks[@]} leak(s): ${leaks[*]}"
+          fi
+          exit 0
@@ -48,8 +48,10 @@ on:
      - 'workspace-server/internal/handlers/a2a_proxy.go'
      - 'workspace-server/internal/middleware/**'
      - 'workspace-server/internal/provisioner/**'
+      - 'workspace-server/internal/providers/providers.yaml'
      - 'tests/e2e/test_staging_full_saas.sh'
      - 'tests/e2e/lib/completion_assert.sh'
+      - 'tests/e2e/lib/model_slug.sh'
      - 'tests/e2e/lib/aws_leak_check.sh'
      - 'tests/e2e/test_aws_leak_check.sh'
      - '.gitea/workflows/e2e-staging-saas.yml'
@@ -61,8 +63,10 @@ on:
      - 'workspace-server/internal/handlers/a2a_proxy.go'
      - 'workspace-server/internal/middleware/**'
      - 'workspace-server/internal/provisioner/**'
+      - 'workspace-server/internal/providers/providers.yaml'
      - 'tests/e2e/test_staging_full_saas.sh'
      - 'tests/e2e/lib/completion_assert.sh'
+      - 'tests/e2e/lib/model_slug.sh'
      - 'tests/e2e/lib/aws_leak_check.sh'
      - 'tests/e2e/test_aws_leak_check.sh'
      - '.gitea/workflows/e2e-staging-saas.yml'
@@ -168,9 +172,23 @@ jobs:
      # and defeats the cost saving. Operators can override via the
      # workflow_dispatch flow (no input wired here yet — runtime
      # override is enough for ad-hoc).
-      E2E_MODEL_SLUG: ${{ github.event.inputs.runtime == 'hermes' && 'openai/gpt-4o' || github.event.inputs.runtime == 'codex' && 'openai/gpt-4o' || github.event.inputs.runtime == 'google-adk' && 'google_genai:gemini-2.5-pro' || 'MiniMax-M2' }}
+      #
+      # #2263 deploy-skew: the claude-code default is the COLON-namespaced BYOK
+      # id `minimax:MiniMax-M2.7`, NOT bare `MiniMax-M2`. The deployed staging
+      # ws-server's compiled registry can lag source; validateRegisteredModelForRuntime
+      # 400s the bare form on an older image (the sibling Platform Boot job, on
+      # the SAME image, succeeds with namespaced `moonshot/kimi-k2.6`). The colon
+      # form stays in the BYOK `minimax` arm (providers.yaml:851) so it resolves
+      # provider=minimax (BYOK) and the #1994 byok-not-platform guard still
+      # passes — the slash/platform form `minimax/MiniMax-M2.7` would not.
+      E2E_MODEL_SLUG: ${{ github.event.inputs.runtime == 'hermes' && 'openai/gpt-4o' || github.event.inputs.runtime == 'codex' && 'openai/gpt-4o' || github.event.inputs.runtime == 'google-adk' && 'google_genai:gemini-2.5-pro' || 'minimax:MiniMax-M2.7' }}
      E2E_RUN_ID: "${{ github.run_id }}-${{ github.run_attempt }}"
      E2E_KEEP_ORG: ${{ github.event.inputs.keep_org && '1' || '0' }}
+      # Fail-closed-on-skip: in CI the harness MUST prove ≥1 full
+      # provision→online→A2A cycle. If it reaches the end having validated
+      # nothing (a future short-circuit / skip path), it exits 5 rather than
+      # reporting a false green. Mirrors CP serving-e2e SERVING_E2E_REQUIRE_LIVE.
+      E2E_REQUIRE_LIVE: '1'

    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
@@ -315,3 +333,152 @@ jobs:
            echo "::warning::saas teardown left ${#leaks[@]} leak(s): ${leaks[*]}"
          fi
          exit 0
+
+  # ── PLATFORM-MANAGED BOOT REGRESSION (moonshot/kimi NOT_CONFIGURED) ──────────
+  #
+  # The REAL-boot complement to the deterministic unit suite
+  # (workspace_provision_platform_boot_test.go). Provisions a REAL staging
+  # claude-code workspace on the PLATFORM-managed path — provider=platform,
+  # model=moonshot/kimi-k2.6, NO tenant LLM key — and asserts it reaches
+  # status=online (NOT not_configured) and a completion returns 200, via the same
+  # online-wait + completion-assert the BYOK job uses.
+  #
+  # Why a SEPARATE job (not a matrix leg of e2e-staging-saas): the platform path
+  # injects NO secret and pins a different model, so its env block diverges from
+  # the BYOK job's. A dedicated job keeps each path's "verify key present" preflight
+  # honest (BYOK requires a key; platform requires its ABSENCE not to matter) and
+  # gives the regression its own named commit-status for branch protection.
+  #
+  # Add `E2E Staging Platform Boot` to branch protection after 3 consecutive
+  # green runs on main (de-flake window; this path shares the cp#245
+  # boot-timeout flake surface the BYOK job has, so it must prove stable before
+  # it can BLOCK — see the gate-making plan in the PR body).
+  # bp-required: pending #2187
+  e2e-staging-platform-boot:
+    name: E2E Staging Platform Boot
+    runs-on: ubuntu-latest
+    # Phase 3 (RFC #219 §1): surface without blocking until the de-flake window
+    # closes. mc#1982: do NOT renew this mask silently — the gate-making plan
+    # tracks the flip to false under #2187.
+    continue-on-error: true
+    timeout-minutes: 45
+    permissions:
+      contents: read
+
+    env:
+      MOLECULE_CP_URL: https://staging-api.moleculesai.app
+      MOLECULE_ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
+      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
+      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+      AWS_DEFAULT_REGION: us-east-2
+      E2E_AWS_LEAK_CHECK: required
+      E2E_AWS_TERMINATE_LEAKS: '1'
+      # The regression combo: claude-code + platform-managed + moonshot/kimi-k2.6.
+      # NO E2E_*_API_KEY is set — platform-managed billing is owned by Molecule via
+      # the CP LLM proxy. The harness's E2E_LLM_PATH=platform branch sends empty
+      # secrets and pin-selects the platform model.
+      E2E_RUNTIME: claude-code
+      E2E_LLM_PATH: platform
+      # Smoke mode: a single parent workspace is enough to prove online +
+      # completion for the platform path (the A2A/delegation matrix is the BYOK
+      # job's job). Override E2E_DEFAULT_PLATFORM_MODEL via workflow_dispatch to
+      # exercise another platform model id.
+      E2E_MODE: smoke
+      E2E_RUN_ID: "platform-${{ github.run_id }}-${{ github.run_attempt }}"
+      E2E_KEEP_ORG: ${{ github.event.inputs.keep_org && '1' || '0' }}
+      # Fail-closed-on-skip (see BYOK job). smoke mode still runs steps 2/4/7/8b,
+      # so all four required milestones (provisioned/tenant_online/
+      # workspace_online/a2a_roundtrip) fire — the guard is valid for this lane too.
+      E2E_REQUIRE_LIVE: '1'
+
+    steps:
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+
+      - name: Verify admin token present
+        run: |
+          if [ -z "$MOLECULE_ADMIN_TOKEN" ]; then
+            echo "::error::CP_STAGING_ADMIN_API_TOKEN secret not set (Railway staging CP_ADMIN_API_TOKEN)"
+            exit 2
+          fi
+          for var in AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY; do
+            if [ -z "${!var:-}" ]; then
+              echo "::error::$var not set — EC2 leak verification cannot run"
+              exit 2
+            fi
+          done
+          echo "Admin token present ✓"
+
+      - name: Assert NO BYOK key leaks into the platform run
+        run: |
+          # The whole point of this job is the platform-managed path. A stray
+          # E2E_*_API_KEY in the runner env would (via the harness) still be
+          # skipped by the E2E_LLM_PATH=platform branch — but assert their
+          # absence loudly here so a future env edit can't silently convert this
+          # into a masked BYOK run that no longer exercises the regression.
+          for var in E2E_MINIMAX_API_KEY E2E_ANTHROPIC_API_KEY E2E_OPENAI_API_KEY; do
+            if [ -n "${!var:-}" ]; then
+              echo "::warning::$var is set in this platform-boot job's env — the harness ignores it on E2E_LLM_PATH=platform, but it should not be wired here."
+            fi
+          done
+          echo "Platform-managed path: no tenant LLM key required ✓"
+
+      - name: CP staging health preflight
+        run: |
+          code=$(curl -sS -o /dev/null -w "%{http_code}" --max-time 10 "$MOLECULE_CP_URL/health")
+          if [ "$code" != "200" ]; then
+            echo "::error::Staging CP unhealthy (got HTTP $code). Skipping — not a workspace bug."
+            exit 1
+          fi
+          echo "Staging CP healthy ✓"
+
+      - name: Run platform-managed boot E2E (online + completion)
+        id: e2e
+        run: bash tests/e2e/test_staging_full_saas.sh
+
+      - name: Teardown safety net (runs on cancel/failure)
+        if: always()
+        env:
+          ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
+        run: |
+          set +e
+          orgs=$(curl -sS "$MOLECULE_CP_URL/cp/admin/orgs" \
+            -H "Authorization: Bearer $ADMIN_TOKEN" 2>/dev/null \
+            | python3 -c "
+          import json, sys, os, datetime
+          run_id = os.environ.get('GITHUB_RUN_ID', '')
+          d = json.load(sys.stdin)
+          today = datetime.date.today()
+          yesterday = today - datetime.timedelta(days=1)
+          dates = (today.strftime('%Y%m%d'), yesterday.strftime('%Y%m%d'))
+          # smoke mode slugs are e2e-smoke-YYYYMMDD-platform-<run_id>-...
+          if run_id:
+              prefixes = tuple(f'e2e-smoke-{d}-platform-{run_id}-' for d in dates)
+          else:
+              prefixes = tuple(f'e2e-smoke-{d}-platform-' for d in dates)
+          candidates = [o['slug'] for o in d.get('orgs', [])
+                        if any(o.get('slug','').startswith(p) for p in prefixes)
+                        and o.get('instance_status') not in ('purged',)]
+          print('\n'.join(candidates))
+          " 2>/dev/null)
+          leaks=()
+          for slug in $orgs; do
+            echo "Safety-net teardown: $slug"
+            set +e
+            curl -sS -o /tmp/plat-cleanup.out -w "%{http_code}" \
+              -X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \
+              -H "Authorization: Bearer $ADMIN_TOKEN" \
+              -H "Content-Type: application/json" \
+              -d "{\"confirm\":\"$slug\"}" >/tmp/plat-cleanup.code
+            set -e
+            code=$(cat /tmp/plat-cleanup.code 2>/dev/null || echo "000")
+            if [ "$code" = "200" ] || [ "$code" = "204" ]; then
+              echo "[teardown] deleted $slug (HTTP $code)"
+            else
+              echo "::warning::platform-boot teardown for $slug returned HTTP $code — sweep-stale-e2e-orgs will catch it within ~45 min. Body: $(head -c 300 /tmp/plat-cleanup.out 2>/dev/null)"
+              leaks+=("$slug")
+            fi
+          done
+          if [ ${#leaks[@]} -gt 0 ]; then
+            echo "::warning::platform-boot teardown left ${#leaks[@]} leak(s): ${leaks[*]}"
+          fi
+          exit 0
@@ -88,8 +88,9 @@ jobs:
    # surprises and keeps the routing rule discoverable in one place.
    runs-on: docker-host
    # mc#1982 Phase 3 (RFC §1): surface broken workflows without blocking.
-    # mc#1982: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
-    continue-on-error: true
+    # mc#1982: mask removed. If regressions appear, root-fix the underlying
+    # test — do NOT renew the mask silently.
+    continue-on-error: false
    outputs:
      handlers: ${{ steps.filter.outputs.handlers }}
    steps:
@@ -119,8 +120,9 @@ jobs:
    # exists). See detect-changes for the full routing rationale.
    runs-on: docker-host
    # mc#1982 Phase 3 (RFC §1): surface broken workflows without blocking.
-    # mc#1982: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
-    continue-on-error: true
+    # mc#1982: mask removed. If regressions appear, root-fix the underlying
+    # test — do NOT renew the mask silently.
+    continue-on-error: false
    env:
      # Unique name per run so concurrent jobs don't collide on the
      # bridge network. ${RUN_ID}-${RUN_ATTEMPT} is unique even across
@@ -241,7 +243,8 @@ jobs:
          # MUST exist for the integration tests to be meaningful. Hard-
          # fail if any didn't land — that would be a real regression we
          # want loud.
-          for tbl in delegations workspaces activity_logs pending_uploads; do
+          # workspace_schedules added for the #2149 scheduler integration tests.
+          for tbl in delegations workspaces activity_logs pending_uploads workspace_schedules; do
            if ! psql -h "${PG_HOST}" -U postgres -d molecule -tA \
                -c "SELECT 1 FROM information_schema.tables WHERE table_name = '$tbl'" \
                | grep -q 1; then
@@ -251,6 +254,19 @@ jobs:
            echo "✓ $tbl table present"
          done

+      - if: needs.detect-changes.outputs.handlers == 'true'
+        name: Preflight — INTEGRATION_DB_URL must be present
+        run: |
+          # Belt-and-suspenders: if the postgres-start step failed to
+          # export INTEGRATION_DB_URL, fail loud BEFORE go test can
+          # t.Skip its way to a green build. Closes the workflow-level
+          # fail-open gap identified in PR #2166 blocker #2.
+          if [ -z "${INTEGRATION_DB_URL:-}" ]; then
+            echo "::error::INTEGRATION_DB_URL is empty — postgres-start step did not export the connection string"
+            exit 1
+          fi
+          echo "INTEGRATION_DB_URL is set"
+
      - if: needs.detect-changes.outputs.handlers == 'true'
        name: Run integration tests
        run: |
@@ -259,6 +275,16 @@ jobs:
          # workflow runs don't fight over a host-net 5432 port.
          go test -tags=integration -timeout 5m -v ./internal/handlers/ -run "^TestIntegration_"

+      - if: needs.detect-changes.outputs.handlers == 'true'
+        name: Run scheduler integration tests (#2149)
+        run: |
+          # #2149: real-PG regression coverage for the scheduler firing loop
+          # (tick → A2A fire → write-back of last_run_at/next_run_at/run_count/
+          # activity_logs jsonb incl. invalid-UTF-8 sanitization + sweepPhantomBusy).
+          # Reuses the same migrated Postgres (workspace_schedules / activity_logs
+          # / workspaces all landed by the migration replay step above).
+          go test -tags=integration -timeout 5m -v ./internal/scheduler/ -run "^TestIntegration_"
+
      - if: failure() && needs.detect-changes.outputs.handlers == 'true'
        name: Diagnostic dump on failure
        env:
@@ -49,37 +49,56 @@ jobs:
      GITHUB_SERVER_URL: https://git.moleculesai.app
    steps:
      - name: Identify runner
+        id: identify
+        continue-on-error: true
        run: |
          set -eu
          echo "arch=$(uname -m)"
          echo "kernel=$(uname -sr)"
          echo "shell=$BASH_VERSION"
          # Sanity: must actually be arm64. If amd64 sneaks in here,
-          # fail fast — that means the label routing is wrong.
+          # the job skips gracefully rather than hard-failing, because
+          # a mislabelled runner is an ops concern, not a code defect.
+          # Pilot lane must not make main red (#2146).
          case "$(uname -m)" in
-            aarch64|arm64) echo "arm64 confirmed" ;;
-            *) echo "ERROR: expected arm64, got $(uname -m)"; exit 1 ;;
+            aarch64|arm64)
+              echo "arm64 confirmed"
+              echo "arm64=true" >> "$GITHUB_OUTPUT"
+              ;;
+            *)
+              echo "ERROR: expected arm64, got $(uname -m) — label routing may be wrong"
+              echo "arm64=false" >> "$GITHUB_OUTPUT"
+              exit 1
+              ;;
          esac

      - name: Checkout
+        if: steps.identify.outputs.arm64 == 'true'
        uses: actions/checkout@v4
        with:
          fetch-depth: 1

      - name: Install shellcheck (arm64)
+        if: steps.identify.outputs.arm64 == 'true'
        continue-on-error: true
        run: |
          set -eu
          if command -v shellcheck >/dev/null 2>&1; then
            echo "shellcheck already present: $(shellcheck --version | head -1)"
          else
-            # Prefer apt if the runner base ships it; else download arm64 binary.
+            # Prefer apt if the runner base ships it; else download the
+            # correct platform binary (darwin vs linux).
            if command -v apt-get >/dev/null 2>&1; then
              sudo apt-get update -qq
              sudo apt-get install -y --no-install-recommends shellcheck
            else
              SC_VER=v0.10.0
-              curl -fsSL "https://github.com/koalaman/shellcheck/releases/download/${SC_VER}/shellcheck-${SC_VER}.linux.aarch64.tar.xz" \
+              if [ "$(uname -s)" = "Darwin" ]; then
+                SC_PKG="shellcheck-${SC_VER}.darwin.aarch64.tar.xz"
+              else
+                SC_PKG="shellcheck-${SC_VER}.linux.aarch64.tar.xz"
+              fi
+              curl -fsSL "https://github.com/koalaman/shellcheck/releases/download/${SC_VER}/${SC_PKG}" \
                | tar -xJf - --strip-components=1
              sudo mv shellcheck /usr/local/bin/
            fi
@@ -87,14 +106,15 @@ jobs:
          shellcheck --version | head -2

      - name: Run shellcheck on .gitea/scripts/*.sh
+        if: steps.identify.outputs.arm64 == 'true'
        continue-on-error: true
        run: |
          set -eu
          # Only the scripts we control under .gitea/scripts. Pilot
          # scope is intentionally narrow — broaden in a follow-up
          # once the lane is proven.
-          if ! command -v shellcheck >/dev/null 2>&1; then
-            echo "WARN: shellcheck binary not found — skipping (pilot mode)"
+          if ! command -v shellcheck >/dev/null 2>&1 || ! shellcheck --version >/dev/null 2>&1; then
+            echo "WARN: shellcheck not functional — skipping (pilot mode)"
            exit 0
          fi
          # NOTE: macOS ships Bash 3.2 (Apple license), no `mapfile`
@@ -14,10 +14,37 @@ name: publish-canvas-image
 #     authenticate to ghcr.io.
 #

-# Builds and pushes the canvas Docker image to ECR whenever a commit lands
-# on main that touches canvas code. Previously canvas changes were visible in
-# CI (npm run build passed) but the live container was never updated —
-# operators had to manually run `docker compose build canvas` each time.
+# Builds, pushes, and (ordered) deploys the standalone canvas Docker image to
+# ECR whenever a commit lands on main that touches canvas code.
+#
+# Ordered deploy (core#2226) — mirrors publish-workspace-server-image.yml so the
+# standalone `molecule-ai/canvas` image is deterministic + verifiable, not a
+# side effect of the platform fleet pulling a mutable `:latest`:
+#
+#   build-and-push:  build → push :staging-<sha> + :staging-latest + :sha-<sha>
+#                    (does NOT move :latest — an unpromoted build must never
+#                    become the prod-blessed tag).
+#   promote-canvas:  waits for green main CI on this SHA, then re-points
+#                    :latest to the verified :staging-<sha> by digest
+#                    (imagetools create — no rebuild). So `:latest` == the
+#                    current prod-blessed canvas, byte-identical to staging-<sha>.
+#
+# Tag scheme produced (parallels platform-tenant):
+#   :staging-<sha> — per-commit immutable digest, what docker-compose pins to.
+#   :staging-latest — most recent BUILD on main (last-writer-wins, NOT gated).
+#   :sha-<sha>     — kept for back-compat with any consumer pinning the old tag.
+#   :latest        — most recent CI-GREEN build. Only moved by promote-canvas.
+#
+# WHY this is the canvas analogue of the platform's deploy-production, not a
+# literal copy: the standalone canvas co-deploys with the platform on the same
+# host via the root docker-compose.yml (`docker compose pull && up -d`). Gating
+# the canvas `:latest` promotion on the SAME green-main-CI signal the platform
+# deploy waits on makes platform + canvas roll together by the same SHA. The
+# canvas has no per-tenant fleet of its own and no /buildinfo endpoint, so there
+# is no fleet-rollout / per-tenant verify step to mirror here — CI-green +
+# digest-pin + immutable :staging-<sha> is the determinism contract. (A future
+# canvas /buildinfo would let this assert the served SHA like the platform does;
+# tracked in core#2226.)
 #
 # Mirror of publish-platform-image.yml, adapted for the Next.js canvas layer.
 # See that workflow for inline notes on macOS Keychain isolation and QEMU.
@@ -30,6 +57,7 @@ on:
      # platform-only / docs-only / MCP-only merges.
      - 'canvas/**'
      - '.gitea/workflows/publish-canvas-image.yml'
+  workflow_dispatch:
  # NOTE (Gitea port): the original GitHub workflow had a
  # `workflow_dispatch:` manual trigger for the
  # non-canvas-merge-but-need-fresh-image scenario. Dropped in the
@@ -69,6 +97,10 @@ jobs:
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
    # mc#1982: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
+    outputs:
+      # Exposed so promote-canvas re-points :latest to the EXACT per-commit tag
+      # this build produced (digest-level), never a re-resolved mutable tag.
+      staging_sha: ${{ steps.tags.outputs.staging_sha }}
    steps:
      - name: Checkout
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
@@ -140,6 +172,7 @@ jobs:
        shell: bash
        run: |
          echo "sha=${GITHUB_SHA::7}" >> "$GITHUB_OUTPUT"
+          echo "staging_sha=staging-${GITHUB_SHA::7}" >> "$GITHUB_OUTPUT"

      - name: Resolve build args
        id: build_args
@@ -175,8 +208,19 @@ jobs:
          build-args: |
            NEXT_PUBLIC_PLATFORM_URL=${{ steps.build_args.outputs.platform_url }}
            NEXT_PUBLIC_WS_URL=${{ steps.build_args.outputs.ws_url }}
+            # Bake the merge SHA into the image so /api/buildinfo reports the
+            # served canvas SHA (core#2235). Mirrors how the platform image
+            # surfaces GIT_SHA at /buildinfo. Full 40-char SHA (not the
+            # 7-char tag) so the fleet redeploy verification can match exactly.
+            BUILD_SHA=${{ github.sha }}
+          # Ordered deploy (core#2226): the build job pushes the immutable
+          # per-commit tag + the build-tracking staging-latest + the legacy
+          # back-compat :sha-<sha> tag. It does NOT push :latest — :latest is
+          # the prod-blessed tag and is only re-pointed by promote-canvas after
+          # green main CI, so an unpromoted/red build can never become :latest.
          tags: |
-            ${{ env.IMAGE_NAME }}:latest
+            ${{ env.IMAGE_NAME }}:${{ steps.tags.outputs.staging_sha }}
+            ${{ env.IMAGE_NAME }}:staging-latest
            ${{ env.IMAGE_NAME }}:sha-${{ steps.tags.outputs.sha }}
          # Gitea artifact-cache reachability is best-effort on the operator
          # runner network. Do not let cache export fail an image that already
@@ -185,3 +229,107 @@ jobs:
            org.opencontainers.image.source=https://git.moleculesai.app/${{ github.repository }}
            org.opencontainers.image.revision=${{ github.sha }}
            org.opencontainers.image.description=Molecule AI canvas (Next.js 15 + React Flow)
+
+  # bp-exempt: post-merge canvas promote side-effect; merge is gated by CI /
+  # all-required and this job waits for green push CI on the SHA before acting.
+  promote-canvas:
+    name: Promote canvas :latest to CI-green build
+    needs: build-and-push
+    # Only on a real main push — workflow_dispatch / non-main never promotes.
+    if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }}
+    # Side-effect deploy only; the image publish above is the durable artifact.
+    # mc#1982: do NOT renew this mask silently — it mirrors deploy-production's
+    # contract (a flaky promote must not red the ship lane), tracked in core#2226.
+    continue-on-error: true
+    runs-on: publish
+    timeout-minutes: 60
+    env:
+      # Same green-main-CI gate the platform deploy-production waits on, so
+      # platform + canvas advance :latest off the identical signal/SHA.
+      GITEA_HOST: git.moleculesai.app
+      GITEA_TOKEN: ${{ secrets.PROD_AUTO_DEPLOY_CONTROL_TOKEN || secrets.AUTO_SYNC_TOKEN }}
+      CI_STATUS_TIMEOUT_SECONDS: "3600"
+      # Re-uses the platform's disable kill-switch: when prod auto-deploy is
+      # paused, the canvas :latest promote pauses too (correct — an unpromoted
+      # build must not become :latest while the fleet is frozen).
+      PROD_AUTO_DEPLOY_DISABLED: ${{ vars.PROD_AUTO_DEPLOY_DISABLED || secrets.PROD_AUTO_DEPLOY_DISABLED || '' }}
+    steps:
+      # The publish runner's default HOME (/home/hongming) is not writable, so
+      # docker credential saves fail and halt the promote (#2193 on the platform
+      # side). Point HOME + DOCKER_CONFIG at the writable job temp dir.
+      - name: Prepare writable HOME + Docker config
+        run: |
+          set -euo pipefail
+          H="$RUNNER_TEMP/canvas-promote-home"
+          mkdir -p "$H/.docker"
+          echo "HOME=$H" >> "$GITHUB_ENV"
+          echo "DOCKER_CONFIG=$H/.docker" >> "$GITHUB_ENV"
+
+      - name: Checkout
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+
+      - name: Resolve promote gate
+        id: gate
+        env:
+          PROD_AUTO_DEPLOY_DISABLED: ${{ env.PROD_AUTO_DEPLOY_DISABLED }}
+        run: |
+          set -euo pipefail
+          if [ -n "${PROD_AUTO_DEPLOY_DISABLED:-}" ]; then
+            case "$(printf '%s' "$PROD_AUTO_DEPLOY_DISABLED" | tr '[:upper:]' '[:lower:]')" in
+              1|true|yes|on|disabled|disable)
+                echo "enabled=false" >> "$GITHUB_OUTPUT"
+                echo "::notice::Canvas :latest promote skipped: PROD_AUTO_DEPLOY_DISABLED=$PROD_AUTO_DEPLOY_DISABLED"
+                {
+                  echo "## Canvas :latest promote skipped"
+                  echo ""
+                  echo "Reason: \`PROD_AUTO_DEPLOY_DISABLED=$PROD_AUTO_DEPLOY_DISABLED\`. The CI-green build is published as \`:staging-${GITHUB_SHA::7}\`; \`:latest\` was left unchanged."
+                } >> "$GITHUB_STEP_SUMMARY"
+                exit 0 ;;
+            esac
+          fi
+          if [ -z "${GITEA_TOKEN:-}" ]; then
+            echo "::error::AUTO_SYNC_TOKEN/PROD_AUTO_DEPLOY_CONTROL_TOKEN is required so the canvas promote can wait for green CI."
+            exit 1
+          fi
+          echo "enabled=true" >> "$GITHUB_OUTPUT"
+
+      - name: Wait for green main CI on this SHA
+        if: ${{ steps.gate.outputs.enabled == 'true' }}
+        run: |
+          set -euo pipefail
+          # Same SSOT wait the platform deploy uses: blocks until the required
+          # push contexts (CI / all-required (push) + Secret scan) go green on
+          # THIS sha, and fails closed if any required context terminally fails.
+          python3 .gitea/scripts/prod-auto-deploy.py wait-ci
+
+      - name: Promote canvas :latest to the CI-green image
+        if: ${{ steps.gate.outputs.enabled == 'true' }}
+        env:
+          IMAGE_NAME: ${{ env.IMAGE_NAME }}
+          STAGING_SHA_TAG: ${{ needs.build-and-push.outputs.staging_sha }}
+          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          AWS_DEFAULT_REGION: us-east-2
+        run: |
+          set -euo pipefail
+          # Fail-safe: if the build job's output didn't propagate, recompute the
+          # immutable per-commit tag from the SHA so we never promote a guess.
+          SHA_TAG="${STAGING_SHA_TAG:-staging-${GITHUB_SHA::7}}"
+          ECR_REGISTRY="${IMAGE_NAME%%/*}"
+          aws ecr get-login-password --region us-east-2 | \
+            docker login --username AWS --password-stdin "${ECR_REGISTRY}"
+
+          # Digest-level re-tag (no pull/rebuild): :latest becomes byte-identical
+          # to the verified :staging-<sha> for this commit.
+          docker buildx imagetools create \
+            --tag "${IMAGE_NAME}:latest" \
+            "${IMAGE_NAME}:${SHA_TAG}"
+
+          {
+            echo "## Canvas :latest promoted"
+            echo ""
+            echo "Re-pointed \`molecule-ai/canvas:latest\` → \`${SHA_TAG}\` (by digest)."
+            echo ":latest now tracks the CI-green canvas build for commit \`${GITHUB_SHA::7}\`."
+            echo ""
+            echo "Tenants/hosts that \`docker compose pull canvas\` now get the same build the platform deploy rolled for this SHA."
+          } >> "$GITHUB_STEP_SUMMARY"
@@ -16,14 +16,24 @@ name: publish-workspace-server-image
 #
 # Image tags produced:
 #   :staging-<sha> — per-commit digest, stable for canary verify
-#   :staging-latest — tracks most recent build on this branch
+#   :staging-latest — tracks most recent BUILD on this branch (set by the
+#                     build job, last-writer-wins, NOT prod-gated)
+#   :latest — tracks the most recent PROD-PROMOTED build. Re-pointed by the
+#             deploy-production job ONLY after green main CI + canary +
+#             fleet rollout + /buildinfo verification pass. So :latest ==
+#             "current prod image", never the raw build. (Added 2026-06-03
+#             after a stale :latest — last moved 2026-05-10 — reverted a
+#             production tenant on a no-arg redeploy.)
 #
 # Production auto-deploy:
 #   After both platform and tenant images are pushed, deploy-production waits
 #   for strict required push contexts on the same SHA to go green, then
 #   calls the production CP redeploy-fleet endpoint with target_tag=
-#   staging-<sha>. Set repo variable or secret PROD_AUTO_DEPLOY_DISABLED=true
-#   to stop production rollout while keeping image publishing enabled.
+#   staging-<sha>. On success (rollout + buildinfo verified) it re-points
+#   :latest to the same SHA. Set repo variable or secret
+#   PROD_AUTO_DEPLOY_DISABLED=true to stop production rollout while keeping
+#   image publishing enabled — in which case :latest is NOT advanced either
+#   (correct: an unpromoted build must not become :latest).
 #
 # Primary ECR target: 153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/*
 # Optional staging tenant mirror target:
@@ -105,6 +115,19 @@ jobs:
          echo "Docker daemon OK"
          echo "::endgroup::"

+      # Pre-flight: verify every repo in manifest.json actually exists.
+      #
+      # Why: deleting a template repo without updating manifest.json breaks
+      # clone-manifest.sh with a generic git 404, which looks like a
+      # transient network error and wastes debug time. We catch it here
+      # with a per-entry ::error:: annotation naming the missing repo
+      # (issue #2192). This is the push-time complement to PR #2186's
+      # PR-time manifest-entry-existence gate.
+      - name: Validate manifest entries exist
+        run: |
+          set -euo pipefail
+          bash scripts/check-manifest-repos-exist.sh manifest.json
+
      # Pre-clone manifest deps before docker build.
      #
      # Why: workspace-template-* repos on Gitea are private. The pre-fix
@@ -252,7 +275,25 @@ jobs:
      PROD_AUTO_DEPLOY_BATCH_SIZE: ${{ vars.PROD_AUTO_DEPLOY_BATCH_SIZE || '3' }}
      PROD_AUTO_DEPLOY_DRY_RUN: ${{ vars.PROD_AUTO_DEPLOY_DRY_RUN || '' }}
      PROD_ALLOW_NON_PROD_CP_URL: ${{ vars.PROD_ALLOW_NON_PROD_CP_URL || '' }}
+      # #2213: per-tenant /buildinfo settle budget. A freshly-swapped tenant can
+      # keep serving the old image at the edge for a short drain window; the
+      # verify step polls each tenant up to this budget before declaring it stale.
+      PROD_AUTO_DEPLOY_VERIFY_BUDGET_SECONDS: ${{ vars.PROD_AUTO_DEPLOY_VERIFY_BUDGET_SECONDS || '240' }}
+      PROD_AUTO_DEPLOY_VERIFY_INTERVAL_SECONDS: ${{ vars.PROD_AUTO_DEPLOY_VERIFY_INTERVAL_SECONDS || '20' }}
    steps:
+      # The publish runner's default HOME (/home/hongming) is not writable, so
+      # git/docker credential saves fail (`Error saving credentials: mkdir
+      # /home/hongming: permission denied`) and halt the production rollout
+      # (#2193). Point HOME + DOCKER_CONFIG at the writable job temp dir —
+      # mirrors build-and-push's "Prepare writable Docker config" fix above.
+      - name: Prepare writable HOME + Docker config
+        run: |
+          set -euo pipefail
+          H="$RUNNER_TEMP/auto-deploy-home"
+          mkdir -p "$H/.docker"
+          echo "HOME=$H" >> "$GITHUB_ENV"
+          echo "DOCKER_CONFIG=$H/.docker" >> "$GITHUB_ENV"
+
      - name: Checkout
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2

@@ -297,8 +338,50 @@ jobs:
          set -euo pipefail
          python3 .gitea/scripts/prod-auto-deploy.py wait-ci

-      - name: Call production CP redeploy-fleet
+      # Superseded-job guard — BEFORE any production side effect (#2213).
+      #
+      # This workflow has no `concurrency:` (see header: Gitea 1.22.6 cancels
+      # queued prod deploys). So two close main pushes run BOTH deploy-production
+      # jobs. The verify step already skips its strict /buildinfo check when this
+      # job is superseded (#2194) — but that guard was AFTER the redeploy and the
+      # :latest promote, so an OLDER job that started late still:
+      #   1. rolled the whole fleet BACKWARD to its older tag (canary hongming
+      #      was reverted from the newer SHA — the #2213 red), then
+      #   2. promoted :latest backward to the older image,
+      # and only THEN skipped verify and exited green. A superseded job must do
+      # NEITHER. We re-check the branch head here, immediately before the rollout,
+      # and skip every side effect when a newer commit already owns main.
+      #
+      # exit 0 + non-empty stdout => superseded (newer head printed); the redeploy
+      # and promote steps are gated off via this output. exit 10 => this job is
+      # still the latest, proceed to roll the fleet. Fail-safe: a head that can't
+      # be read returns NOT-superseded (exit 10), so a genuine deploy is never
+      # silently skipped. (Re-checked again at verify time to catch a newer job
+      # that lands DURING this rollout.)
+      - name: Check superseded before production side effects
+        id: supersede
        if: ${{ steps.plan.outputs.enabled == 'true' }}
+        run: |
+          set -euo pipefail
+          set +e
+          NEWER_HEAD="$(python3 .gitea/scripts/prod-auto-deploy.py check-superseded)"
+          SUPERSEDED_EXIT=$?
+          set -e
+          if [ "$SUPERSEDED_EXIT" -eq 0 ] && [ -n "$NEWER_HEAD" ]; then
+            echo "superseded=true" >> "$GITHUB_OUTPUT"
+            echo "::notice::Superseded before rollout: main head is now ${NEWER_HEAD:0:7} (this job deploys ${GITHUB_SHA:0:7}). Skipping redeploy + :latest promote so an older job never rolls the fleet backward."
+            {
+              echo "## Production auto-deploy skipped — superseded before rollout"
+              echo ""
+              echo "This deploy job's SHA \`${GITHUB_SHA:0:7}\` is no longer the head of \`main\` (now \`${NEWER_HEAD:0:7}\`)."
+              echo "A newer deploy job owns the fleet; rolling it backward to this older build would revert tenants and \`:latest\`. No side effects performed."
+            } >> "$GITHUB_STEP_SUMMARY"
+          else
+            echo "superseded=false" >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: Call production CP redeploy-fleet
+        if: ${{ steps.plan.outputs.enabled == 'true' && steps.supersede.outputs.superseded != 'true' }}
        run: |
          set -euo pipefail
          python3 .gitea/scripts/prod-auto-deploy.py assert-enabled
@@ -357,18 +440,66 @@ jobs:
          fi

      - name: Verify reachable tenants report this SHA
-        if: ${{ steps.plan.outputs.enabled == 'true' }}
+        # Skip when superseded BEFORE rollout: the redeploy step did not run, so
+        # there is no redeploy-fleet response to verify against and the newer job
+        # owns verification (#2213). The in-step guard below still catches the
+        # case where a newer job lands DURING this job's rollout.
+        if: ${{ steps.plan.outputs.enabled == 'true' && steps.supersede.outputs.superseded != 'true' }}
        env:
          TENANT_DOMAIN: moleculesai.app
        run: |
          set -euo pipefail
          RESP="$RUNNER_TEMP/prod-redeploy-response.json"
+
+          # Superseded-job guard. This workflow has no `concurrency:` (header
+          # explains why: Gitea 1.22.6 cancels queued prod deploys). So two
+          # close main pushes run BOTH deploy-production jobs. The newer one
+          # rolls the fleet to its (newer) build first; this older job's strict
+          # equality check below would then see tenants on the NEWER SHA and
+          # false-red "$slug is stale" even though the fleet is AHEAD, not
+          # behind (git SHAs aren't ordered; /buildinfo exposes only git_sha).
+          #
+          # If main's current head is no longer THIS job's SHA, a newer commit
+          # has landed and this deploy is superseded — the newest job's verify
+          # is authoritative. Skip strict verify and succeed. exit 0 => newer
+          # head printed (superseded); exit 10 => still the latest, proceed to
+          # the strict verify so a genuinely-behind tenant still fails loudly.
+          set +e
+          NEWER_HEAD="$(python3 .gitea/scripts/prod-auto-deploy.py check-superseded)"
+          SUPERSEDED_EXIT=$?
+          set -e
+          if [ "$SUPERSEDED_EXIT" -eq 0 ] && [ -n "$NEWER_HEAD" ]; then
+            echo "::notice::Superseded deploy: main head is now ${NEWER_HEAD:0:7} (this job deployed ${GITHUB_SHA:0:7}). The fleet is at or ahead of this build; the newer deploy job's verify is authoritative. Skipping strict SHA verify."
+            {
+              echo ""
+              echo "### Buildinfo verification skipped — superseded deploy"
+              echo ""
+              echo "This deploy job's SHA \`${GITHUB_SHA:0:7}\` is no longer the head of \`main\` (now \`${NEWER_HEAD:0:7}\`)."
+              echo "A newer deploy job is rolling the fleet forward; its verify is authoritative."
+            } >> "$GITHUB_STEP_SUMMARY"
+            exit 0
+          fi
+
          mapfile -t SLUGS < <(jq -r '.results[]? | .slug' "$RESP")
          if [ ${#SLUGS[@]} -eq 0 ]; then
            echo "::error::No tenants returned from redeploy-fleet; refusing to mark production deploy verified."
            exit 1
          fi

+          # Per-tenant settle/retry budget (#2213). A tenant whose container the
+          # CP just swapped can keep serving the OLD image at the edge for a short
+          # window while the old container drains — /buildinfo returns HTTP 200
+          # with the previous SHA, which `curl --retry` does NOT retry (it only
+          # retries connection/5xx failures, not a stale-but-200 body). Without a
+          # settle window a still-rolling tenant false-reds "stale" on the very
+          # first poll. So poll each tenant's /buildinfo until it reports the
+          # target SHA or the budget is exhausted; only THEN declare it stale or
+          # unreachable. This never masks a genuinely stuck tenant — a tenant that
+          # never reaches the target within the budget still fails loud (and the
+          # superseded-job revert class is already blocked before rollout above).
+          SETTLE_BUDGET_SECONDS="${PROD_AUTO_DEPLOY_VERIFY_BUDGET_SECONDS:-240}"
+          SETTLE_INTERVAL_SECONDS="${PROD_AUTO_DEPLOY_VERIFY_INTERVAL_SECONDS:-20}"
+
          STALE_COUNT=0
          UNREACHABLE_COUNT=0
          UNHEALTHY_COUNT=0
@@ -380,18 +511,36 @@ jobs:
              continue
            fi
            url="https://${slug}.${TENANT_DOMAIN}/buildinfo"
-            body="$(curl -sS --max-time 30 --retry 3 --retry-delay 5 --retry-connrefused "$url" || true)"
-            actual="$(echo "$body" | jq -r '.git_sha // ""' 2>/dev/null || echo "")"
-            if [ -z "$actual" ]; then
-              echo "::error::$slug did not return /buildinfo after deploy."
-              UNREACHABLE_COUNT=$((UNREACHABLE_COUNT + 1))
-              continue
-            fi
-            if [ "$actual" != "$GITHUB_SHA" ]; then
-              echo "::error::$slug is stale: actual=${actual:0:7}, expected=${GITHUB_SHA:0:7}"
-              STALE_COUNT=$((STALE_COUNT + 1))
-            else
+            deadline=$(( $(date +%s) + SETTLE_BUDGET_SECONDS ))
+            actual=""
+            last_actual=""
+            on_target=false
+            while :; do
+              body="$(curl -sS --max-time 30 --retry 3 --retry-delay 5 --retry-connrefused "$url" || true)"
+              actual="$(echo "$body" | jq -r '.git_sha // ""' 2>/dev/null || echo "")"
+              [ -n "$actual" ] && last_actual="$actual"
+              if [ "$actual" = "$GITHUB_SHA" ]; then
+                on_target=true
+                break
+              fi
+              now=$(date +%s)
+              if [ "$now" -ge "$deadline" ]; then
+                break
+              fi
+              # Still rolling (stale 200) or transiently unreachable — wait and
+              # re-poll within the settle budget rather than failing on first read.
+              remaining=$(( deadline - now ))
+              echo "$slug: waiting for target SHA (have '${actual:0:7}', want ${GITHUB_SHA:0:7}; ${remaining}s left)"
+              sleep "$SETTLE_INTERVAL_SECONDS"
+            done
+            if [ "$on_target" = true ]; then
              echo "$slug: ${actual:0:7}"
+            elif [ -z "$last_actual" ]; then
+              echo "::error::$slug did not return /buildinfo after deploy (waited ${SETTLE_BUDGET_SECONDS}s)."
+              UNREACHABLE_COUNT=$((UNREACHABLE_COUNT + 1))
+            else
+              echo "::error::$slug is stale: actual=${last_actual:0:7}, expected=${GITHUB_SHA:0:7} (waited ${SETTLE_BUDGET_SECONDS}s)"
+              STALE_COUNT=$((STALE_COUNT + 1))
            fi
          done

@@ -409,3 +558,69 @@ jobs:
          if [ "$STALE_COUNT" -gt 0 ] || [ "$UNHEALTHY_COUNT" -gt 0 ] || [ "$UNREACHABLE_COUNT" -gt 0 ]; then
            exit 1
          fi
+
+      # Re-point :latest to the just-promoted image — ONLY after the
+      # production rollout + buildinfo verification above have passed.
+      #
+      # WHY HERE (promote point), not at build time:
+      #   The platform-tenant ECR `:latest` tag was last moved 2026-05-10
+      #   and went 3.5 weeks stale because the build step only pushes
+      #   :staging-<sha> + :staging-latest and never re-points :latest. A
+      #   no-arg POST /cp/admin/tenants/:slug/redeploy (whose default tag
+      #   fell through to "latest") then pulled the 3.5-week-old image and
+      #   REVERTED the tenant (incident: molecule-adk-demo, 2026-06-03).
+      #
+      #   The defense-in-depth half of this fix changes that redeploy
+      #   default to :staging-latest, but :latest itself must also be
+      #   kept meaningful. We make :latest track the PROD-BLESSED build,
+      #   not the raw build: by living at the end of deploy-production —
+      #   after `wait-ci` (green main CI), the canary-first batched fleet
+      #   rollout, AND the /buildinfo SHA verification — :latest only ever
+      #   advances to a SHA that is actually green and confirmed running
+      #   across the live fleet. So `:latest` == "current prod image",
+      #   and any consumer that pulls :latest (legacy callers, manual
+      #   `docker pull`, a redeploy that somehow still resolves "latest")
+      #   gets the blessed image instead of whatever happened to build.
+      #
+      #   Re-tag is digest-level (imagetools create), so no rebuild and
+      #   :latest is byte-identical to :staging-<sha> for this commit.
+      # Gate on supersede: a superseded older job must NOT move :latest backward
+      # to its older image (#2213 — 275383 promoted :latest → the older
+      # staging-7a72516 after a newer job had already shipped). :latest must only
+      # ever advance under the job that owns main's head.
+      - name: Promote :latest to the verified prod image
+        if: ${{ steps.plan.outputs.enabled == 'true' && steps.supersede.outputs.superseded != 'true' }}
+        env:
+          TENANT_IMAGE_NAME: ${{ env.TENANT_IMAGE_NAME }}
+          STAGING_TENANT_IMAGE_NAME: ${{ env.STAGING_TENANT_IMAGE_NAME }}
+          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          AWS_DEFAULT_REGION: us-east-2
+        run: |
+          set -euo pipefail
+          SHA_TAG="staging-${GITHUB_SHA::7}"
+          PROD_ECR_REGISTRY="${TENANT_IMAGE_NAME%%/*}"
+          STAGING_ECR_REGISTRY="${STAGING_TENANT_IMAGE_NAME%%/*}"
+
+          aws ecr get-login-password --region us-east-2 | \
+            docker login --username AWS --password-stdin "${PROD_ECR_REGISTRY}"
+          aws ecr get-login-password --region us-east-2 | \
+            docker login --username AWS --password-stdin "${STAGING_ECR_REGISTRY}"
+
+          # imagetools create copies the source manifest to the new tag by
+          # digest (no pull/rebuild). :latest now points at the exact image
+          # that just passed the prod gate.
+          docker buildx imagetools create \
+            --tag "${TENANT_IMAGE_NAME}:latest" \
+            "${TENANT_IMAGE_NAME}:${SHA_TAG}"
+          docker buildx imagetools create \
+            --tag "${STAGING_TENANT_IMAGE_NAME}:latest" \
+            "${STAGING_TENANT_IMAGE_NAME}:${SHA_TAG}"
+
+          {
+            echo ""
+            echo "### :latest promoted"
+            echo ""
+            echo "Re-pointed \`platform-tenant:latest\` → \`${SHA_TAG}\` (prod + staging ECR)."
+            echo ":latest now tracks the prod-blessed, fleet-verified image."
+          } >> "$GITHUB_STEP_SUMMARY"
@@ -33,11 +33,20 @@
 #                           2026-05-17 (internal#189 Phase 1).
 #
 # BURN-IN CLOSED 2026-05-17 (internal#189 Phase 1): The 7-day burn-in
-# window closed. continue-on-error: true has been removed from the
-# tier-check job; AND-composition is now fully enforced. If you need
-# to temporarily re-introduce a mask, file a tracker and follow the
-# mc#1982 protocol (Tier 2e lint requires a current tracker within
-# 2 lines of any continue-on-error: true).
+# window closed. As of 2026-06-04 the residual masks left behind by the
+# burn-in are removed for real (the comment previously claimed this while
+# the masks still persisted — that was stale):
+#   - continue-on-error: true on the jq-install step (redundant; the step
+#     already exits 0) and on the tier-check step (the burn-in mask).
+#   - the `|| true` after the sop-tier-check.sh invocation, which masked
+#     real tier-gate verdicts.
+# AND-composition is now fully enforced and the tier-check step can
+# honestly red CI on a real SOP-6 violation. SOP_FAIL_OPEN=1 is RETAINED
+# as sanctioned infra-resilience: it fails-open only on token/network/jq
+# faults, never on a real gate verdict. If you need to temporarily
+# re-introduce a mask, file a tracker and follow the mc#1982 protocol
+# (Tier 2e lint requires a current tracker within 2 lines of any
+# continue-on-error: true).

 name: sop-tier-check

@@ -90,10 +99,11 @@ jobs:
        # GitHub releases may be unreachable from some runner networks
        # (infra#241 follow-up: GitHub timeout after 3s on 5.78.80.188
        # runners). The sop-tier-check script has its own fallback as a
-        # third line of defense. continue-on-error: true ensures this step
-        # failing does not block the job.
-        # mc#1982: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
-        continue-on-error: true
+        # third line of defense, and this step's final command
+        # (`jq --version ... || echo`) already exits 0 unconditionally — so
+        # the step cannot fail the job on its own.
+        # continue-on-error REMOVED 2026-06-04 (mc#1982 directive: root-fix
+        # and remove, do not renew). It was redundant masking, not a gate.
        run: |
          # apt-get is the primary method — Ubuntu package mirrors are reliably
          # reachable from runner containers. GitHub releases may be blocked
@@ -110,11 +120,11 @@ jobs:
          jq --version 2>/dev/null || echo "::notice::jq not yet available — script fallback will retry"

      - name: Verify tier label + reviewer team membership
-        # continue-on-error: true at step level — job-level is ignored by Gitea
-        # Actions (quirk #10, internal runbooks). Belt-and-suspenders with
-        # SOP_FAIL_OPEN=1 + || true below.
-        # mc#1982: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
-        continue-on-error: true
+        # continue-on-error REMOVED 2026-06-04 (expired internal#189 Phase 1
+        # burn-in, window closed 2026-05-17; mc#1982 directive: root-fix and
+        # remove, do not renew). SOP_FAIL_OPEN=1 below still fails-open on
+        # token/network/infra errors only (never on a real tier-gate verdict),
+        # so this step can now honestly fail CI on a genuine SOP-6 violation.
        env:
          GITEA_TOKEN: ${{ secrets.SOP_TIER_CHECK_TOKEN || secrets.GITHUB_TOKEN }}
          GITEA_HOST: git.moleculesai.app
@@ -123,9 +133,13 @@ jobs:
          PR_AUTHOR: ${{ github.event.pull_request.user.login }}
          SOP_DEBUG: '0'
          SOP_LEGACY_CHECK: '0'
-          # SOP_FAIL_OPEN=1 makes the script always exit 0. The UI enforces
-          # the actual merge gate. Combined with continue-on-error: true
-          # above, this step never fails the job regardless of script exit.
+          # SOP_FAIL_OPEN=1 fails-open ONLY on infra faults (empty/invalid
+          # token, unreachable Gitea API, missing jq) — see the guarded
+          # `exit 0` branches in sop-tier-check.sh. It does NOT mask a real
+          # tier-gate verdict: a missing tier label, no approving review, or
+          # an unsatisfied AND-clause still `exit 1`. Kept as sanctioned
+          # infra-resilience; the `|| true` mask was REMOVED with the burn-in
+          # COE (2026-06-04) so a genuine SOP-6 violation now reds CI.
          SOP_FAIL_OPEN: '1'
        run: |
-          bash .gitea/scripts/sop-tier-check.sh || true
+          bash .gitea/scripts/sop-tier-check.sh
@@ -60,6 +60,7 @@ concurrency:
  cancel-in-progress: true

 jobs:
+  # bp-required: pending #718 — soak-then-promote, not in BP yet.
  compare:
    name: Compare synced providers.yaml against controlplane canonical
    runs-on: ubuntu-latest
@@ -67,6 +67,7 @@ concurrency:
  cancel-in-progress: true

 jobs:
+  # bp-required: pending #718 — soak-then-promote, not in BP yet.
  verify:
    name: Regenerate providers artifact and fail on drift
    runs-on: ubuntu-latest
@@ -24,6 +24,17 @@ COPY --from=builder /app/public ./public
 EXPOSE 3000
 ENV PORT=3000
 ENV HOSTNAME="0.0.0.0"
+# Git SHA the image was built from, surfaced at /api/buildinfo so canvas
+# deploys are verifiable by the served SHA the same way workspace-server's
+# /buildinfo is (core#2235). Wired from `${{ github.sha }}` in
+# publish-canvas-image.yml. Server-only (not NEXT_PUBLIC_) — the route
+# handler reads it at runtime on the standalone Node server, so it stays
+# out of the client bundle. Set on the final stage (not the builder) so it
+# lives in the runtime env that force-dynamic reads per request. Default
+# "dev" matches the route + workspace-server sentinel: an unwired build
+# fails the SHA comparison closed instead of looking deployed.
+ARG BUILD_SHA=dev
+ENV BUILD_SHA=$BUILD_SHA
 # Non-root runtime — use addgroup/adduser without fixed GID/UID to avoid conflicts with base image
 RUN addgroup canvas 2>/dev/null || true && adduser -G canvas -s /bin/sh -D canvas 2>/dev/null || true
 USER canvas
@@ -101,10 +101,19 @@ test.describe("Desktop ChatTab", () => {
    await textarea.fill("Trigger activity");
    await page.getByRole("button", { name: /Send/ }).first().click();

-    // Activity log container should appear during the send flow.
-    await expect(page.locator("[data-testid='activity-log']").first()).toBeVisible({ timeout: 10_000 }).catch(() => {
-      // Activity log may not be present in all layouts.
-    });
+    // FALSE-GREEN FIX: the prior `.catch(() => {})` swallowed the assertion
+    // entirely, so this test passed whether or not the activity log ever
+    // rendered. The activity-log container is optional per layout, so we
+    // gate on its presence in the DOM: if it's not part of this layout,
+    // skip explicitly (a recorded skip, not a silent pass); if it IS
+    // present, it MUST become visible during the send flow — that's the
+    // behaviour this test exists to protect.
+    const activityLog = page.locator("[data-testid='activity-log']").first();
+    if ((await activityLog.count()) === 0) {
+      test.skip(true, "activity-log not part of this layout");
+      return;
+    }
+    await expect(activityLog).toBeVisible({ timeout: 10_000 });
  });
 });

@@ -60,11 +60,26 @@ test.describe("MobileChat", () => {

    await expect(page.getByText("Echo: Mobile persistence")).toBeVisible({ timeout: 15_000 });

+    // Reload and deterministically wait for the chat-history GET that
+    // rehydrates the transcript to come back 2xx, rather than racing a
+    // fixed-timeout render assertion against an in-flight fetch. The
+    // server now persists the a2a_receive row SYNCHRONOUSLY before the
+    // send's 200 (workspace-server logA2ASuccess), so the row is
+    // guaranteed present by the time this GET runs — the wait is for
+    // hydration latency, not for a still-racing write.
+    const historyResponse = page.waitForResponse(
+      (resp) =>
+        resp.url().includes("/chat-history") &&
+        resp.request().method() === "GET" &&
+        resp.status() === 200,
+      { timeout: 15_000 },
+    );
    await page.reload();
    await page.waitForSelector("[data-testid='chat-panel']", { timeout: 10_000 });
+    await historyResponse;

-    await expect(page.getByText("Mobile persistence", { exact: true })).toBeVisible({ timeout: 5_000 });
-    await expect(page.getByText("Echo: Mobile persistence")).toBeVisible({ timeout: 5_000 });
+    await expect(page.getByText("Mobile persistence", { exact: true })).toBeVisible();
+    await expect(page.getByText("Echo: Mobile persistence")).toBeVisible();
  });

  test("composer auto-grows with multi-line text", async ({ page }) => {
@@ -0,0 +1,329 @@
+/**
+ * Staging canvas E2E — REAL desktop take-control path (core#2261 "Gap 1").
+ *
+ * This is the live-e2e gate that the existing staging-tabs.spec.ts does NOT
+ * provide. staging-tabs only opens the 13 declared workspace-panel tabs
+ * (TAB_IDS at staging-tabs.spec.ts:24-38 — `display` is NOT among them) and
+ * asserts they render without a "Failed to load" toast. It never acquires
+ * display control, never opens the noVNC WebSocket, and never asserts a
+ * framebuffer frame arrives. The companion unit test
+ * canvas/src/components/tabs/__tests__/DisplayTab.test.tsx mocks the RFB
+ * constructor (vi.mock("@novnc/novnc"), see its lines 8/20-39) so NO real
+ * WebSocket is ever opened there either. Result: a broken take-control path
+ * (acquire → noVNC WS upgrade → ws-proxy → EIC → websockify → x11vnc → Xvfb)
+ * ships GREEN. This spec closes that gap by exercising the REAL wire path
+ * end to end against a live, desktop-capable staging workspace.
+ *
+ * What it asserts (the real path, no mocks):
+ *   1. POST /workspaces/<id>/display/control/acquire returns 200 with a
+ *      session_url that carries the signed token in its `#token=` fragment
+ *      (mirrors workspace_display_control.go:signedDisplaySessionURL).
+ *   2. Opening the noVNC WebSocket at session_url with the subprotocols
+ *      ["binary", "molecule-display-token.<token>"] (exactly what the canvas
+ *      sends — DisplayTab.tsx:339) UPGRADES (onopen fires, readyState===OPEN,
+ *      no immediate 1006 abnormal close). A 1006 / 403 means the handshake
+ *      failed somewhere in the proxy chain.
+ *   3. At least one BINARY framebuffer message arrives on that socket — a
+ *      real frame off x11vnc, not just a panel mount. RFB sends a
+ *      ProtocolVersion banner ("RFB 003.00x\n") as the first server message,
+ *      which proves the upstream VNC server is live behind the EIC tunnel.
+ *
+ * Auth model (important): the WS upgrade is gated by workspace-server
+ * middleware.AdminAuth. A browser WebSocket CANNOT set an Authorization
+ * header, so in production the canvas WS upgrade passes AdminAuth via the
+ * same-origin-canvas path (wsauth_middleware.go:isSameOriginCanvas, which
+ * keys off the Origin header the browser sets automatically on a same-origin
+ * WS upgrade). We therefore open the socket from inside the browser page via
+ * page.evaluate AFTER navigating to the tenant origin — so the browser sends
+ * `Origin: https://<slug>.staging.moleculesai.app`, exactly as production
+ * does. The acquire POST (which CAN carry a header) uses the per-tenant admin
+ * bearer set on the context. This is the faithful production handshake, not a
+ * synthetic one.
+ *
+ * Gate / cost: this test only runs when STAGING_DISPLAY_WORKSPACE_ID points
+ * at a STANDING desktop-capable workspace (compute.display.mode ==
+ * "desktop-control"). We deliberately do NOT provision one in the shared
+ * staging-setup.ts: a desktop AMI boots in ~12-15 min and would tax the
+ * existing tabs harness on every run. Standing that workspace up is a cost
+ * item for the CTO (one always-on desktop EC2 on staging). Until that exists,
+ * the test SKIPS loud. When the env IS present, any failure in
+ * provision/acquire/upgrade is a HARD error — fail-closed, never silently
+ * green (no "flaky" disposition: a 1006 names a broken proxy hop).
+ */
+
+import { test, expect } from "@playwright/test";
+
+const STAGING = process.env.CANVAS_E2E_STAGING === "1";
+
+// The standing desktop-capable workspace id. Absent => skip loud. This is
+// the single knob that activates the gate; see file header for the cost note.
+const DISPLAY_WS_ID = process.env.STAGING_DISPLAY_WORKSPACE_ID;
+
+test.skip(!STAGING, "CANVAS_E2E_STAGING not set — skipping staging-only tests");
+test.skip(
+  !DISPLAY_WS_ID,
+  "STAGING_DISPLAY_WORKSPACE_ID not set — no standing desktop-capable staging " +
+    "workspace to exercise the take-control path. Set it to a workspace whose " +
+    "compute.display.mode == 'desktop-control' to activate this real-e2e gate. " +
+    "(Standing that workspace up is a CTO cost item — one always-on desktop EC2.)",
+);
+
+// How long we wait for the WS to upgrade + deliver the first frame. The EIC
+// tunnel + websockify handshake adds real latency on top of the edge; budget
+// generously but bounded, so a genuinely-dead path fails LOUD instead of
+// hanging to the suite timeout.
+const WS_UPGRADE_TIMEOUT_MS = 30_000;
+const FIRST_FRAME_TIMEOUT_MS = 30_000;
+
+test.describe("staging desktop take-control (real noVNC path)", () => {
+  test("acquire → WS upgrades → first framebuffer frame arrives", async ({
+    page,
+    context,
+  }) => {
+    // The standing desktop workspace lives in its OWN standing org (it can't
+    // live in the per-run ephemeral org — that gets torn down each run). When
+    // STAGING_DISPLAY_SLUG is configured, staging-setup.ts resolves that org's
+    // tenant URL / admin token / org id and exports them under STAGING_DISPLAY_*.
+    // Fall back to the ephemeral org's exports only if the display org wasn't
+    // separately configured (e.g. the desktop workspace happens to live in the
+    // run's own tenant — not the expected topology, but supported).
+    const tenantURL =
+      process.env.STAGING_DISPLAY_TENANT_URL || process.env.STAGING_TENANT_URL;
+    const tenantToken =
+      process.env.STAGING_DISPLAY_TENANT_TOKEN || process.env.STAGING_TENANT_TOKEN;
+    const orgID =
+      process.env.STAGING_DISPLAY_ORG_ID || process.env.STAGING_ORG_ID;
+
+    // Fail-closed: when the gate env IS present (we got past the skips above),
+    // the rest of the staging context MUST be wired or this is a hard error,
+    // never a silent pass. Mirrors staging-tabs.spec.ts:53-57.
+    if (!tenantURL || !tenantToken) {
+      throw new Error(
+        "STAGING_DISPLAY_WORKSPACE_ID is set but no tenant URL/token is available " +
+          "for the take-control gate. Set STAGING_DISPLAY_SLUG so staging-setup.ts " +
+          "resolves STAGING_DISPLAY_TENANT_URL / STAGING_DISPLAY_TENANT_TOKEN for the " +
+          "standing desktop org (or ensure the ephemeral STAGING_TENANT_* exports exist).",
+      );
+    }
+
+    const workspaceId = DISPLAY_WS_ID as string;
+
+    // The per-tenant admin bearer satisfies AdminAuth for the acquire POST
+    // (which can carry a header). The WS upgrade below relies on Origin
+    // (same-origin canvas), NOT this header.
+    await context.setExtraHTTPHeaders({
+      Authorization: `Bearer ${tenantToken}`,
+      // X-Molecule-Org-Id is required by workspace-server TenantGuard for
+      // cross-org requests routed through the CP edge; staging-setup exports it.
+      // Harmless (and correct) to send on the same-origin tenant box too.
+      ...(orgID ? { "X-Molecule-Org-Id": orgID } : {}),
+    });
+
+    // 0. Sanity: the workspace must actually be display-enabled, else the
+    //    whole gate is meaningless. Hit the availability endpoint first so a
+    //    mis-pointed STAGING_DISPLAY_WORKSPACE_ID fails with a precise message
+    //    instead of an opaque acquire error.
+    const availResp = await page.request.get(
+      `${tenantURL}/workspaces/${workspaceId}/display`,
+    );
+    expect(
+      availResp.status(),
+      `GET /display for ${workspaceId} should be 200`,
+    ).toBe(200);
+    const avail = await availResp.json();
+    expect(
+      avail.available,
+      `workspace ${workspaceId} is not display-available (reason=${avail.reason}). ` +
+        "STAGING_DISPLAY_WORKSPACE_ID must point at a workspace with " +
+        "compute.display.mode == 'desktop-control' AND a live instance_id.",
+    ).toBe(true);
+
+    // 1. Acquire display control. The handler returns session_url +
+    //    expires_at; session_url embeds the signed token in its #token=
+    //    fragment (workspace_display_control.go:signedDisplaySessionURL).
+    const acquireResp = await page.request.post(
+      `${tenantURL}/workspaces/${workspaceId}/display/control/acquire`,
+      { data: { controller: "user", ttl_seconds: 300 } },
+    );
+    expect(
+      acquireResp.status(),
+      `acquire should be 200; body: ${await acquireResp.text()}`,
+    ).toBe(200);
+    const acquire = await acquireResp.json();
+    expect(acquire.controller, "controller should be 'user'").toBe("user");
+    expect(
+      typeof acquire.session_url,
+      `acquire response missing session_url: ${JSON.stringify(acquire)}`,
+    ).toBe("string");
+
+    // The token rides in the URL fragment (#token=...), never as a query
+    // param — confirm the contract the client (DisplayTab.tsx:459-466)
+    // depends on so a server-side change to the URL shape fails HERE.
+    const sessionUrl: string = acquire.session_url;
+    expect(
+      sessionUrl,
+      `session_url should carry the token in a #token= fragment: ${sessionUrl}`,
+    ).toContain("#token=");
+
+    // 2. Open the REAL noVNC WebSocket from inside the page, so the browser
+    //    sends Origin: <tenant> and the same-origin-canvas AdminAuth path
+    //    accepts the upgrade (a browser WS can't set Authorization). We
+    //    navigate to the tenant origin first purely to anchor the Origin
+    //    header; we don't need the canvas bundle to hydrate.
+    await page.goto(tenantURL, { waitUntil: "domcontentloaded" });
+
+    // Reproduce DisplayTab.tsx:459-466 (displayWebSocketConnection): resolve
+    // session_url against the tenant origin, pull the token out of the
+    // fragment, strip the fragment, switch http(s)->ws(s). Then connect with
+    // the exact subprotocols the canvas uses (DisplayTab.tsx:339).
+    const result = await page.evaluate(
+      async ({ rawSessionUrl, upgradeTimeoutMs, frameTimeoutMs }) => {
+        const u = new URL(rawSessionUrl, window.location.href);
+        const token =
+          new URLSearchParams(u.hash.replace(/^#/, "")).get("token") ?? "";
+        if (!token) {
+          return { ok: false, stage: "token-parse", detail: "no #token in session_url" };
+        }
+        u.hash = "";
+        u.protocol = window.location.protocol === "https:" ? "wss:" : "ws:";
+        const wsUrl = u.toString();
+
+        return await new Promise<{
+          ok: boolean;
+          stage: string;
+          detail: string;
+          frameBytes?: number;
+          frameKind?: string;
+          closeCode?: number;
+        }>((resolve) => {
+          let upgraded = false;
+          let settled = false;
+          const finish = (r: {
+            ok: boolean;
+            stage: string;
+            detail: string;
+            frameBytes?: number;
+            frameKind?: string;
+            closeCode?: number;
+          }) => {
+            if (settled) return;
+            settled = true;
+            try {
+              ws.close();
+            } catch {
+              /* ignore */
+            }
+            resolve(r);
+          };
+
+          let ws: WebSocket;
+          try {
+            ws = new WebSocket(wsUrl, [`binary`, `molecule-display-token.${token}`]);
+          } catch (e) {
+            resolve({ ok: false, stage: "construct", detail: String(e) });
+            return;
+          }
+          ws.binaryType = "arraybuffer";
+
+          const upgradeTimer = setTimeout(() => {
+            finish({
+              ok: false,
+              stage: "upgrade-timeout",
+              detail: `WS did not open within ${upgradeTimeoutMs}ms (readyState=${ws.readyState})`,
+            });
+          }, upgradeTimeoutMs);
+
+          let frameTimer: ReturnType<typeof setTimeout> | null = null;
+
+          ws.onopen = () => {
+            upgraded = true;
+            clearTimeout(upgradeTimer);
+            // Now wait for the first server message. RFB's ProtocolVersion
+            // banner is the first thing x11vnc sends; if nothing arrives the
+            // tunnel opened but the VNC server behind it is dead.
+            frameTimer = setTimeout(() => {
+              finish({
+                ok: false,
+                stage: "frame-timeout",
+                detail: `WS upgraded but no framebuffer message within ${frameTimeoutMs}ms`,
+              });
+            }, frameTimeoutMs);
+          };
+
+          ws.onmessage = (ev) => {
+            if (frameTimer) clearTimeout(frameTimer);
+            let bytes = 0;
+            let kind: string = typeof ev.data;
+            if (ev.data instanceof ArrayBuffer) {
+              bytes = ev.data.byteLength;
+              kind = "ArrayBuffer";
+            } else if (typeof Blob !== "undefined" && ev.data instanceof Blob) {
+              bytes = ev.data.size;
+              kind = "Blob";
+            } else if (typeof ev.data === "string") {
+              bytes = ev.data.length;
+              kind = "string";
+            }
+            finish({
+              ok: bytes > 0,
+              stage: "frame",
+              detail:
+                bytes > 0
+                  ? "received framebuffer message"
+                  : "first message was empty",
+              frameBytes: bytes,
+              frameKind: kind,
+            });
+          };
+
+          ws.onclose = (ev) => {
+            // A close BEFORE open === failed upgrade (1006 abnormal / 403
+            // forbidden surface here). A close AFTER we already saw a frame is
+            // benign (our own finish() triggered it).
+            if (!upgraded) {
+              clearTimeout(upgradeTimer);
+              finish({
+                ok: false,
+                stage: "upgrade-close",
+                detail: `WS closed before upgrade (code=${ev.code}, reason="${ev.reason}") — handshake rejected somewhere in edge → ws-proxy → EIC → websockify → x11vnc`,
+                closeCode: ev.code,
+              });
+            }
+          };
+
+          ws.onerror = () => {
+            if (!upgraded) {
+              clearTimeout(upgradeTimer);
+              finish({
+                ok: false,
+                stage: "upgrade-error",
+                detail: "WS error before upgrade — proxy chain rejected the handshake",
+              });
+            }
+          };
+        });
+      },
+      {
+        rawSessionUrl: sessionUrl,
+        upgradeTimeoutMs: WS_UPGRADE_TIMEOUT_MS,
+        frameTimeoutMs: FIRST_FRAME_TIMEOUT_MS,
+      },
+    );
+
+    // 3. Assert the real outcome. No "flaky" escape hatch: each failure stage
+    //    names the broken hop so a reviewer can act on it directly.
+    expect(
+      result.ok,
+      `take-control failed at stage="${result.stage}": ${result.detail}` +
+        (result.closeCode ? ` (close code ${result.closeCode})` : ""),
+    ).toBe(true);
+    expect(
+      result.stage,
+      `expected to reach the 'frame' stage; got '${result.stage}' (${result.detail})`,
+    ).toBe("frame");
+    expect(
+      result.frameBytes ?? 0,
+      `framebuffer message should be non-empty (kind=${result.frameKind})`,
+    ).toBeGreaterThan(0);
+  });
+});
@@ -241,7 +241,14 @@ export default async function globalSetup(_config: FullConfig): Promise<void> {
      name: "E2E Canvas Test",
      runtime: "hermes",
      tier: 2,
-      model: "gpt-4o",
+      // Provider-registry SSOT (internal#718) registers ONLY Kimi models for
+      // the hermes runtime — `moonshot/kimi-k2.6` is the platform-managed
+      // entry (workspace-server/internal/providers/providers.yaml, hermes ->
+      // platform). The old `gpt-4o` was never a registered hermes model and
+      // now 422s UNREGISTERED_MODEL_FOR_RUNTIME (core#2225). This workspace
+      // defaults closed to platform_managed (see the boot-shape note below),
+      // so a platform-namespaced model id is the registry-correct choice.
+      model: "moonshot/kimi-k2.6",
    }),
  });
  if (ws.status >= 400 || !ws.body?.id) {
@@ -250,7 +257,38 @@ export default async function globalSetup(_config: FullConfig): Promise<void> {
  const workspaceId = ws.body.id as string;
  console.log(`[staging-setup] Workspace created: ${workspaceId}`);

-  // 6. Wait for workspace online
+  // 6. Wait for workspace RENDERABLE.
+  //
+  // This harness exists to verify the canvas *tab UI* renders (staging-
+  // tabs.spec.ts: open each of the 13 workspace-panel tabs, assert no hard
+  // crash / no "Failed to load" toast). It does NOT exercise the agent —
+  // no LLM call is made, the spec even mocks /cp/auth/me and 401→200. All
+  // it needs is a workspace ROW that the canvas lists so the node renders
+  // and the side-panel tabs open. A fully-`online` agent is NOT required.
+  //
+  // That distinction became load-bearing on 2026-06-03: workspace-server
+  // #2162 (fix(provision): platform-managed workspace must fail-closed when
+  // CP proxy env absent) made a platform_managed workspace ABORT AT BOOT
+  // with MISSING_PLATFORM_PROXY when MOLECULE_LLM_BASE_URL /
+  // MOLECULE_LLM_USAGE_TOKEN are not present in the tenant's env. The
+  // canvas E2E creates a bare hermes/moonshot platform workspace, which defaults
+  // closed to platform_managed (workspace_provision.go:~1009), and the
+  // staging tenant does not carry the CP proxy env — so the agent never
+  // starts. Pre-#2162 this same workspace booted credential-less (the bug
+  // #2162 fixed) and the tabs rendered fine; #2162 is a correct production
+  // safety fix, but it surfaced here as `status:"failed", uptime_seconds:0,
+  // last_sample_error:null` — the pre-start credential-abort shape — and the
+  // old hard-throw turned a UI-irrelevant boot skip into a main-red
+  // (core#2199). The agent boot stage is simply not what this test gates.
+  //
+  // So: online is the happy path. A `failed` row that is the PRE-START
+  // credential-abort shape (the agent process never ran: uptime_seconds==0
+  // AND no last_sample_error) is treated as RENDERABLE — the row exists,
+  // the node + tabs render, proceed. We do NOT mask a real boot regression:
+  // any `failed` carrying a last_sample_error, OR a non-zero uptime (the
+  // agent started then crashed — image pull, panic, PYTHONPATH, etc.),
+  // still hard-throws. Genuine *infra* provision failure is already caught
+  // loud one step earlier at the org level (instance_status === "failed").
  await waitFor<boolean>(
    async () => {
      const r = await jsonFetch(`${tenantURL}/workspaces/${workspaceId}`, {
@@ -259,6 +297,24 @@ export default async function globalSetup(_config: FullConfig): Promise<void> {
      if (r.status !== 200) return null;
      if (r.body?.status === "online") return true;
      if (r.body?.status === "failed") {
+        const uptime = Number(r.body?.uptime_seconds ?? 0);
+        const sampleErr = r.body?.last_sample_error;
+        const preStartCredentialAbort = uptime === 0 && !sampleErr;
+        if (preStartCredentialAbort) {
+          // Agent never started (no LLM cred on this staging tenant — the
+          // expected #2162 platform-proxy gap). The workspace row still
+          // renders, which is all the tab-UI test needs. Proceed, but log
+          // loudly so a real "agent never booted because of something else"
+          // is not silently normalized.
+          console.warn(
+            `[staging-setup] workspace ${workspaceId} is 'failed' with the pre-start ` +
+              `credential-abort shape (uptime_seconds=0, no last_sample_error) — agent did ` +
+              `not boot (expected on staging without CP LLM proxy env, post workspace-server ` +
+              `#2162). The tab-UI test does not exercise the agent; proceeding with the ` +
+              `workspace row, which renders regardless. full body: ${JSON.stringify(r.body)}`,
+          );
+          return true;
+        }
        // last_sample_error is often empty when the failure happens before
        // the agent emits a sample (e.g. boot crash, image pull error,
        // missing PYTHONPATH, OpenAI quota at startup). Dumping the full
@@ -266,8 +322,8 @@ export default async function globalSetup(_config: FullConfig): Promise<void> {
        // needs without a second probe. Otherwise this propagates as a
        // bare "Workspace failed: " — the exact useless message that
        // sent #2632 to the issue tracker.
-        const detail = r.body.last_sample_error
-          ? r.body.last_sample_error
+        const detail = sampleErr
+          ? sampleErr
          : `(no last_sample_error) full body: ${JSON.stringify(r.body)}`;
        throw new Error(`Workspace failed: ${detail}`);
      }
@@ -277,17 +333,103 @@ export default async function globalSetup(_config: FullConfig): Promise<void> {
    10_000,
    "workspace online",
  );
-  console.log(`[staging-setup] Workspace online`);
+  console.log(`[staging-setup] Workspace renderable`);

  // 7. Hand state off to tests + teardown — overwrite the slug-only
  // bootstrap state with the full state spec tests need.
-  writeFileSync(
-    stateFile,
-    JSON.stringify({ slug, tenantURL, workspaceId, tenantToken }, null, 2),
-  );
+  //
+  // FAIL-CLOSED handoff: every field the spec reads must be non-empty. If
+  // any is missing here, the spec's env-presence guard would throw with a
+  // generic "did setup run?" message that hides WHICH field was lost. Catch
+  // it at the source — a partial provision must hard-fail setup, never hand
+  // off a half-built state that the spec then has to diagnose (or worse,
+  // skip). This is the loud, fail-closed contract: STAGING was requested,
+  // so an incomplete provision is an error, not a skip.
+  const handoff = { slug, tenantURL, workspaceId, tenantToken };
+  const missingFields = Object.entries(handoff)
+    .filter(([, v]) => !v)
+    .map(([k]) => k);
+  if (missingFields.length > 0) {
+    throw new Error(
+      `[staging-setup] provision incomplete — empty handoff field(s): ` +
+        `${missingFields.join(", ")}. Refusing to hand off a partial state ` +
+        `that would surface downstream as an opaque spec failure.`,
+    );
+  }
+  writeFileSync(stateFile, JSON.stringify(handoff, null, 2));
  process.env.STAGING_SLUG = slug;
  process.env.STAGING_TENANT_URL = tenantURL;
  process.env.STAGING_WORKSPACE_ID = workspaceId;
  process.env.STAGING_TENANT_TOKEN = tenantToken;
+  // The ephemeral org's UUID — exported so specs that route through the CP
+  // edge can send X-Molecule-Org-Id (workspace-server TenantGuard). The tabs
+  // harness hits the tenant box same-origin and doesn't need it, but the
+  // take-control gate (staging-display.spec.ts) does.
+  process.env.STAGING_ORG_ID = orgID;
  console.log(`[staging-setup] Ready — ${stateFile}`);
+
+  // 8. (core#2261 Gap 1) Resolve the STANDING desktop-capable org, if one is
+  // configured, for the live take-control e2e (staging-display.spec.ts).
+  //
+  // This block is FULLY env-gated and additive: it provisions NOTHING and is
+  // a no-op unless STAGING_DISPLAY_SLUG is set. We deliberately do NOT spin a
+  // desktop workspace inside this shared setup — a desktop AMI boots in
+  // ~12-15 min and would tax every tabs run. Instead an operator stands up
+  // one always-on desktop org once (a CTO cost item) and points
+  // STAGING_DISPLAY_SLUG + STAGING_DISPLAY_WORKSPACE_ID at it. Here we just
+  // resolve that standing org's tenant URL, admin token, and org id so the
+  // display spec can reach it. Fail-closed: if STAGING_DISPLAY_SLUG is set but
+  // we can't resolve its token/id, we THROW — the gate must never silently
+  // fall back to the (non-desktop) ephemeral org and pass.
+  const displaySlug = process.env.STAGING_DISPLAY_SLUG;
+  if (displaySlug) {
+    console.log(`[staging-setup] Resolving standing desktop org: ${displaySlug}`);
+
+    // org id for the standing slug (admin-orgs row carries it + status).
+    const orgsRes = await jsonFetch(`${CP_URL}/cp/admin/orgs`, { headers: adminAuth });
+    if (orgsRes.status !== 200) {
+      throw new Error(
+        `STAGING_DISPLAY_SLUG=${displaySlug} set, but GET /cp/admin/orgs returned ` +
+          `${orgsRes.status} — cannot resolve the standing desktop org for the ` +
+          `take-control gate.`,
+      );
+    }
+    const displayRow = (orgsRes.body?.orgs || []).find(
+      (o: any) => o.slug === displaySlug,
+    );
+    if (!displayRow?.id) {
+      throw new Error(
+        `STAGING_DISPLAY_SLUG=${displaySlug} not found in /cp/admin/orgs — the ` +
+          `standing desktop org for the take-control gate does not exist. Provision ` +
+          `it (one always-on desktop EC2) or unset STAGING_DISPLAY_SLUG/` +
+          `STAGING_DISPLAY_WORKSPACE_ID to skip the gate.`,
+      );
+    }
+    if (displayRow.instance_status !== "running") {
+      throw new Error(
+        `Standing desktop org ${displaySlug} is '${displayRow.instance_status}', ` +
+          `not 'running' — the take-control gate needs a live desktop tenant. ` +
+          `full row: ${JSON.stringify(displayRow)}`,
+      );
+    }
+
+    const displayTokRes = await jsonFetch(
+      `${CP_URL}/cp/admin/orgs/${displaySlug}/admin-token`,
+      { headers: adminAuth },
+    );
+    if (displayTokRes.status !== 200 || !displayTokRes.body?.admin_token) {
+      throw new Error(
+        `admin-token fetch for standing desktop org ${displaySlug} returned ` +
+          `${displayTokRes.status}: ${JSON.stringify(displayTokRes.body)}`,
+      );
+    }
+
+    process.env.STAGING_DISPLAY_ORG_ID = displayRow.id;
+    process.env.STAGING_DISPLAY_TENANT_URL = `https://${displaySlug}.${TENANT_DOMAIN}`;
+    process.env.STAGING_DISPLAY_TENANT_TOKEN = displayTokRes.body.admin_token;
+    console.log(
+      `[staging-setup] Standing desktop org resolved: ${displaySlug} ` +
+        `(org_id=${displayRow.id}, url=${process.env.STAGING_DISPLAY_TENANT_URL})`,
+    );
+  }
 }
@@ -1,7 +1,8 @@
 /**
- * Staging canvas E2E — opens each of the 13 workspace-panel tabs against a
- * fresh staging org provisioned in the global setup. Asserts each tab
- * renders without throwing and captures a screenshot for visual review.
+ * Staging canvas E2E — opens each workspace-panel tab against a fresh
+ * staging org provisioned in the global setup. Asserts each tab renders
+ * REAL content (not an empty container, not an error state) and captures a
+ * screenshot for visual review.
 *
 * Auth model: the tenant platform's AdminAuth middleware accepts a bearer
 * token OR a WorkOS session cookie. Playwright can't mint a WorkOS
@@ -10,17 +11,39 @@
 * Bearer header via context.setExtraHTTPHeaders(). Every browser
 * request inherits the header.
 *
- * Known SaaS gaps — documented in #1369 and allowed to render errored
- * content without failing the test (the gate is "no hard crash, no
- * 'Failed to load' toast"):
+ * PROMOTION-READINESS (see § at bottom of file): this suite is being
+ * hardened toward becoming a HARD merge-gate. It currently runs under
+ * `continue-on-error: true` (RFC internal#219 §1, non-gating) — that is a
+ * deliberate, CTO-owned call and is NOT changed here. The hardening makes
+ * every assertion deterministic so that WHEN promotion happens the gate
+ * does not flap. See the PROMOTION-READINESS block at the foot of this
+ * file for what is now reliable and what still blocks promotion.
+ *
+ * Known SaaS gaps — documented in #1369. These tabs legitimately cannot
+ * load real content in SaaS mode and are allowed an in-panel empty/error
+ * state (NOT a hard crash, NOT an ErrorBoundary):
 *   - Files tab: empty (platform can't docker exec into a remote EC2)
 *   - Terminal tab: WS connect fails
 *   - Peers tab: 401 without workspace-scoped token
+ * These are enumerated in KNOWN_DEGRADED_TABS below and asserted with a
+ * weaker (but still non-trivial) contract: the panel renders and does not
+ * crash the app. Every OTHER tab must render real content.
 */

-import { test, expect } from "@playwright/test";
+import { test, expect, type Page } from "@playwright/test";

 // Tab ids as declared in canvas/src/components/SidePanel.tsx TABS.
+//
+// NOTE (drift guard): this list is asserted-complete against the live DOM
+// below (see "tab list parity" step) so it cannot silently drift out of
+// sync with SidePanel.tsx TABS the way a hand-maintained constant does.
+// `display` and `container-config` are intentionally EXCLUDED here:
+//   - `display` is owned by the in-flight take-control e2e (PR #2275 /
+//     staging-display.spec.ts); asserting it here would collide.
+//   - `container-config` only renders when selectedNodeId is set AND is
+//     gated on tier; it is covered by container-config-specific specs.
+// The parity check accounts for these via EXPECTED_EXTRA_TABS so a NEW
+// tab appearing in SidePanel still trips the guard.
 const TAB_IDS = [
  "chat",
  "activity",
@@ -37,12 +60,131 @@ const TAB_IDS = [
  "audit",
 ] as const;

+// Tabs present in the DOM that this spec intentionally does not drive.
+// Keeping this explicit means a genuinely-new tab (not one of these) makes
+// the parity assertion fail LOUD instead of being silently un-tested.
+const EXPECTED_EXTRA_TABS = ["display", "container-config"] as const;
+
+// Tabs that are KNOWN to degrade in SaaS mode (#1369). They get the weaker
+// "renders + no crash" contract instead of the "real content" contract.
+// Anything NOT in this set must render real content or the test fails.
+const KNOWN_DEGRADED_TABS = new Set<string>(["terminal", "files"]);
+
 const STAGING = process.env.CANVAS_E2E_STAGING === "1";

-test.skip(!STAGING, "CANVAS_E2E_STAGING not set — skipping staging-only tests");
+// IMPORTANT — fail-closed, not skip-green.
+//
+// `test.skip(!STAGING)` is correct ONLY when the operator never asked for a
+// staging run (CANVAS_E2E_STAGING unset). In that case the workflow's
+// detect-changes / token-check gates have already decided not to exercise
+// staging, and skipping is the documented contract.
+//
+// But if STAGING *is* requested (CANVAS_E2E_STAGING=1) and global setup did
+// NOT hand off the tenant state, that is a HARD failure, not a skip — see
+// the explicit env-presence throw inside the test body. A silent skip there
+// would let a broken provision ship green, which is exactly the
+// weak-gate failure this hardening removes (§ No flakes / internal#828).
+test.skip(!STAGING, "CANVAS_E2E_STAGING not set — staging-only suite, not requested");
+
+/**
+ * Assert the panel for `tabId` rendered real content.
+ *
+ * Deterministic contract (no fixed waits — every step is condition-based
+ * with Playwright's built-in retry / expect.poll):
+ *   1. The tabpanel container is visible.
+ *   2. The global ErrorBoundary did NOT trip ("Something went wrong").
+ *   3. No visible error alert is shown in the panel.
+ *   4. For non-degraded tabs: the panel settles to non-empty,
+ *      non-spinner content (so an empty <div/> or a stuck "Loading…"
+ *      spinner FAILS instead of passing as it did before).
+ */
+async function assertPanelRendered(page: Page, tabId: string): Promise<void> {
+  const panel = page.locator(`#panel-${tabId}`);
+
+  // (1) Container visible. Built-in retry up to the expect timeout — no
+  // arbitrary waitForTimeout. Mechanism: replaces any reliance on a fixed
+  // settle delay with a real visibility condition.
+  await expect(panel, `panel for ${tabId} never became visible`).toBeVisible({
+    timeout: 10_000,
+  });
+
+  // (2) ErrorBoundary trip = hard crash anywhere in the React subtree.
+  // canvas/src/components/ErrorBoundary.tsx renders "Something went wrong".
+  // The OLD gate only looked for a "Failed to load" toast and would ship
+  // an ErrorBoundary-crashed panel GREEN. Mechanism: assert the crash
+  // surface is absent, retried via expect.poll so a late-mounting crash
+  // banner is still caught.
+  await expect
+    .poll(
+      async () =>
+        page.getByText("Something went wrong", { exact: false }).count(),
+      {
+        message: `tab ${tabId}: ErrorBoundary tripped (Something went wrong)`,
+        timeout: 5_000,
+      },
+    )
+    .toBe(0);
+
+  // (3) No visible error alert inside the panel. Tabs surface load errors
+  // as role="alert" with the real error text (EventsTab/ChannelsTab/
+  // ConfigTab/...). The OLD gate matched ONLY [role=alert]:has-text("Failed
+  // to load") — it missed (a) error messages that don't contain that exact
+  // phrase and (b) error divs that omit role="alert" entirely (e.g.
+  // ActivityTab). We replace it with a broader, but still SaaS-gap-aware,
+  // check: any *visible* alert OR red error banner inside the panel.
+  //
+  // Degraded tabs (#1369) are allowed an error state — for those we only
+  // require no app-level crash (covered by step 2). For every other tab a
+  // visible error alert is a real regression.
+  if (!KNOWN_DEGRADED_TABS.has(tabId)) {
+    const visibleAlerts = panel.locator('[role="alert"]:visible');
+    await expect
+      .poll(async () => visibleAlerts.count(), {
+        message:
+          `tab ${tabId}: a visible error alert is shown in the panel ` +
+          `(was a weak "Failed to load"-only check before)`,
+        timeout: 5_000,
+      })
+      .toBe(0);
+  }
+
+  // (4) Real content. The tabpanel CONTAINER always mounts, so the old
+  // toBeVisible() on the container passed even when the child rendered
+  // nothing. Assert the panel's trimmed innerText is non-empty AND not
+  // stuck on a loading spinner. expect.poll retries until the async
+  // fetch+render settles — replacing the implicit "the network finished
+  // by now" timing assumption with an explicit polled condition.
+  //
+  // Degraded tabs may legitimately be empty (Files in SaaS mode), so they
+  // are exempt from the non-empty requirement; step 2 still guards them
+  // against a hard crash.
+  if (!KNOWN_DEGRADED_TABS.has(tabId)) {
+    await expect
+      .poll(
+        async () => {
+          const text = ((await panel.innerText()) || "").trim();
+          // A panel still showing only a loading spinner has not settled.
+          const stillLoading = /^(loading\b|loading…|loading\.\.\.)/i.test(
+            text,
+          );
+          return text.length > 0 && !stillLoading;
+        },
+        {
+          message:
+            `tab ${tabId}: panel rendered empty or stuck on a loading ` +
+            `spinner — no real content settled (weak "container visible" ` +
+            `gate would have passed this)`,
+          // Generous: real tabs fetch from the tenant over the network.
+          // Polled, so it returns as soon as content appears.
+          timeout: 20_000,
+        },
+      )
+      .toBe(true);
+  }
+}

 test.describe("staging canvas tabs", () => {
-  test("each workspace-panel tab renders without error", async ({
+  test("each workspace-panel tab renders real content", async ({
    page,
    context,
  }) => {
@@ -50,9 +192,16 @@ test.describe("staging canvas tabs", () => {
    const tenantToken = process.env.STAGING_TENANT_TOKEN;
    const workspaceId = process.env.STAGING_WORKSPACE_ID;

+    // FAIL-CLOSED (not skip): STAGING was requested but global setup did
+    // not export tenant state. A silent skip here would paint a broken
+    // provision GREEN. This is the loud-fail the hardening mandates.
    if (!tenantURL || !tenantToken || !workspaceId) {
      throw new Error(
-        "staging-setup.ts did not export STAGING_TENANT_URL / STAGING_TENANT_TOKEN / STAGING_WORKSPACE_ID — did global setup run?",
+        "staging-setup.ts did not export STAGING_TENANT_URL / " +
+          "STAGING_TENANT_TOKEN / STAGING_WORKSPACE_ID. CANVAS_E2E_STAGING=1 " +
+          "was set (staging WAS requested) but global setup produced no " +
+          "tenant — this is a provisioning failure, NOT a reason to skip. " +
+          "Check the [staging-setup] log above for the real error.",
      );
    }

@@ -152,11 +301,19 @@ test.describe("staging canvas tabs", () => {
    // omit the URL, so we'd otherwise be flying blind. Logged to the
    // test's stdout (visible in the workflow log under the failed step).
    page.on("requestfailed", (req) => {
-      console.log(`[e2e/requestfailed] ${req.method()} ${req.url()}: ${req.failure()?.errorText ?? "?"}`);
+      console.log(
+        `[e2e/requestfailed] ${req.method()} ${req.url()}: ${
+          req.failure()?.errorText ?? "?"
+        }`,
+      );
    });
    page.on("response", (res) => {
      if (res.status() >= 400) {
-        console.log(`[e2e/response-${res.status()}] ${res.request().method()} ${res.url()}`);
+        console.log(
+          `[e2e/response-${res.status()}] ${res
+            .request()
+            .method()} ${res.url()}`,
+        );
      }
    });

@@ -173,9 +330,8 @@ test.describe("staging canvas tabs", () => {
    // hydrated, even with zero workspaces) or the hydration-error
    // banner — whichever wins first. Previous version of this wait
    // used `[role="tablist"]`, but that selector only appears AFTER
-    // a workspace node is clicked (which happens below at L100), so
-    // the wait would always time out at 45s before any meaningful
-    // failure surfaced.
+    // a workspace node is clicked, so the wait would always time out
+    // at 45s before any meaningful failure surfaced.
    await page.waitForSelector(
      '[aria-label="Molecule AI workspace canvas"], [data-testid="hydration-error"]',
      { timeout: 45_000 },
@@ -189,10 +345,20 @@ test.describe("staging canvas tabs", () => {
      "canvas hydration failed — check staging CP + tenant reachability",
    ).toBe(0);

+    // The global ErrorBoundary must not have tripped at the app root
+    // either — a crash before the side panel even opens would otherwise
+    // be invisible until a tab assertion happened to notice it.
+    await expect(
+      page.getByText("Something went wrong", { exact: false }),
+      "app-level ErrorBoundary tripped during hydration",
+    ).toHaveCount(0);
+
    // Click the workspace node to open the side panel. Try a data
    // attribute first, fall back to a generic role-based selector so
    // the test doesn't break when the node-card markup changes.
-    const byDataAttr = page.locator(`[data-workspace-id="${workspaceId}"]`).first();
+    const byDataAttr = page
+      .locator(`[data-workspace-id="${workspaceId}"]`)
+      .first();
    if ((await byDataAttr.count()) > 0) {
      await byDataAttr.click({ timeout: 10_000 });
    } else {
@@ -202,19 +368,56 @@ test.describe("staging canvas tabs", () => {
      await firstNode.click({ timeout: 10_000 });
    }

-    await page.waitForSelector('[role="tablist"]', { timeout: 15_000 });
+    // The tablist appears once the side panel mounts. Condition-based
+    // wait — no fixed delay.
+    const tablist = page.locator('[role="tablist"]');
+    await expect(
+      tablist,
+      "side panel tablist never appeared after clicking the workspace node",
+    ).toBeVisible({ timeout: 15_000 });
+
+    // Tab-list parity guard. The hand-maintained TAB_IDS constant used to
+    // be able to drift silently out of sync with SidePanel.tsx TABS — a
+    // tab could be added to the UI and never get an assertion, shipping
+    // broken-but-untested. Read the actual tab ids from the DOM and assert
+    // every live tab is either driven by this spec (TAB_IDS) or explicitly
+    // excluded (EXPECTED_EXTRA_TABS). A genuinely-new tab fails LOUD.
+    const liveTabIds = (
+      await tablist.locator('[role="tab"][id^="tab-"]').evaluateAll((els) =>
+        els.map((el) => el.id.replace(/^tab-/, "")),
+      )
+    ).sort();
+    const accountedFor = new Set<string>([
+      ...TAB_IDS,
+      ...EXPECTED_EXTRA_TABS,
+    ]);
+    const unaccounted = liveTabIds.filter((id) => !accountedFor.has(id));
+    expect(
+      unaccounted,
+      `SidePanel exposes tab(s) this spec neither drives nor excludes: ` +
+        `${unaccounted.join(", ")}. Add them to TAB_IDS (and assert their ` +
+        `content) or to EXPECTED_EXTRA_TABS with a reason.`,
+    ).toHaveLength(0);
+    // And the inverse: every TAB_ID we intend to drive must actually exist
+    // in the DOM, so a renamed/removed tab fails here instead of timing out
+    // on a missing #tab-<id> selector with an opaque message.
+    const missing = TAB_IDS.filter((id) => !liveTabIds.includes(id));
+    expect(
+      missing,
+      `TAB_IDS references tab(s) not present in SidePanel: ${missing.join(
+        ", ",
+      )} — the spec's tab list has drifted from SidePanel.tsx TABS.`,
+    ).toHaveLength(0);

    for (const tabId of TAB_IDS) {
      await test.step(`tab: ${tabId}`, async () => {
        const tabButton = page.locator(`#tab-${tabId}`);
-        // The TABS bar is `overflow-x-auto` (SidePanel.tsx:~tabs
-        // wrapper) — tabs after position ~3 are clipped behind the
-        // right-edge fade gradient on smaller viewports. Playwright's
-        // `toBeVisible()` returns false for clipped elements, so a
-        // bare visibility check fails on `skills` and later tabs in
-        // CI. scrollIntoViewIfNeeded brings the button into view
-        // before the visibility check, mirroring what SidePanel's own
-        // keyboard handler does on arrow-key navigation.
+        // The TABS bar is `overflow-x-auto` — tabs past position ~3 are
+        // clipped behind the right-edge fade gradient on smaller
+        // viewports. Playwright's toBeVisible() returns false for clipped
+        // elements, so a bare visibility check fails on later tabs in CI.
+        // scrollIntoViewIfNeeded brings the button into view before the
+        // visibility check.
        await tabButton.scrollIntoViewIfNeeded({ timeout: 5_000 });
        await expect(
          tabButton,
@@ -222,18 +425,34 @@ test.describe("staging canvas tabs", () => {
        ).toBeVisible({ timeout: 5_000 });
        await tabButton.click();

-        const panel = page.locator(`#panel-${tabId}`);
-        await expect(panel, `panel for ${tabId} never rendered`).toBeVisible({
-          timeout: 10_000,
-        });
+        // Confirm the click actually activated this tab before asserting
+        // its content — aria-selected flips on the active tab. This closes
+        // a race where a slow click handler left the PREVIOUS tab's panel
+        // mounted and we asserted the wrong panel's content. Built-in
+        // retry, condition-based, no fixed wait.
+        await expect(
+          tabButton,
+          `tab-${tabId} did not become the selected tab after click`,
+        ).toHaveAttribute("aria-selected", "true", { timeout: 5_000 });

-        // "Failed to load" toast = hard crash. Known SaaS-mode gaps
-        // (Files empty, Terminal disconnected, Peers 401) surface as
-        // in-panel content, not toasts.
+        // Real-content assertion (the core hardening). See
+        // assertPanelRendered: container visible + no ErrorBoundary + no
+        // visible error alert + settled non-empty content for non-degraded
+        // tabs. Replaces the old "panel visible + no Failed-to-load toast"
+        // pair, which shipped empty/errored panels green.
+        await assertPanelRendered(page, tabId);
+
+        // Belt to the braces: the original toast check stays. A global
+        // "Failed to load" toast (role=alert outside the panel) is still a
+        // crash signal worth catching even though the in-panel checks above
+        // now do the heavy lifting.
        const errorToasts = await page
          .locator('[role="alert"]:has-text("Failed to load")')
          .count();
-        expect(errorToasts, `tab ${tabId}: "Failed to load" toast`).toBe(0);
+        expect(
+          errorToasts,
+          `tab ${tabId}: a global "Failed to load" toast is showing`,
+        ).toBe(0);

        await page.screenshot({
          path: `test-results/staging-tab-${tabId}.png`,
@@ -267,3 +486,56 @@ test.describe("staging canvas tabs", () => {
    ).toHaveLength(0);
  });
 });
+
+/*
+ * PROMOTION-READINESS — staging canvas E2E → HARD merge-gate
+ * ----------------------------------------------------------
+ * NOW RELIABLE (deterministic; these no longer flap on timing):
+ *   - Every wait is condition-based (toBeVisible / toHaveAttribute /
+ *     expect.poll). There is NO fixed waitForTimeout / sleep in the spec;
+ *     the only setTimeout is the bounded poll-interval inside
+ *     staging-setup.ts waitFor(), which has a hard deadline.
+ *   - Tabs are asserted on REAL settled content (non-empty, non-spinner),
+ *     not just "container is visible" — an empty or stuck-loading panel now
+ *     fails instead of shipping green.
+ *   - The ErrorBoundary ("Something went wrong") is asserted absent at app
+ *     hydration AND per tab — a React subtree crash can no longer pass.
+ *   - Visible error alerts inside a panel fail non-degraded tabs (was a
+ *     weak [role=alert]:has-text("Failed to load")-only check that missed
+ *     both other error phrasings and role-less error divs).
+ *   - The driven tab list is parity-checked against the live DOM, so a new
+ *     SidePanel tab can't ship un-tested and a removed one fails loud.
+ *   - Click→activation is confirmed (aria-selected) before asserting the
+ *     panel, removing a wrong-panel race.
+ *   - The suite is fail-closed: CANVAS_E2E_STAGING=1 with no tenant state
+ *     hard-errors (never skips→green); CANVAS_E2E_STAGING unset cleanly
+ *     skips (operator did not request staging).
+ *
+ * STILL BLOCKS PROMOTION-TO-REQUIRED (do NOT flip continue-on-error here —
+ * CTO-owned, RFC internal#219 §1):
+ *   - INFRA DEPENDENCY: each run provisions a real staging EC2 tenant
+ *     (12-20 min cold boot). Required-gate latency + AWS/Cloudflare/CP
+ *     availability become merge-blockers. A staging outage would freeze
+ *     main even though the code is fine — unacceptable for a required check
+ *     until staging has an SLA or this runs against a warm pre-provisioned
+ *     pool.
+ *   - SHARED-RESOURCE FLAKE SURFACE: TLS/DNS/ACME propagation on a shared
+ *     staging zone (staging-setup TLS_TIMEOUT_MS) is outside this repo's
+ *     control. Deterministic here ≠ deterministic upstream.
+ *   - SECRET DEPENDENCY: CP_STAGING_ADMIN_API_TOKEN must be present on the
+ *     runner. The workflow's skip-if-absent (core#2225) keeps a missing
+ *     secret from painting red — correct for non-gating, but a REQUIRED
+ *     check must instead guarantee the secret is always present, else it
+ *     skip-greens the very thing it is supposed to enforce.
+ *   - SINGLE-WORKSPACE COVERAGE: one hermes/platform_managed workspace that
+ *     does NOT boot an agent on staging (no CP LLM proxy env, workspace-
+ *     server #2162). Tabs render, but agent-dependent content paths (live
+ *     chat round-trip, traces from a real run) are not exercised.
+ *
+ * PROMOTION CHECKLIST (when CTO signs off on making this required):
+ *   1. Warm pre-provisioned tenant pool OR a staging SLA bounding boot time.
+ *   2. Guarantee CP_STAGING_ADMIN_API_TOKEN on the gating runner; turn the
+ *      skip-if-absent into a hard error for the required path.
+ *   3. Decide whether agent-dependent tabs need a wired LLM proxy on the
+ *      staging tenant (covers chat/traces real content) before gating them.
+ */
@@ -7,6 +7,14 @@ export default defineConfig({
  fullyParallel: false,
  workers: 1,
  retries: 0,
+  // Fail CLOSED when an explicit spec selection matches zero tests.
+  // Playwright defaults this to true, so `playwright test e2e/chat-*.spec.ts`
+  // would exit 0 (green) if those files were renamed/moved/deleted — a
+  // false-green that would silently gut the e2e-chat gate after a refactor.
+  // forbidOnly likewise stops a stray `test.only` from green-ing the suite
+  // while skipping every other case.
+  passWithNoTests: false,
+  forbidOnly: !!process.env.CI,
  use: {
    baseURL: process.env.PLAYWRIGHT_BASE_URL || "http://localhost:3000",
    headless: true,
@@ -1,12 +1,17 @@
 /**
 * Canvas /api/buildinfo — version-display endpoint mirroring
 * workspace-server's /buildinfo. Lets `curl <url>/api/buildinfo`
- * confirm which git SHA is live on a canvas deployment.
+ * confirm which git SHA is live on a canvas deployment (core#2235).
 */
 import { describe, it, expect, beforeEach, afterEach } from "vitest";
 import { GET } from "../route";

-const ENV_KEYS = ["VERCEL_GIT_COMMIT_SHA", "VERCEL_GIT_COMMIT_REF", "VERCEL_ENV"];
+const ENV_KEYS = [
+  "BUILD_SHA",
+  "VERCEL_GIT_COMMIT_SHA",
+  "VERCEL_GIT_COMMIT_REF",
+  "VERCEL_ENV",
+];

 describe("GET /api/buildinfo", () => {
  let saved: Record<string, string | undefined>;
@@ -23,13 +28,24 @@ describe("GET /api/buildinfo", () => {
    }
  });

-  it("returns dev sentinel when Vercel env vars are unset", async () => {
+  it("returns dev sentinel when no SHA source is set", async () => {
    const res = await GET();
    const body = await res.json();
    expect(body).toEqual({ git_sha: "dev", git_ref: "", vercel_env: "local" });
  });

-  it("reports the SHA Vercel injected at build time", async () => {
+  it("reports BUILD_SHA baked into the Docker image (fleet deploy path)", async () => {
+    // BUILD_SHA is the authoritative source for the ECR-image fleet deploy,
+    // which never runs on Vercel. It must win even when a Vercel var is also
+    // present in the environment.
+    process.env.BUILD_SHA = "deadbeefcafe";
+    process.env.VERCEL_GIT_COMMIT_SHA = "should-not-win";
+    const res = await GET();
+    const body = await res.json();
+    expect(body.git_sha).toBe("deadbeefcafe");
+  });
+
+  it("falls back to the SHA Vercel injected when BUILD_SHA is unset", async () => {
    process.env.VERCEL_GIT_COMMIT_SHA = "abc1234567890";
    process.env.VERCEL_GIT_COMMIT_REF = "main";
    process.env.VERCEL_ENV = "production";
@@ -1,17 +1,36 @@
 import { NextResponse } from "next/server";

 // Mirror of workspace-server's GET /buildinfo (PR #2398). Lets a developer
-// confirm which git SHA is live on a canvas deployment with the same
-// `curl <url>/buildinfo` flow they use against tenant workspaces.
+// or the fleet redeploy workflow confirm which git SHA is live on a canvas
+// deployment with the same `curl <url>/api/buildinfo` flow used against
+// tenant workspaces (core#2235; cross-ref core#2226).
 //
-// Vercel injects VERCEL_GIT_COMMIT_SHA / _REF / VERCEL_ENV at build time
-// from the deploying commit; outside Vercel (local `next dev`, harness)
-// these are unset and the endpoint reports `git_sha: "dev"`. Same sentinel
-// the workspace-server uses pre-ldflags-injection so both surfaces speak
-// the same vocabulary.
+// SHA source, in priority order:
+//   1. BUILD_SHA — server-only env baked into the canvas Docker image at
+//      build time (Dockerfile `ARG BUILD_SHA` → `ENV BUILD_SHA`, wired
+//      from `${{ github.sha }}` in publish-canvas-image.yml). This is the
+//      authoritative source for the fleet's ECR-image deploy path, which
+//      does NOT run on Vercel. Read server-side here (App Router route
+//      handler runs on the standalone Node server, `output: "standalone"`),
+//      so it is intentionally NOT a NEXT_PUBLIC_ var — keeping it out of
+//      the client bundle.
+//   2. VERCEL_GIT_COMMIT_SHA — Vercel injects this at build time when the
+//      canvas is deployed via Vercel rather than the Docker image.
+//   3. "dev" — local `next dev` / test harness, where neither is set. Same
+//      sentinel workspace-server uses pre-ldflags-injection, so both
+//      surfaces speak the same vocabulary and an unconfigured deploy
+//      fails the SHA comparison closed instead of round-tripping "".
+//
+// force-dynamic so the response is evaluated at request time against the
+// runtime env of the standalone server (where ENV BUILD_SHA lives), not
+// frozen into a static asset at `next build`.
+export const dynamic = "force-dynamic";
+
 export async function GET() {
+  const sha =
+    process.env.BUILD_SHA ?? process.env.VERCEL_GIT_COMMIT_SHA ?? "dev";
  return NextResponse.json({
-    git_sha: process.env.VERCEL_GIT_COMMIT_SHA ?? "dev",
+    git_sha: sha,
    git_ref: process.env.VERCEL_GIT_COMMIT_REF ?? "",
    vercel_env: process.env.VERCEL_ENV ?? "local",
  });
@@ -8,9 +8,13 @@ import { ExternalConnectModal, type ExternalConnectionInfo } from "./ExternalCon
 import {
  ProviderModelSelector,
  buildProviderCatalog,
+  buildProviderCatalogFromRegistry,
  findProviderForModel,
+  isPlatformManagedProvider,
  type SelectorModel,
  type SelectorValue,
+  type RegistryProvider,
+  type RegistryModel,
 } from "./ProviderModelSelector";

 interface WorkspaceOption {
@@ -32,6 +36,16 @@ interface TemplateSpec {
  model?: string;
  models?: SelectorModel[];
  providers?: string[];
+  // internal#718 P3 registry-served fields (additive; absent on older
+  // backends and for non-registry runtimes). When registry_backed is true the
+  // provider→model catalog is built from registry_providers/registry_models so
+  // each model's DERIVED provider (e.g. moonshot/kimi-k2.6 → "platform") drives
+  // the dropdown bucket and the create payload's llm_provider — instead of the
+  // legacy inferVendor heuristic that slash-splits the id into "moonshot".
+  // Mirrors ConfigTab's RuntimeOption loader (RFC#340 Fix C).
+  registry_backed?: boolean;
+  registry_providers?: RegistryProvider[];
+  registry_models?: RegistryModel[];
 }

 const DEFAULT_RUNTIME = "claude-code";
@@ -168,15 +182,53 @@ export function CreateWorkspaceButton() {
    }),
    [runtime, templateSpecs],
  );
-  const llmModels = useMemo(
-    () => {
-      const sourceSpec = selectedTemplateSpec ?? selectedRuntimeTemplateSpec;
-      if (!sourceSpec?.models?.length) return [];
-      return sourceSpec.models;
-    },
+  // The /templates row backing the LLM picker: an explicitly-selected
+  // workspace template wins, else the base runtime template row.
+  const llmSourceSpec = useMemo<TemplateSpec | null>(
+    () => selectedTemplateSpec ?? selectedRuntimeTemplateSpec,
    [selectedRuntimeTemplateSpec, selectedTemplateSpec],
  );
-  const llmCatalog = useMemo(() => buildProviderCatalog(llmModels), [llmModels]);
+  // internal#718 P3 / RFC#340 Fix C: a runtime is registry-backed when the
+  // /templates row says so AND it served a non-empty registry_models set.
+  // Mirrors ConfigTab's `registryBacked` derivation exactly.
+  const registryBacked = useMemo(
+    () =>
+      llmSourceSpec?.registry_backed === true &&
+      (llmSourceSpec.registry_models?.length ?? 0) > 0,
+    [llmSourceSpec],
+  );
+  // Models fed to the selector dropdown. For a registry-backed runtime use the
+  // registry-served native set, carrying each model's DERIVED provider so the
+  // selector buckets it correctly (moonshot/kimi-k2.6 → "platform", not the
+  // inferVendor "moonshot"). Otherwise fall back to the template-served
+  // models[] + the legacy heuristic — same fallback ConfigTab keeps.
+  const llmModels = useMemo<SelectorModel[]>(
+    () => {
+      if (registryBacked) {
+        return (llmSourceSpec?.registry_models ?? []).map((m) => ({
+          id: m.id,
+          name: m.name,
+          ...(m.provider ? { provider: m.provider } : {}),
+        }));
+      }
+      return llmSourceSpec?.models?.length ? llmSourceSpec.models : [];
+    },
+    [registryBacked, llmSourceSpec],
+  );
+  // Registry-backed path: build the catalog from registry_providers/
+  // registry_models so dropdown labels + billing + the derived provider come
+  // from the provider-registry SSOT (restores the "Platform" bucket). Legacy
+  // path: re-infer from models[] via buildProviderCatalog (inferVendor).
+  const llmCatalog = useMemo(
+    () =>
+      registryBacked
+        ? buildProviderCatalogFromRegistry(
+            llmSourceSpec?.registry_providers ?? [],
+            llmSourceSpec?.registry_models ?? [],
+          )
+        : buildProviderCatalog(llmModels),
+    [registryBacked, llmSourceSpec, llmModels],
+  );
  const selectedLLMProvider = useMemo(
    () => llmCatalog.find((p) => p.id === llmSelection.providerId) ?? llmCatalog[0],
    [llmCatalog, llmSelection.providerId],
@@ -184,7 +236,7 @@ export function CreateWorkspaceButton() {

  useEffect(() => {
    if (llmCatalog.length === 0) return;
-    const sourceDefault = (selectedTemplateSpec ?? selectedRuntimeTemplateSpec)?.model?.trim();
+    const sourceDefault = llmSourceSpec?.model?.trim();
    const platformProvider = llmCatalog.find((p) => p.vendor === "platform");
    const matched = sourceDefault ? findProviderForModel(llmCatalog, sourceDefault) : null;
    const next = platformProvider ?? matched ?? llmCatalog[0];
@@ -197,7 +249,7 @@ export function CreateWorkspaceButton() {
      envVars: next.envVars,
    });
    setLLMSecret("");
-  }, [llmCatalog, selectedRuntimeTemplateSpec, selectedTemplateSpec]);
+  }, [llmCatalog, llmSourceSpec]);

  // Reset form and load workspaces whenever dialog opens
  useEffect(() => {
@@ -239,7 +291,15 @@ export function CreateWorkspaceButton() {
      setError("Model is required");
      return;
    }
-    if (!isExternal && selectedLLMProvider?.envVars.length && !llmSecret.trim()) {
+    // Platform-managed providers need NO user credential — the platform injects
+    // its own usage token (MOLECULE_LLM_USAGE_TOKEN = tenant admin_token) at
+    // provision time. Only BYOK providers require a user-supplied key. (#2245)
+    if (
+      !isExternal &&
+      !isPlatformManagedProvider(selectedLLMProvider) &&
+      selectedLLMProvider?.envVars.length &&
+      !llmSecret.trim()
+    ) {
      setError("Provider credential is required");
      return;
    }
@@ -274,7 +334,11 @@ export function CreateWorkspaceButton() {
          ? {
              model: llmSelection.model.trim(),
              llm_provider: nativeProvider.vendor,
-              ...(nativeProvider.envVars.length > 0
+              // Only BYOK providers carry a user secret. For platform-managed
+              // the token is provisioner-injected; sending an (empty) secret
+              // here would clobber it — so omit it entirely. (#2245)
+              ...(nativeProvider.envVars.length > 0 &&
+              !isPlatformManagedProvider(nativeProvider)
                ? { secrets: { [nativeProvider.envVars[0]]: llmSecret.trim() } }
                : {}),
            }
@@ -461,6 +525,7 @@ export function CreateWorkspaceButton() {
                </div>
                <ProviderModelSelector
                  models={llmModels}
+                  catalog={registryBacked ? llmCatalog : undefined}
                  value={llmSelection}
                  onChange={(next) => {
                    setLLMSelection(next);
@@ -469,20 +534,26 @@ export function CreateWorkspaceButton() {
                  idPrefix="create-workspace-llm"
                  variant="stack"
                />
-                {selectedLLMProvider.envVars.length > 0 && (
-                  <div>
-                    <label htmlFor="llm-secret-input" className="text-[11px] text-ink-mid block mb-1">
-                      {selectedLLMProvider.envVars[0]}
-                    </label>
-                    <input
-                      id="llm-secret-input"
-                      type="password"
-                      value={llmSecret}
-                      onChange={(e) => setLLMSecret(e.target.value)}
-                      autoComplete="off"
-                      className="w-full bg-surface-card/60 border border-line/50 rounded-lg px-3 py-2 text-sm text-ink placeholder-ink-soft focus:outline-none focus:border-accent/60 focus:ring-1 focus:ring-accent/20 transition-colors font-mono"
-                    />
+                {isPlatformManagedProvider(selectedLLMProvider) ? (
+                  <div className="text-[11px] text-ink-soft">
+                    Platform-managed — no API key required.
                  </div>
+                ) : (
+                  selectedLLMProvider.envVars.length > 0 && (
+                    <div>
+                      <label htmlFor="llm-secret-input" className="text-[11px] text-ink-mid block mb-1">
+                        {selectedLLMProvider.envVars[0]}
+                      </label>
+                      <input
+                        id="llm-secret-input"
+                        type="password"
+                        value={llmSecret}
+                        onChange={(e) => setLLMSecret(e.target.value)}
+                        autoComplete="off"
+                        className="w-full bg-surface-card/60 border border-line/50 rounded-lg px-3 py-2 text-sm text-ink placeholder-ink-soft focus:outline-none focus:border-accent/60 focus:ring-1 focus:ring-accent/20 transition-colors font-mono"
+                      />
+                    </div>
+                  )
                )}
              </div>
            )}
@@ -55,6 +55,21 @@ export interface ProviderEntry {
  billingMode?: "platform_managed" | "byok";
 }

+/** A provider is "platform-managed" when the Molecule platform proxies the LLM
+ *  call and injects its own usage credential — the tenant admin_token, surfaced
+ *  to the workspace as MOLECULE_LLM_USAGE_TOKEN by the CP provisioner
+ *  (controlplane ec2.go: `MOLECULE_LLM_USAGE_TOKEN="$ADMIN_TOKEN"`). The user
+ *  supplies NO key for these: the credential is internal plumbing, not a user
+ *  input. Detected by vendor==="platform" (the platform proxy provider, which
+ *  declares MOLECULE_LLM_USAGE_TOKEN in its AuthEnv) OR
+ *  billingMode==="platform_managed" (registry-backed, internal#718 P3). BYOK
+ *  providers return false and DO require a user-supplied credential. */
+export function isPlatformManagedProvider(
+  p?: Pick<ProviderEntry, "vendor" | "billingMode"> | null,
+): boolean {
+  return p?.vendor === "platform" || p?.billingMode === "platform_managed";
+}
+
 /** RegistryProvider mirrors one entry of GET /templates `registry_providers`
 *  (workspace-server registryProviderView): the registry's native provider for
 *  a runtime, with its display label, auth-env NAMES, and billing mode. This is
@@ -2,6 +2,7 @@
 import { describe, it, expect, vi, beforeEach, afterEach } from "vitest";
 import { render, screen, fireEvent, waitFor, cleanup } from "@testing-library/react";
 import { CreateWorkspaceButton } from "../CreateWorkspaceDialog";
+import { isPlatformManagedProvider } from "../ProviderModelSelector";

 vi.mock("@/lib/api", () => ({
  api: {
@@ -65,6 +66,34 @@ const SAMPLE_TEMPLATES = [
      { id: "moonshot/kimi-k2.6", name: "Kimi K2.6", provider: "platform", required_env: [] },
    ],
  },
+  // #2245 fixtures. The real registry `platform` provider declares
+  // MOLECULE_LLM_USAGE_TOKEN in its auth_env — the default mock above masks the
+  // bug by using required_env:[]. This template gives the platform provider a
+  // non-empty auth env (matching production) so the credential-suppression
+  // logic is actually exercised.
+  {
+    id: "platform-managed-test",
+    name: "Platform Managed Test",
+    runtime: "claude-code",
+    model: "moonshot/kimi-k2.6",
+    providers: ["platform", "minimax"],
+    models: [
+      { id: "moonshot/kimi-k2.6", name: "Kimi K2.6", provider: "platform", required_env: ["MOLECULE_LLM_USAGE_TOKEN"] },
+      { id: "MiniMax-M2.7", name: "MiniMax M2.7", required_env: ["MINIMAX_API_KEY"] },
+    ],
+  },
+  // BYOK-only template (no platform provider) — the credential requirement
+  // MUST still hold for these (no-regression guard).
+  {
+    id: "byok-only-test",
+    name: "BYOK Only Test",
+    runtime: "claude-code",
+    model: "openai/gpt-4o",
+    providers: ["openai"],
+    models: [
+      { id: "openai/gpt-4o", name: "GPT-4o", required_env: ["OPENAI_API_KEY"] },
+    ],
+  },
 ];

 beforeEach(() => {
@@ -454,6 +483,182 @@ describe("CreateWorkspaceDialog — dynamic runtime provider picker", () => {
  });
 });

+// ---------------------------------------------------------------------------
+// Registry-backed provider catalog (RFC#340 Fix C)
+//
+// Regression guard for the mis-bucketing bug: when a registry-backed
+// claude-code template serves `moonshot/kimi-k2.6` whose DERIVED provider is
+// `platform`, the dialog must build the dropdown from registry_providers/
+// registry_models (buildProviderCatalogFromRegistry) — NOT the legacy
+// inferVendor heuristic which slash-splits the id into "moonshot". The
+// distinguishing trait of this fixture: the plain `models[]` array does NOT
+// carry an explicit `provider` field, so the LEGACY path would bucket the
+// model under "moonshot" and send llm_provider:"moonshot". Only the
+// registry-backed path yields the Platform bucket + llm_provider:"platform".
+// ---------------------------------------------------------------------------
+
+// claude-code template whose plain models[] is UN-annotated (no explicit
+// provider). The derived-provider annotation lives ONLY in registry_models.
+const REGISTRY_TEMPLATE = {
+  id: "claude-code-default",
+  name: "Claude Code Agent",
+  runtime: "claude-code",
+  model: "moonshot/kimi-k2.6",
+  // Legacy fields — note: NO explicit provider on the platform model, so the
+  // legacy inferVendor path would slash-split it into "moonshot".
+  providers: ["platform", "minimax", "anthropic"],
+  models: [
+    { id: "moonshot/kimi-k2.6", name: "Kimi K2.6", required_env: [] },
+    { id: "MiniMax-M2.7", name: "MiniMax M2.7", required_env: ["MINIMAX_API_KEY"] },
+    { id: "claude-sonnet-4-6", name: "Claude Sonnet 4.6", required_env: ["ANTHROPIC_API_KEY"] },
+  ],
+  // Registry-served SSOT (internal#718 P3). DeriveProvider resolved
+  // moonshot/kimi-k2.6 → "platform"; MiniMax-M2.7 → "minimax".
+  registry_backed: true,
+  registry_providers: [
+    { name: "platform", display_name: "Platform", auth_env: [], billing_mode: "platform_managed" },
+    { name: "minimax", display_name: "MiniMax", auth_env: ["MINIMAX_API_KEY"], billing_mode: "byok" },
+    { name: "anthropic", display_name: "Anthropic API", auth_env: ["ANTHROPIC_API_KEY"], billing_mode: "byok" },
+  ],
+  registry_models: [
+    { id: "moonshot/kimi-k2.6", name: "Kimi K2.6", provider: "platform", billing_mode: "platform_managed" },
+    { id: "MiniMax-M2.7", name: "MiniMax M2.7", provider: "minimax", billing_mode: "byok" },
+    { id: "claude-sonnet-4-6", name: "Claude Sonnet 4.6", provider: "anthropic", billing_mode: "byok" },
+  ],
+};
+
+// Registry-backed platform provider WITH a non-empty auth_env — this matches
+// the PRODUCTION provider view, which ships the raw AuthEnv
+// ([MOLECULE_LLM_USAGE_TOKEN]). REGISTRY_TEMPLATE above uses auth_env:[] so it
+// never exercises suppression; this one drives the billingMode==="platform_
+// managed" branch end-to-end through buildProviderCatalogFromRegistry. (#2245)
+const REGISTRY_TEMPLATE_PLATFORM_AUTHENV = {
+  ...REGISTRY_TEMPLATE,
+  registry_providers: [
+    {
+      name: "platform",
+      display_name: "Platform",
+      auth_env: ["MOLECULE_LLM_USAGE_TOKEN"],
+      billing_mode: "platform_managed",
+    },
+    { name: "minimax", display_name: "MiniMax", auth_env: ["MINIMAX_API_KEY"], billing_mode: "byok" },
+    { name: "anthropic", display_name: "Anthropic API", auth_env: ["ANTHROPIC_API_KEY"], billing_mode: "byok" },
+  ],
+};
+
+describe("CreateWorkspaceDialog — registry-backed provider catalog (RFC#340 Fix C)", () => {
+  beforeEach(() => {
+    mockGet.mockImplementation(async (url: string) => {
+      if (url === "/templates") {
+        // eslint-disable-next-line @typescript-eslint/no-explicit-any
+        return [REGISTRY_TEMPLATE] as any;
+      }
+      // eslint-disable-next-line @typescript-eslint/no-explicit-any
+      return SAMPLE_WORKSPACES as any;
+    });
+  });
+
+  it("shows the Platform provider bucket for the registry-backed claude-code runtime", async () => {
+    await openDialog();
+    const providerSelect = await waitFor(() => {
+      const sel = document.querySelector("[data-testid='provider-select']") as HTMLSelectElement;
+      expect(sel).toBeTruthy();
+      return sel;
+    });
+    const labels = Array.from(providerSelect.options).map((o) => o.text.trim());
+    // Registry display_name "Platform" appears — NOT "moonshot" from the
+    // legacy slash-split heuristic.
+    expect(labels).toContain("Platform");
+    expect(labels).not.toContain("moonshot");
+    // Bucket id is the registry-keyed id, vendor is the bare provider name.
+    const values = Array.from(providerSelect.options).map((o) => o.value);
+    expect(values).toContain("registry|platform");
+  });
+
+  it("sends llm_provider: platform (not moonshot) for moonshot/kimi-k2.6", async () => {
+    await openDialog();
+    fireEvent.change(screen.getByPlaceholderText("e.g. SEO Agent"), {
+      target: { value: "Kimi Agent" },
+    });
+    // Wait for the registry default to settle on the Platform bucket + model.
+    await waitFor(() => {
+      const modelSelect = document.querySelector("[data-testid='model-select']") as HTMLSelectElement;
+      expect(modelSelect?.value).toBe("moonshot/kimi-k2.6");
+    });
+
+    const createBtn = screen.getAllByRole("button").find((b) => b.textContent === "Create");
+    fireEvent.click(createBtn!);
+
+    await waitFor(() => expect(mockPost).toHaveBeenCalled());
+    const body = mockPost.mock.calls[0][1] as Record<string, unknown>;
+    expect(body.model).toBe("moonshot/kimi-k2.6");
+    expect(body.llm_provider).toBe("platform");
+    // Platform is auth-env-free → no BYOK secret.
+    expect(body.secrets).toBeUndefined();
+  });
+
+  it("buckets MiniMax-M2.7 under its derived provider and sends llm_provider: minimax", async () => {
+    await openDialog();
+    fireEvent.change(screen.getByPlaceholderText("e.g. SEO Agent"), {
+      target: { value: "MiniMax Agent" },
+    });
+    await waitFor(() => {
+      const sel = document.querySelector("[data-testid='provider-select']") as HTMLSelectElement;
+      expect(Array.from(sel.options).map((o) => o.value)).toContain("registry|minimax");
+    });
+    fireEvent.change(document.querySelector("[data-testid='provider-select']") as HTMLSelectElement, {
+      target: { value: "registry|minimax" },
+    });
+    fireEvent.change(document.getElementById("llm-secret-input") as HTMLInputElement, {
+      target: { value: "sk-minimax-test" },
+    });
+
+    const createBtn = screen.getAllByRole("button").find((b) => b.textContent === "Create");
+    fireEvent.click(createBtn!);
+
+    await waitFor(() => expect(mockPost).toHaveBeenCalled());
+    const body = mockPost.mock.calls[0][1] as Record<string, unknown>;
+    expect(body.model).toBe("MiniMax-M2.7");
+    expect(body.llm_provider).toBe("minimax");
+    expect(body.secrets).toEqual({ MINIMAX_API_KEY: "sk-minimax-test" });
+  });
+
+  it("suppresses the credential for a registry-backed platform provider that declares an auth_env — billingMode path (#2245)", async () => {
+    // Override the default REGISTRY_TEMPLATE (auth_env:[]) with the production-
+    // shaped one whose platform provider declares MOLECULE_LLM_USAGE_TOKEN.
+    mockGet.mockImplementation(async (url: string) => {
+      if (url === "/templates") {
+        // eslint-disable-next-line @typescript-eslint/no-explicit-any
+        return [REGISTRY_TEMPLATE_PLATFORM_AUTHENV] as any;
+      }
+      // eslint-disable-next-line @typescript-eslint/no-explicit-any
+      return SAMPLE_WORKSPACES as any;
+    });
+    await openDialog();
+    fireEvent.change(screen.getByPlaceholderText("e.g. SEO Agent"), {
+      target: { value: "Registry Platform Agent" },
+    });
+    // Platform is the default bucket; even with a non-empty auth_env the key
+    // field must NOT render (suppressed via billingMode==="platform_managed").
+    await waitFor(() => {
+      const sel = document.querySelector("[data-testid='provider-select']") as HTMLSelectElement;
+      expect(sel?.value).toBe("registry|platform");
+    });
+    expect(screen.getByText("Platform-managed — no API key required.")).toBeTruthy();
+    expect(document.getElementById("llm-secret-input")).toBeNull();
+
+    const createBtn = screen.getAllByRole("button").find((b) => b.textContent === "Create");
+    fireEvent.click(createBtn!);
+
+    await waitFor(() => expect(mockPost).toHaveBeenCalled());
+    expect(screen.queryByText("Provider credential is required")).toBeNull();
+    const body = mockPost.mock.calls[0][1] as Record<string, unknown>;
+    expect(body.llm_provider).toBe("platform");
+    // The provisioner-injected MOLECULE_LLM_USAGE_TOKEN must NOT be clobbered.
+    expect(body.secrets).toBeUndefined();
+  });
+});
+
 // ---------------------------------------------------------------------------
 // budget_limit field tests (#541)
 // ---------------------------------------------------------------------------
@@ -535,3 +740,70 @@ describe("CreateWorkspaceDialog — budget_limit field", () => {
    expect(budgetInput.value).toBe("");
  });
 });
+
+describe("CreateWorkspaceDialog — platform-managed credential suppression (#2245)", () => {
+  describe("isPlatformManagedProvider", () => {
+    it("is true for the platform proxy vendor", () => {
+      expect(isPlatformManagedProvider({ vendor: "platform" })).toBe(true);
+    });
+    it("is true for a registry billingMode of platform_managed", () => {
+      expect(
+        isPlatformManagedProvider({ vendor: "minimax", billingMode: "platform_managed" }),
+      ).toBe(true);
+    });
+    it("is false for a BYOK provider", () => {
+      expect(isPlatformManagedProvider({ vendor: "anthropic", billingMode: "byok" })).toBe(false);
+      expect(isPlatformManagedProvider({ vendor: "minimax" })).toBe(false);
+    });
+    it("is false for null/undefined", () => {
+      expect(isPlatformManagedProvider(null)).toBe(false);
+      expect(isPlatformManagedProvider(undefined)).toBe(false);
+    });
+  });
+
+  it("platform-managed provider with a declared auth env requires NO credential, hides the key field, and sends NO secret", async () => {
+    await openDialog();
+    await setTemplate("platform-managed-test");
+    fireEvent.change(screen.getByPlaceholderText("e.g. SEO Agent"), {
+      target: { value: "Platform Agent" },
+    });
+
+    // The credential input must NOT render for platform-managed; a "no key
+    // required" note appears instead.
+    await waitFor(() =>
+      expect(screen.getByText("Platform-managed — no API key required.")).toBeTruthy(),
+    );
+    expect(screen.queryByLabelText("MOLECULE_LLM_USAGE_TOKEN")).toBeNull();
+
+    const createBtn = screen.getAllByRole("button").find((b) => b.textContent === "Create");
+    fireEvent.click(createBtn!);
+
+    await waitFor(() => expect(mockPost).toHaveBeenCalled());
+    // No validation error, and the provisioner-injected token is NOT clobbered
+    // by an empty secret.
+    expect(screen.queryByText("Provider credential is required")).toBeNull();
+    const body = mockPost.mock.calls[0][1] as Record<string, unknown>;
+    expect(body.llm_provider).toBe("platform");
+    expect(body.secrets).toBeUndefined();
+  });
+
+  it("BYOK provider still requires a credential and renders the key field (no-regression)", async () => {
+    await openDialog();
+    await setTemplate("byok-only-test");
+    fireEvent.change(screen.getByPlaceholderText("e.g. SEO Agent"), {
+      target: { value: "BYOK Agent" },
+    });
+
+    // The credential field IS rendered for BYOK...
+    await waitFor(() => expect(screen.getByLabelText("OPENAI_API_KEY")).toBeTruthy());
+
+    const createBtn = screen.getAllByRole("button").find((b) => b.textContent === "Create");
+    fireEvent.click(createBtn!);
+
+    // ...and create stays blocked until it's filled.
+    await waitFor(() =>
+      expect(screen.getByText("Provider credential is required")).toBeTruthy(),
+    );
+    expect(mockPost).not.toHaveBeenCalled();
+  });
+});
@@ -1,6 +1,6 @@
 "use client";

-import { useEffect, useRef, useState } from "react";
+import { useCallback, useEffect, useRef, useState } from "react";
 import { api } from "@/lib/api";
 import type RFB from "@novnc/novnc";

@@ -33,6 +33,11 @@ export function DisplayTab({ workspaceId }: Props) {
  const [controlBusy, setControlBusy] = useState(false);
  const [sessionUrl, setSessionUrl] = useState<string | null>(null);
  const requestGeneration = useRef(0);
+  // Freshest signed session URL (token bound to the lease's expires_at). The
+  // renewal timer keeps this current WITHOUT swapping the live stream's
+  // sessionUrl (which would needlessly reconnect the desktop); the stream uses
+  // it only when it has to reconnect after an unclean drop.
+  const latestSessionUrlRef = useRef<string | null>(null);

  useEffect(() => {
    const generation = requestGeneration.current + 1;
@@ -41,6 +46,7 @@ export function DisplayTab({ workspaceId }: Props) {
    setStatus(null);
    setControl(null);
    setSessionUrl(null);
+    latestSessionUrlRef.current = null;
    setError(null);
    setControlError(null);
    setControlBusy(false);
@@ -69,6 +75,41 @@ export function DisplayTab({ workspaceId }: Props) {
    };
  }, [workspaceId]);

+  // Acquire (or re-acquire) the display-control lease as the current holder.
+  // Re-acquiring extends the 300s server-side lock AND returns a freshly-signed
+  // session URL (token bound to the new expires_at). Used both to renew the
+  // lease on a timer and to mint a non-stale token for each reconnect — a
+  // cached URL can be past its ~300s expiry, which would make a reconnect 401.
+  const reacquireSession = useCallback(async (): Promise<string | null> => {
+    const generation = requestGeneration.current;
+    try {
+      const next = await api.post<DisplayControlStatus>(
+        `/workspaces/${workspaceId}/display/control/acquire`,
+        { controller: "user", ttl_seconds: 300 },
+      );
+      if (requestGeneration.current !== generation) return null;
+      setControl(next);
+      if (next.session_url) latestSessionUrlRef.current = next.session_url;
+      return next.session_url ?? null;
+    } catch {
+      // Transient failure, or another holder took over: the live stream keeps
+      // running on its existing connection; a reconnect re-evaluates control.
+      return null;
+    }
+  }, [workspaceId]);
+
+  // Renew the lease while we hold it. The lock is a 300s lease with no
+  // server-side auto-renewal, so without this the control (and the session
+  // token) silently expire mid-session — the user appears "kicked" every ~5
+  // minutes. We renew well inside the TTL and do not touch the live stream.
+  useEffect(() => {
+    if (!sessionUrl) return;
+    const timer = setInterval(() => {
+      void reacquireSession();
+    }, 120_000);
+    return () => clearInterval(timer);
+  }, [sessionUrl, reacquireSession]);
+
  const acquireControl = async () => {
    const generation = requestGeneration.current;
    const controlPath = `/workspaces/${workspaceId}/display/control`;
@@ -82,6 +123,7 @@ export function DisplayTab({ workspaceId }: Props) {
      if (requestGeneration.current !== generation) return;
      setControl(next);
      setSessionUrl(next.session_url || null);
+      latestSessionUrlRef.current = next.session_url || null;
    } catch (err) {
      if (requestGeneration.current !== generation) return;
      setControlError("Failed to take control");
@@ -108,6 +150,7 @@ export function DisplayTab({ workspaceId }: Props) {
      if (requestGeneration.current !== generation) return;
      setControl(next);
      setSessionUrl(null);
+      latestSessionUrlRef.current = null;
    } catch (err) {
      if (requestGeneration.current !== generation) return;
      setControlError("Failed to release control");
@@ -235,7 +278,11 @@ export function DisplayTab({ workspaceId }: Props) {
        />
      </div>
      {sessionUrl ? (
-        <DesktopStream sessionUrl={sessionUrl} />
+        <DesktopStream
+          sessionUrl={sessionUrl}
+          latestSessionUrlRef={latestSessionUrlRef}
+          reacquireSession={reacquireSession}
+        />
      ) : (
        <div className="flex flex-1 items-center justify-center p-8 text-center">
          <div>
@@ -311,7 +358,15 @@ function DisplayControlBar({
  );
 }

-function DesktopStream({ sessionUrl }: { sessionUrl: string }) {
+function DesktopStream({
+  sessionUrl,
+  latestSessionUrlRef,
+  reacquireSession,
+}: {
+  sessionUrl: string;
+  latestSessionUrlRef: { current: string | null };
+  reacquireSession: () => Promise<string | null>;
+}) {
  const containerRef = useRef<HTMLDivElement | null>(null);
  const rfbRef = useRef<RFB | null>(null);
  const [streamError, setStreamError] = useState<string | null>(null);
@@ -329,20 +384,37 @@ function DesktopStream({ sessionUrl }: { sessionUrl: string }) {
      clipboardTimer = setTimeout(() => setClipboardStatus(null), 2500);
    };

-    async function connect() {
+    let attempts = 0;
+    let retryTimer: ReturnType<typeof setTimeout> | null = null;
+    const maxAttempts = 10;
+
+    async function connect(reacquire = false) {
      setStreamError(null);
      try {
+        // On a reconnect, mint a fresh lease + token first — the original token
+        // is only ~300s, so a cached URL can be expired and would 401. The
+        // initial connect already holds a fresh token from acquireControl.
+        if (reacquire) await reacquireSession();
        const mod = await import("@novnc/novnc");
        if (cancelled || !containerRef.current) return;
-        const stream = displayWebSocketConnection(sessionUrl);
+        const stream = displayWebSocketConnection(latestSessionUrlRef.current || sessionUrl);
        rfb = new mod.default(containerRef.current, stream.url, {
          wsProtocols: ["binary", `molecule-display-token.${stream.token}`],
        });
        rfbRef.current = rfb;
        rfb.scaleViewport = true;
-        rfb.resizeSession = true;
+        // Do NOT request a server-side resize: the workspace display runs a
+        // fixed Xorg modeline and x11vnc rejects SetDesktopSize ("Resize is
+        // administratively prohibited"), which spams the console on every
+        // (re)connect. scaleViewport already fits the fixed framebuffer to the
+        // container client-side, so we don't need the server to resize.
+        rfb.resizeSession = false;
        rfb.focusOnClick = true;
        rfb.focus({ preventScroll: true });
+        rfb.addEventListener("connect", () => {
+          attempts = 0;
+          if (!cancelled) setStreamError(null);
+        });
        rfb.addEventListener("clipboard", (event: Event) => {
          const text = (event as CustomEvent<{ text?: string }>).detail?.text ?? "";
          if (!text) return;
@@ -353,7 +425,20 @@ function DesktopStream({ sessionUrl }: { sessionUrl: string }) {
        });
        rfb.addEventListener("disconnect", (event: Event) => {
          const detail = (event as CustomEvent<{ clean?: boolean }>).detail;
-          if (!cancelled && !detail?.clean) setStreamError("Desktop stream disconnected.");
+          rfbRef.current = null;
+          if (cancelled || detail?.clean) return;
+          // Auto-reconnect after an unclean drop (idle/network blip, brief
+          // agent hiccup); bounded backoff so a genuinely-dead session still
+          // surfaces an error instead of looping forever.
+          if (attempts < maxAttempts) {
+            attempts += 1;
+            setStreamError(`Reconnecting to desktop… (attempt ${attempts})`);
+            retryTimer = setTimeout(() => {
+              if (!cancelled) void connect(true);
+            }, Math.min(1000 * attempts, 5000));
+          } else {
+            setStreamError("Desktop stream disconnected.");
+          }
        });
      } catch {
        if (!cancelled) setStreamError("Desktop stream could not be opened.");
@@ -363,11 +448,12 @@ function DesktopStream({ sessionUrl }: { sessionUrl: string }) {
    connect();
    return () => {
      cancelled = true;
+      if (retryTimer) clearTimeout(retryTimer);
      if (clipboardTimer) clearTimeout(clipboardTimer);
      rfbRef.current = null;
      rfb?.disconnect();
    };
-  }, [sessionUrl]);
+  }, [sessionUrl, reacquireSession, latestSessionUrlRef]);

  useEffect(() => {
    const onPaste = (event: ClipboardEvent) => {
@@ -2,12 +2,13 @@
 import { describe, it, expect, vi, beforeEach } from "vitest";
 import { cleanup, fireEvent, render, screen, waitFor } from "@testing-library/react";

-const { mockGet, mockPost, mockRFBConstructor, mockRFBClipboardPasteFrom, mockRFBFocus } = vi.hoisted(() => ({
+const { mockGet, mockPost, mockRFBConstructor, mockRFBClipboardPasteFrom, mockRFBFocus, rfbInstances } = vi.hoisted(() => ({
  mockGet: vi.fn(),
  mockPost: vi.fn(),
  mockRFBConstructor: vi.fn(),
  mockRFBClipboardPasteFrom: vi.fn(),
  mockRFBFocus: vi.fn(),
+  rfbInstances: [] as EventTarget[],
 }));

 vi.mock("@/lib/api", () => ({
@@ -31,6 +32,7 @@ vi.mock("@novnc/novnc", () => ({
      this.url = url;
      this.options = options;
      mockRFBConstructor(target, url, options);
+      rfbInstances.push(this);
    }
    clipboardPasteFrom(text: string) {
      mockRFBClipboardPasteFrom(text);
@@ -52,6 +54,7 @@ describe("DisplayTab", () => {
    mockRFBConstructor.mockReset();
    mockRFBClipboardPasteFrom.mockReset();
    mockRFBFocus.mockReset();
+    rfbInstances.length = 0;
  });

  it("renders unavailable state for non-display workspaces", async () => {
@@ -400,6 +403,62 @@ describe("DisplayTab", () => {
    });
    expect(screen.getByRole("button", { name: "Take control" })).toBeTruthy();
  });
+
+  it("auto-reconnects the desktop stream after an unclean disconnect but not a clean one", async () => {
+    mockGet
+      .mockResolvedValueOnce({
+        available: true,
+        mode: "desktop-control",
+        protocol: "novnc",
+        width: 1920,
+        height: 1080,
+      })
+      .mockResolvedValueOnce({ controller: "none" });
+    // Initial acquire returns token "signed"; the reconnect re-acquire mints a
+    // FRESH token "signed2" (the lock/token is only ~300s — reconnecting with a
+    // cached, possibly-expired token would 401 and never recover).
+    mockPost
+      .mockResolvedValueOnce({
+        controller: "user",
+        controlled_by: "admin-token",
+        expires_at: "2026-05-23T08:48:27Z",
+        session_url: "/workspaces/ws-display/display/session/websockify#token=signed",
+      })
+      .mockResolvedValue({
+        controller: "user",
+        controlled_by: "admin-token",
+        expires_at: "2026-05-23T08:53:27Z",
+        session_url: "/workspaces/ws-display/display/session/websockify#token=signed2",
+      });
+
+    render(<DisplayTab workspaceId="ws-display" />);
+    await waitFor(() => {
+      expect(screen.getByRole("button", { name: "Take control" })).toBeTruthy();
+    });
+    fireEvent.click(screen.getByRole("button", { name: "Take control" }));
+    await waitFor(() => {
+      expect(rfbInstances.length).toBe(1);
+    });
+    expect(mockRFBConstructor.mock.calls[0][2].wsProtocols).toContain("molecule-display-token.signed");
+
+    // An idle/network drop closes the websocket uncleanly. The client must
+    // re-acquire a fresh token and reconnect instead of giving up — this is the
+    // "disconnects every ~5 min and stays dead" report.
+    rfbInstances[0].dispatchEvent(new CustomEvent("disconnect", { detail: { clean: false } }));
+    await waitFor(
+      () => {
+        expect(rfbInstances.length).toBe(2);
+      },
+      { timeout: 3000 },
+    );
+    // Reconnect dialed with the FRESH token, not the stale original.
+    expect(mockRFBConstructor.mock.calls[1][2].wsProtocols).toContain("molecule-display-token.signed2");
+
+    // A clean disconnect (the user released control) must NOT reconnect.
+    rfbInstances[1].dispatchEvent(new CustomEvent("disconnect", { detail: { clean: true } }));
+    await new Promise((resolve) => setTimeout(resolve, 1100));
+    expect(rfbInstances.length).toBe(2);
+  });
 });

 function deferred<T>() {
@@ -159,15 +159,28 @@ services:

  # --- Canvas ---
  canvas:
-    # The publish-canvas-image CI workflow pushes a fresh image to GHCR on
-    # every canvas/** merge to main. To update the running container:
-    #   docker compose pull canvas && docker compose up -d canvas
-    # First-time local setup or testing unreleased changes — build from source:
-    #   docker compose build canvas && docker compose up -d canvas
+    # The publish-canvas-image CI workflow runs an ORDERED deploy (core#2226):
+    # build → push :staging-<sha> + :staging-latest → (after green main CI)
+    # re-point :latest to the verified :staging-<sha> by digest. So both tags
+    # below resolve to a CI-green, reproducible build, never a raw/red one.
+    #
+    # Reproducible deploy: pin CANVAS_IMAGE_TAG to the immutable per-commit tag
+    # the ordered deploy produced, e.g.
+    #   CANVAS_IMAGE_TAG=staging-<sha> docker compose pull canvas && docker compose up -d canvas
+    # This makes a tenant/host deploy reproducible (resolves the standing
+    # `TODO: pin canvas ECR image digest`). Unset it and the default `latest`
+    # is the prod-blessed tag the ordered deploy keeps pointed at the last
+    # green build — still deterministic vs. the old raw `:latest`.
+    #
+    # To pin by content digest instead of tag (fully immutable):
+    #   aws ecr describe-images --repository-name molecule-ai/canvas \
+    #     --image-tags staging-<sha> --region us-east-2 \
+    #     --query 'imageDetails[0].imageDigest' --output text
+    # then set CANVAS_IMAGE_TAG=staging-<sha>@<digest> (compose passes it through).
+    #
    # Note: ECR images require AWS auth — `aws ecr get-login-password --region us-east-2 | docker login --username AWS --password-stdin 153263036946.dkr.ecr.us-east-2.amazonaws.com` before pull.
-    # Digest-pin requires: aws ecr describe-images --repository-name molecule-ai/canvas --image-tags latest --query 'imageDetails[0].imageDigest'
-    # TODO: pin canvas ECR image digest once AWS creds are available in CI.
-    image: 153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/canvas:latest
+    # Local dev keeps working via the `build:` context below (docker compose build canvas).
+    image: 153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/canvas:${CANVAS_IMAGE_TAG:-latest}
    build:
      context: ./canvas
      dockerfile: Dockerfile
@@ -175,6 +188,10 @@ services:
        NEXT_PUBLIC_PLATFORM_URL: ${NEXT_PUBLIC_PLATFORM_URL:-http://localhost:${PLATFORM_PUBLISH_PORT:-8080}}
        NEXT_PUBLIC_WS_URL: ${NEXT_PUBLIC_WS_URL:-ws://localhost:${PLATFORM_PUBLISH_PORT:-8080}/ws}
        NEXT_PUBLIC_ADMIN_TOKEN: ${ADMIN_TOKEN:-}
+        # SHA surfaced at /api/buildinfo (core#2235). CI passes the real merge
+        # SHA via the publish-canvas-image workflow build-args; local compose
+        # builds default to "dev" (the route's unwired sentinel).
+        BUILD_SHA: ${BUILD_SHA:-dev}
    depends_on:
      platform:
        condition: service_healthy
@@ -28,14 +28,10 @@
    {"name": "claude-code-default", "repo": "molecule-ai/molecule-ai-workspace-template-claude-code", "ref": "main"},
    {"name": "hermes", "repo": "molecule-ai/molecule-ai-workspace-template-hermes", "ref": "main"},
    {"name": "openclaw", "repo": "molecule-ai/molecule-ai-workspace-template-openclaw", "ref": "main"},
-    {"name": "codex", "repo": "molecule-ai/molecule-ai-workspace-template-codex", "ref": "main"},
-    {"name": "google-adk", "repo": "molecule-ai/molecule-ai-workspace-template-google-adk", "ref": "main"},
-    {"name": "seo-agent", "repo": "molecule-ai/molecule-ai-workspace-template-seo-agent", "ref": "main"}
+    {"name": "codex", "repo": "molecule-ai/molecule-ai-workspace-template-codex", "ref": "main"}
  ],
  "org_templates": [
    {"name": "molecule-dev", "repo": "molecule-ai/molecule-ai-org-template-molecule-dev", "ref": "main"},
-    {"name": "free-beats-all", "repo": "molecule-ai/molecule-ai-org-template-free-beats-all", "ref": "main"},
-    {"name": "medo-smoke", "repo": "molecule-ai/molecule-ai-org-template-medo-smoke", "ref": "main"},
    {"name": "molecule-worker-gemini", "repo": "molecule-ai/molecule-ai-org-template-molecule-worker-gemini", "ref": "main"},
    {"name": "ux-ab-lab", "repo": "molecule-ai/molecule-ai-org-template-ux-ab-lab", "ref": "main"}
  ]
@@ -0,0 +1,131 @@
+# Developer SOP — PR review gate auto-fire and stale-head handling
+
+> Last updated: 2026-06-03 (cp#2159 follow-up)
+>
+> Applies to: all core-PR authors and reviewers on `molecule-core` and sibling
+> repos using the `qa-review` + `security-review` branch-protection gates.
+
+---
+
+## 1. Gitea PR-head workflow-selection rule
+
+**Rule:** For `pull_request_target` and `pull_request_review` events, Gitea
+loads the workflow definition from the **PR's HEAD branch**, not from the
+base (`main`) branch.
+
+This is different from GitHub Actions, where `pull_request_target` always
+loads workflows from the base branch. Gitea's behaviour means:
+
+- A PR that was opened **before** the `pull_request_review` trigger was added
+to `qa-review.yml` / `security-review.yml` will **NOT** auto-fire on review,
+because its HEAD still contains the old workflow YAML (no trigger).
+
+- A PR that was opened **after** the trigger was added (or that has been
+rebased onto a commit containing the trigger) **WILL** auto-fire, because its
+HEAD contains the new workflow YAML.
+
+### Ops implication
+
+| PR head contains `pull_request_review` trigger? | Behaviour on APPROVED review |
+|---|---|
+| **Yes** (cut from current main, or rebased) | Workflows auto-queue, evaluate, and POST the `(pull_request_target)` context automatically. No slash-command needed. |
+| **No** (stale head, opened before #2157) | Nothing fires. Use `/qa-recheck` + `/security-recheck` slash-commands in a PR comment, OR rebase onto current main. |
+
+---
+
+## 2. Standard core-PR flow (post-#2157)
+
+```
+1. Author opens PR from a branch based on current main
+   → qa-review + security-review workflows run on pull_request_target
+   → status contexts post (initial eval, usually red until reviews land)
+
+2. Reviewers submit real APPROVED reviews
+   → If PR head has the trigger: workflows AUTO-FIRE on pull_request_review
+   → Contexts flip green (or stay red if reviewer is not in team)
+
+3. [Optional] If contexts did not flip (stale head, event lost, etc.):
+   → Anyone can comment `/qa-recheck` or `/security-recheck`
+   → sop-checklist.yml refires the evaluator (read-only, idempotent)
+
+4. Both qa-review + security-review contexts are green
+   → Plain Do:merge (no force-merge needed)
+```
+
+### Key point
+
+The `/qa-recheck` and `/security-recheck` commands are a **backstop**, not the
+primary path. PRs cut from current main should auto-fire without manual
+intervention.
+
+---
+
+## 3. Diagnosing a stale head
+
+If a PR has real team-member APPROVED reviews but the qa/security contexts
+remain red and no workflow run appears on the PR's "Actions" tab for the
+review event, the PR head is likely stale.
+
+### Quick check
+
+```bash
+# From the PR page, look at the head commit SHA, then:
+curl -sS "https://git.moleculesai.app/api/v1/repos/molecule-ai/molecule-core/contents/.gitea/workflows/qa-review.yml?ref=<HEAD_SHA>" \
+  | jq -r '.content' | base64 -d | grep -c 'pull_request_review'
+# 0  → stale head (no trigger in that version of the workflow)
+# >0 → trigger present; auto-fire SHOULD work (if it didn't, file a tracker)
+```
+
+### Automated diagnostic
+
+The test suite includes `test_gate_stale_head_diagnostic.py`, which reports
+"auto-fire impossible for this PR" when the head lacks the trigger. Run it
+in CI or locally with:
+
+```bash
+PR_NUMBER=123 python -m pytest .gitea/scripts/tests/test_gate_stale_head_diagnostic.py -v
+```
+
+---
+
+## 4. Rebasing vs. slash-refire
+
+| Approach | When to use | Trade-off |
+|---|---|---|
+| **Rebase onto current main** | PR is genuinely stale (head lacks trigger OR head is far behind main) | Clean history, gets all recent fixes, but requires force-push and re-approval if the branch was protected |
+| **`/qa-recheck` + `/security-recheck`** | PR head is recent but the review event was missed, or you want to avoid rebase churn | Quick, no force-push, but does NOT fix a missing trigger in the head |
+
+**Do not** use slash-refire as a substitute for rebasing a stale head. If the
+workflow YAML in the PR head does not contain `pull_request_review`, no amount
+of rechecking will make auto-fire work.
+
+---
+
+## 5. Live-fire verification
+
+The `test_gate_auto_fire_live.py` regression test exercises the full runtime
+path: it submits an APPROVED review to a test PR and polls for the
+`(pull_request_target)` status contexts. It is skipped when no API token is
+available, and is intended to catch runtime non-fire that static structural
+tests (e.g. `test_gate_review_auto_fire.py`) cannot detect.
+
+Run manually with:
+
+```bash
+export GITEA_HOST=git.moleculesai.app
+export GITEA_TOKEN=<your-token>
+export REPO=molecule-ai/molecule-core
+export LIVEFIRE_PR_NUMBER=<test-pr-number>
+python -m pytest .gitea/scripts/tests/test_gate_auto_fire_live.py -v
+```
+
+---
+
+## References
+
+- #2159 — gate auto-trigger not firing (root cause: stale PR heads lacking
+the `pull_request_review` trigger, NOT a workflow code defect)
+- #765 — static structural regression test for gate configuration
+- #2157 — merged trigger addition (`pull_request_review` types: [submitted])
+- #2020 — milestone confirming gate infrastructure is stable
+- RFC#324 — qa-review + security-review design
@@ -0,0 +1,76 @@
+#!/usr/bin/env bash
+# check-manifest-repos-exist.sh — fail-fast guard: verify every repo listed in
+# manifest.json actually exists on Gitea before the expensive clone step runs.
+#
+# WHY: deleting an org-template/workspace-template repo that is still listed in
+# manifest.json breaks clone-manifest.sh with a generic git 404 error. The
+# failure is deep in the publish-workspace-server-image workflow and looks like
+# a transient network issue, wasting debug time. This script surfaces the
+# problem immediately with a per-entry ::error:: annotation naming the missing
+# repo (issue #2192).
+#
+# Usage:
+#   ./scripts/check-manifest-repos-exist.sh <manifest.json>
+#
+# Exit:
+#   0  all repos exist
+#   1  one or more repos 404 (printed to stderr)
+#   2  bad usage / missing inputs
+
+set -euo pipefail
+
+MANIFEST="${1:-manifest.json}"
+GITEA_API="${GITEA_API:-https://git.moleculesai.app/api/v1/repos}"
+
+if [ ! -f "$MANIFEST" ]; then
+    echo "::error::manifest not found: $MANIFEST" >&2
+    exit 2
+fi
+
+# Strip JSON5-style // comments before parsing (same as clone-manifest.sh)
+_strip_comments() {
+    sed 's/^[[:space:]]*\/\/.*//' "$MANIFEST"
+}
+
+MANIFEST_JSON="$(_strip_comments)"
+
+MISSING=0
+TOTAL=0
+
+# Categories to check — must match clone-manifest.sh categories
+check_category() {
+    local category="$1"
+    local count
+    count=$(echo "$MANIFEST_JSON" | jq -r ".${category} | length")
+
+    local i=0
+    while [ "$i" -lt "$count" ]; do
+        local name repo
+        name=$(echo "$MANIFEST_JSON" | jq -r ".${category}[$i].name")
+        repo=$(echo "$MANIFEST_JSON" | jq -r ".${category}[$i].repo")
+        TOTAL=$((TOTAL + 1))
+
+        # Check repo existence via Gitea API (public endpoint, no auth needed)
+        http_code=$(curl -sS -o /dev/null -w '%{http_code}' --max-time 10 "${GITEA_API}/${repo}" 2>/dev/null || true)
+
+        if [ "$http_code" != "200" ]; then
+            echo "::error::manifest.json ${category} entry '${name}' → repo '${repo}' returned HTTP ${http_code} (expected 200). Delete the manifest entry BEFORE deleting the repo." >&2
+            MISSING=$((MISSING + 1))
+        fi
+
+        i=$((i + 1))
+    done
+}
+
+echo "==> Checking manifest repo existence against ${GITEA_API} ..."
+check_category "plugins"
+check_category "workspace_templates"
+check_category "org_templates"
+
+if [ "$MISSING" -gt 0 ]; then
+    echo "::error::${MISSING}/${TOTAL} manifest entries are missing — fix manifest.json before publishing." >&2
+    exit 1
+fi
+
+echo "✓ All ${TOTAL} manifest entries resolved (HTTP 200)."
+exit 0
@@ -11,7 +11,10 @@
 #                                    default + 401, see PR #1714.)
 #
 #   claude-code → auth-aware:
-#                  E2E_MINIMAX_API_KEY    → "MiniMax-M2"
+#                  E2E_MINIMAX_API_KEY    → "minimax:MiniMax-M2.7"
+#                                           (colon-namespaced BYOK id; bare
+#                                            "MiniMax-M2" 400s on a deploy-skewed
+#                                            staging registry — #2263)
 #                  E2E_ANTHROPIC_API_KEY  → "claude-sonnet-4-6"
 #                  otherwise              → "sonnet"
 #
@@ -23,28 +26,76 @@
 #                  their provider entries, otherwise the workspace boots
 #                  reachable but the first A2A call hits the wrong auth path.
 #
-# When E2E_MODEL_SLUG is set, it overrides this dispatch — useful when an
-# operator dispatches the workflow to test a specific slug.
+# PLATFORM-MANAGED path (E2E_LLM_PATH=platform) — the moonshot/kimi
+# NOT_CONFIGURED regression (RFC#340 Fix A #2187):
+#
+#   The branches above all exercise BYOK: a tenant key (MINIMAX/ANTHROPIC/
+#   OPENAI) is injected as a workspace secret and the model id resolves to that
+#   vendor's *BYOK* provider entry. That path NEVER exercises the platform arm —
+#   the exact arm that booted "moonshot/kimi-k2.6" into NOT_CONFIGURED in prod,
+#   because the generated config.yaml lacked the derived `provider: platform`.
+#
+#   E2E_LLM_PATH=platform selects a platform-managed model id (slash-namespaced,
+#   no tenant key — Molecule owns billing via the CP LLM proxy). The default is
+#   "moonshot/kimi-k2.6", the headline incident combo. Override the specific
+#   platform model with E2E_MODEL_SLUG. The provision branch in
+#   test_staging_full_saas.sh sends NO secrets for this path (platform-managed
+#   needs none), so the workspace must boot online purely on the proxy env the
+#   control plane injects + the manifest-derived `provider: platform` that Fix A
+#   stamps. That is the REAL boot-path assertion the deterministic unit test
+#   (workspace_provision_platform_boot_test.go) cannot make.
+#
+# When E2E_MODEL_SLUG is set, it overrides this dispatch entirely — useful when
+# an operator dispatches the workflow to test a specific slug (or a specific
+# platform model id).
 #
 # Unit tested by tests/e2e/test_model_slug.sh — every branch must stay
 # pinned because regressions silently mask as "Could not resolve
 # authentication method" + the synth-E2E gate goes red without naming
 # the slug-format mismatch.

+# Default platform-managed model for the platform-boot regression path. The
+# exact id that booted NOT_CONFIGURED in prod. Must stay a member of the
+# claude-code `platform` arm in workspace-server/internal/providers/providers.yaml
+# (the deterministic suite TestEnsureDefaultConfig_StampsProviderForEverySSOTPlatformModel
+# enforces every member of that arm derives provider=platform). Resolved INSIDE
+# pick_model_slug via ${E2E_DEFAULT_PLATFORM_MODEL:-...} so callers can override
+# it (or unset it) without tripping `set -u`.
+E2E_DEFAULT_PLATFORM_MODEL_FALLBACK="moonshot/kimi-k2.6"
+
 # Usage: pick_model_slug <runtime>
 #   stdout: the slug string
 #   E2E_MODEL_SLUG (env): if set + non-empty, used as-is (operator override)
+#   E2E_LLM_PATH=platform (env): select the platform-managed model id
+#     (E2E_DEFAULT_PLATFORM_MODEL) instead of a BYOK slug. Takes precedence over
+#     the per-key BYOK branches; E2E_MODEL_SLUG still wins over everything.
 pick_model_slug() {
  local runtime="${1:-}"
  if [ -n "${E2E_MODEL_SLUG:-}" ]; then
    printf '%s' "$E2E_MODEL_SLUG"
    return 0
  fi
+  # Platform-managed path: the slash-namespaced platform model, no tenant key.
+  # Exercises the arm the moonshot/kimi NOT_CONFIGURED bug shipped on.
+  if [ "${E2E_LLM_PATH:-}" = "platform" ]; then
+    printf '%s' "${E2E_DEFAULT_PLATFORM_MODEL:-$E2E_DEFAULT_PLATFORM_MODEL_FALLBACK}"
+    return 0
+  fi
  case "$runtime" in
    hermes)      printf 'openai/gpt-4o' ;;
    claude-code)
      if [ -n "${E2E_MINIMAX_API_KEY:-}" ]; then
-        printf 'MiniMax-M2'
+        # Namespaced (colon) BYOK id, not bare "MiniMax-M2" (#2263 deploy-skew):
+        # bare ids can lag the deployed staging ws-server's compiled registry,
+        # so workspace-create's validateRegisteredModelForRuntime 400s the bare
+        # form on an older image. The colon-namespaced `minimax:MiniMax-M2.7`
+        # resolves the same way the proven-working sibling `moonshot/kimi-k2.6`
+        # does. It stays in the BYOK `minimax` arm (providers.yaml:851), so
+        # DeriveProvider -> provider_selection=minimax (BYOK) and the #1994
+        # byok-not-platform guard (test_staging_full_saas.sh:1000) still passes —
+        # unlike the slash/platform form `minimax/MiniMax-M2.7`, which resolves
+        # to provider=platform and would trip that guard.
+        printf 'minimax:MiniMax-M2.7'
      elif [ -n "${E2E_ANTHROPIC_API_KEY:-}" ]; then
        printf 'claude-sonnet-4-6'
      else
@@ -0,0 +1,332 @@
+#!/usr/bin/env bash
+set -uo pipefail
+#
+# test_keyless_feature_contracts_e2e.sh — REQUIRED-lane (E2E API Smoke Test)
+# keyless HTTP-contract coverage for feature endpoints that ship WITHOUT an
+# LLM key and had NO e2e assertion before (coverage-audit gap list).
+#
+# Why a NEW script (not added to test_api.sh): PR #2286 is concurrently
+# rewriting test_api.sh's auth helpers + _lib.sh (e2e_admin_auth_args) and the
+# test_priority_runtimes mock arm. Keeping these assertions in a standalone
+# file avoids a merge conflict with that in-flight PR and keeps the new feature
+# coverage independently reviewable. The mock-runtime A2A canned round-trip is
+# OWNED by #2286's `mock` arm (run_mock) — intentionally NOT duplicated here.
+#
+# Every endpoint below is exercised against a runtime=external workspace so NO
+# LLM key is needed. For each we assert the real HTTP contract: the happy path
+# AND a meaningful failure mode (401 without auth, 400 on bad input, or the
+# documented fail-closed status) so the test catches REAL regressions, not
+# just 200s.
+#
+# Auth model (matches workspace-server/internal/middleware/wsauth_middleware.go):
+#   * WorkspaceAuth (/workspaces/:id/*) is STRICT once a token exists — a
+#     bearer-less request 401s (devmode fail-open needs MOLECULE_ENV=dev AND
+#     ADMIN_TOKEN unset, neither of which the e2e-api job sets).
+#   * AdminAuth routes accept the platform ADMIN_TOKEN (post-#2286) OR, when no
+#     ADMIN_TOKEN is configured, any valid workspace bearer (Tier-3 fallback) —
+#     so the workspace token we mint authenticates admin routes in BOTH the
+#     pre-#2286 (no ADMIN_TOKEN) and post-#2286 (ADMIN_TOKEN set) CI shapes.
+#
+# Local-run shape (mirrors the e2e-api job — real PG+Redis+platform):
+#   DATABASE_URL=... REDIS_URL=... ADMIN_TOKEN=... ./platform-server &
+#   BASE=http://127.0.0.1:$PORT bash tests/e2e/test_keyless_feature_contracts_e2e.sh
+
+source "$(dirname "$0")/_lib.sh"  # sets BASE default
+
+PASS=0
+FAIL=0
+
+pass() { echo "PASS: $1"; PASS=$((PASS + 1)); }
+fail() { echo "FAIL: $1"; echo "  $2"; FAIL=$((FAIL + 1)); }
+
+# assert_contains DESC EXPECTED_SUBSTRING ACTUAL
+assert_contains() {
+  if printf '%s' "$3" | grep -qF "$2"; then
+    pass "$1"
+  else
+    fail "$1" "expected to contain [$2] — got: $3"
+  fi
+}
+
+# http_code METHOD URL [curl-args...] → prints the HTTP status code only.
+http_code() {
+  local method="$1" url="$2"; shift 2
+  curl -s -o /dev/null -w "%{http_code}" -X "$method" "$url" "$@"
+}
+
+# body_and_code METHOD URL [curl-args...] → prints "<body>\n<code>".
+body_and_code() {
+  local method="$1" url="$2"; shift 2
+  curl -s -w $'\n%{http_code}' -X "$method" "$url" "$@"
+}
+
+echo "=== Keyless feature HTTP-contract E2E (required lane) ==="
+echo ""
+
+# Platform admin bearer when the job set one (#2286 shape). When ADMIN_TOKEN is
+# configured, AdminAuth's Tier-1 fail-open is OFF even before the first token
+# exists, so admin-gated create / list / delete must carry it from the start.
+# Pre-#2286 (no ADMIN_TOKEN) this is empty → fail-open create works bare.
+ENV_ADMIN="${MOLECULE_ADMIN_TOKEN:-${ADMIN_TOKEN:-}}"
+ENV_ADMIN_AUTH=()
+[ -n "$ENV_ADMIN" ] && ENV_ADMIN_AUTH=(-H "Authorization: Bearer $ENV_ADMIN")
+
+# Reproducible counts across reruns. e2e_cleanup_all_workspaces hits the
+# admin-gated list/delete; the platform admin bearer (if set) goes via the
+# MOLECULE_ADMIN_TOKEN/ADMIN_TOKEN env the helper already reads.
+e2e_cleanup_all_workspaces
+
+# ---------------------------------------------------------------------------
+# Fixture: one external workspace, registered → online. Keyless (external=true
+# means no container is provisioned and no LLM key is consulted).
+# ---------------------------------------------------------------------------
+R=$(curl -s -X POST "$BASE/workspaces" -H "Content-Type: application/json" \
+  ${ENV_ADMIN_AUTH[@]+"${ENV_ADMIN_AUTH[@]}"} \
+  -d '{"name":"Keyless Fixture","tier":1,"runtime":"external","external":true}')
+WS_ID=$(printf '%s' "$R" | python3 -c "import sys,json; print(json.load(sys.stdin).get('id',''))" 2>/dev/null || echo "")
+if [ -z "$WS_ID" ]; then
+  echo "FATAL: could not create fixture workspace — got: $R" >&2
+  exit 2
+fi
+assert_contains "POST /workspaces (external fixture created)" '"status":"awaiting_agent"' "$R"
+
+# Workspace token: register returns one; else mint via the admin endpoint.
+WS_TOKEN=$(printf '%s' "$R" | e2e_extract_token)
+if [ -z "$WS_TOKEN" ]; then
+  WS_TOKEN=$(e2e_mint_workspace_token "$WS_ID" 2>/dev/null || echo "")
+fi
+if [ -z "$WS_TOKEN" ]; then
+  echo "FATAL: could not obtain workspace token for $WS_ID" >&2
+  exit 2
+fi
+AUTH=(-H "Authorization: Bearer $WS_TOKEN")
+
+# Admin bearer: explicit platform ADMIN_TOKEN if the job set one (#2286 shape),
+# else the workspace token (AdminAuth Tier-3 accepts it pre-#2286).
+ADMIN_BEARER="${ENV_ADMIN:-$WS_TOKEN}"
+ADMIN_AUTH=(-H "Authorization: Bearer $ADMIN_BEARER")
+
+# Bring the fixture online so lifecycle (hibernate) has a hibernatable state.
+curl -s -X POST "$BASE/registry/register" -H "Content-Type: application/json" "${AUTH[@]}" \
+  -d "{\"id\":\"$WS_ID\",\"url\":\"https://example.com/keyless\",\"agent_card\":{\"name\":\"Keyless Fixture\",\"skills\":[{\"id\":\"noop\",\"name\":\"Noop\"}]}}" >/dev/null
+
+# ===========================================================================
+# 1. Terminal diagnose — GET /workspaces/:id/terminal/diagnose (wsAuth)
+#    External workspace has no instance_id → diagnoseLocal path → 200 with a
+#    deterministic report (ok=false, first_failure on docker/container). The
+#    /terminal endpoint itself is a WebSocket upgrade (not HTTP-assertable
+#    keyless); diagnose is its pure-HTTP sibling and the real contract surface.
+# ===========================================================================
+echo "--- /terminal/diagnose ---"
+BC=$(body_and_code GET "$BASE/workspaces/$WS_ID/terminal/diagnose" "${AUTH[@]}")
+DIAG_CODE=$(printf '%s' "$BC" | tail -n1)
+DIAG_BODY=$(printf '%s' "$BC" | sed '$d')
+assert_contains "GET /terminal/diagnose (200 report)" "200" "$DIAG_CODE"
+assert_contains "GET /terminal/diagnose (carries workspace_id)" "\"workspace_id\":\"$WS_ID\"" "$DIAG_BODY"
+assert_contains "GET /terminal/diagnose (has steps[])" '"steps"' "$DIAG_BODY"
+# Failure mode: no bearer → 401 (WorkspaceAuth strict once a token exists).
+assert_contains "GET /terminal/diagnose (no auth → 401)" "401" \
+  "$(http_code GET "$BASE/workspaces/$WS_ID/terminal/diagnose")"
+
+# ===========================================================================
+# 2. Webhooks (public) — POST /webhooks/:type
+#    Public, no auth. telegram adapter: empty update body → (nil,nil) → 200
+#    ignored; non-JSON → parse error → 400; unknown type → 404.
+# ===========================================================================
+echo "--- /webhooks/:type ---"
+BC=$(body_and_code POST "$BASE/webhooks/telegram" -H "Content-Type: application/json" -d '{}')
+WH_CODE=$(printf '%s' "$BC" | tail -n1)
+WH_BODY=$(printf '%s' "$BC" | sed '$d')
+assert_contains "POST /webhooks/telegram (non-message update → 200)" "200" "$WH_CODE"
+assert_contains "POST /webhooks/telegram (status ignored)" '"status":"ignored"' "$WH_BODY"
+assert_contains "POST /webhooks/telegram (bad JSON → 400)" "400" \
+  "$(http_code POST "$BASE/webhooks/telegram" -H 'Content-Type: application/json' -d 'not-json')"
+assert_contains "POST /webhooks/<unknown> (→ 404)" "404" \
+  "$(http_code POST "$BASE/webhooks/nope-not-a-channel" -H 'Content-Type: application/json' -d '{}')"
+
+# ===========================================================================
+# 3. Budget — GET /workspaces/:id/budget (wsAuth) + PATCH (admin)
+#    GET: fresh workspace → multi-period view, no limits, zero spend.
+#    PATCH: set monthly limit (admin) → reflected; bad input → 400.
+# ===========================================================================
+echo "--- /budget ---"
+BUD=$(curl -s "$BASE/workspaces/$WS_ID/budget" "${AUTH[@]}")
+assert_contains "GET /budget (has periods map)" '"periods"' "$BUD"
+assert_contains "GET /budget (monthly_spend 0 on fresh ws)" '"monthly_spend":0' "$BUD"
+# PATCH is admin-gated (router.go:419). Set a monthly limit and verify echo.
+PB=$(curl -s -X PATCH "$BASE/workspaces/$WS_ID/budget" -H "Content-Type: application/json" "${ADMIN_AUTH[@]}" \
+  -d '{"budget_limits":{"monthly":2000}}')
+assert_contains "PATCH /budget (monthly limit set → echoed)" '"budget_limit":2000' "$PB"
+# Re-read confirms persistence.
+assert_contains "GET /budget (limit persisted)" '"budget_limit":2000' \
+  "$(curl -s "$BASE/workspaces/$WS_ID/budget" "${AUTH[@]}")"
+# Failure: empty body → 400 "budget_limits or budget_limit field is required".
+assert_contains "PATCH /budget (empty body → 400)" "400" \
+  "$(http_code PATCH "$BASE/workspaces/$WS_ID/budget" -H 'Content-Type: application/json' "${ADMIN_AUTH[@]}" -d '{}')"
+# Failure: unknown period → 400.
+assert_contains "PATCH /budget (unknown period → 400)" "400" \
+  "$(http_code PATCH "$BASE/workspaces/$WS_ID/budget" -H 'Content-Type: application/json' "${ADMIN_AUTH[@]}" -d '{"budget_limits":{"yearly":1}}')"
+# Failure: GET without bearer → 401.
+assert_contains "GET /budget (no auth → 401)" "401" "$(http_code GET "$BASE/workspaces/$WS_ID/budget")"
+
+# ===========================================================================
+# 4. Checkpoints — POST/GET/DELETE /workspaces/:id/checkpoints* (wsAuth)
+#    Fully self-contained CRUD over workflow_checkpoints (#788). Upsert → latest
+#    → list-by-wfid → delete → 404. Failure modes: missing workflow_id → 400,
+#    empty latest → 404.
+# ===========================================================================
+echo "--- /checkpoints ---"
+WFID="kl-wf-$$"
+CP=$(curl -s -X POST "$BASE/workspaces/$WS_ID/checkpoints" -H "Content-Type: application/json" "${AUTH[@]}" \
+  -d "{\"workflow_id\":\"$WFID\",\"step_name\":\"step-a\",\"step_index\":1,\"payload\":{\"k\":\"v\"}}")
+assert_contains "POST /checkpoints (upsert → id + workflow_id)" "\"workflow_id\":\"$WFID\"" "$CP"
+assert_contains "GET /checkpoints/latest (200 newest)" "\"workflow_id\":\"$WFID\"" \
+  "$(curl -s "$BASE/workspaces/$WS_ID/checkpoints/latest" "${AUTH[@]}")"
+assert_contains "GET /checkpoints/:wfid (lists the step)" '"step_name":"step-a"' \
+  "$(curl -s "$BASE/workspaces/$WS_ID/checkpoints/$WFID" "${AUTH[@]}")"
+DEL=$(curl -s -X DELETE "$BASE/workspaces/$WS_ID/checkpoints/$WFID" "${AUTH[@]}")
+assert_contains "DELETE /checkpoints/:wfid (deleted count)" '"deleted":1' "$DEL"
+assert_contains "GET /checkpoints/:wfid (after delete → 404)" "404" \
+  "$(http_code GET "$BASE/workspaces/$WS_ID/checkpoints/$WFID" "${AUTH[@]}")"
+# Failure: missing workflow_id → 400 (binding:required).
+assert_contains "POST /checkpoints (missing workflow_id → 400)" "400" \
+  "$(http_code POST "$BASE/workspaces/$WS_ID/checkpoints" -H 'Content-Type: application/json' "${AUTH[@]}" -d '{"step_name":"x"}')"
+# Failure: no bearer → 401.
+assert_contains "POST /checkpoints (no auth → 401)" "401" \
+  "$(http_code POST "$BASE/workspaces/$WS_ID/checkpoints" -H 'Content-Type: application/json' -d '{"workflow_id":"x","step_name":"y"}')"
+
+# ===========================================================================
+# 5. Audit — GET /workspaces/:id/audit (wsAuth)
+#    EU AI Act ledger query (#594). Fresh ws → empty events, total 0,
+#    chain_valid null (AUDIT_LEDGER_SALT unset). Failure: bad RFC3339 from → 400.
+# ===========================================================================
+echo "--- /audit ---"
+AUD=$(curl -s "$BASE/workspaces/$WS_ID/audit" "${AUTH[@]}")
+assert_contains "GET /audit (total 0 on fresh ws)" '"total":0' "$AUD"
+assert_contains "GET /audit (chain_valid null without salt)" '"chain_valid":null' "$AUD"
+assert_contains "GET /audit (bad 'from' → 400)" "400" \
+  "$(http_code GET "$BASE/workspaces/$WS_ID/audit?from=not-a-date" "${AUTH[@]}")"
+assert_contains "GET /audit (no auth → 401)" "401" "$(http_code GET "$BASE/workspaces/$WS_ID/audit")"
+
+# ===========================================================================
+# 6. Traces — GET /workspaces/:id/traces (wsAuth)
+#    Langfuse proxy (#590). No LANGFUSE_* configured → 200 [] (graceful empty),
+#    never a 5xx. Failure: no auth → 401.
+# ===========================================================================
+echo "--- /traces ---"
+BC=$(body_and_code GET "$BASE/workspaces/$WS_ID/traces" "${AUTH[@]}")
+TR_CODE=$(printf '%s' "$BC" | tail -n1)
+TR_BODY=$(printf '%s' "$BC" | sed '$d')
+assert_contains "GET /traces (200 without Langfuse)" "200" "$TR_CODE"
+assert_contains "GET /traces (empty list)" '[]' "$TR_BODY"
+assert_contains "GET /traces (no auth → 401)" "401" "$(http_code GET "$BASE/workspaces/$WS_ID/traces")"
+
+# ===========================================================================
+# 7. Session search — GET /workspaces/:id/session-search (wsAuth)
+#    Searches activity_logs. Seed one activity row, then assert q-filter finds
+#    it and a non-matching q returns []. Failure: no auth → 401.
+# ===========================================================================
+echo "--- /session-search ---"
+curl -s -X POST "$BASE/workspaces/$WS_ID/activity" -H "Content-Type: application/json" "${AUTH[@]}" \
+  -d '{"activity_type":"agent_log","method":"inference","summary":"keyless-needle marker"}' >/dev/null
+assert_contains "GET /session-search?q=keyless-needle (finds row)" 'keyless-needle' \
+  "$(curl -s "$BASE/workspaces/$WS_ID/session-search?q=keyless-needle" "${AUTH[@]}")"
+assert_contains "GET /session-search?q=<no-match> (empty)" '[]' \
+  "$(curl -s "$BASE/workspaces/$WS_ID/session-search?q=zzz-no-such-token-zzz" "${AUTH[@]}")"
+assert_contains "GET /session-search (no auth → 401)" "401" \
+  "$(http_code GET "$BASE/workspaces/$WS_ID/session-search?q=x")"
+
+# ===========================================================================
+# 8. Rescue — GET /workspaces/:id/rescue (wsAuth)
+#    RFC internal#742. Fail-CLOSED contract: the e2e-api job has no
+#    MOLECULE_ORG_ID, so the handler returns 503 platform_misconfigured rather
+#    than leaking cross-org. That fail-closed behaviour IS the keyless contract
+#    we gate here (a regression that drops the org guard would flip this to a
+#    200/404 and turn this assertion RED). Failure mode: no auth → 401.
+# ===========================================================================
+echo "--- /rescue ---"
+BC=$(body_and_code GET "$BASE/workspaces/$WS_ID/rescue" "${AUTH[@]}")
+RES_CODE=$(printf '%s' "$BC" | tail -n1)
+RES_BODY=$(printf '%s' "$BC" | sed '$d')
+if [ "$RES_CODE" = "404" ]; then
+  # MOLECULE_ORG_ID was set in this environment → no-bundle path.
+  assert_contains "GET /rescue (no bundle → 404, org configured)" 'no rescue bundle' "$RES_BODY"
+else
+  # No MOLECULE_ORG_ID (the e2e-api default) → fail-closed 503.
+  assert_contains "GET /rescue (fail-closed 503 without MOLECULE_ORG_ID)" "503" "$RES_CODE"
+  assert_contains "GET /rescue (platform_misconfigured code)" 'platform_misconfigured' "$RES_BODY"
+fi
+assert_contains "GET /rescue (no auth → 401)" "401" "$(http_code GET "$BASE/workspaces/$WS_ID/rescue")"
+
+# ===========================================================================
+# 9. LLM billing-mode admin toggle — GET/PUT /admin/workspaces/:id/llm-billing-mode
+#    (AdminAuth). Flip to byok → read back override; bad UUID → 400; missing
+#    'mode' key → 400; unknown mode → 400.
+# ===========================================================================
+echo "--- /admin/workspaces/:id/llm-billing-mode ---"
+assert_contains "GET llm-billing-mode (resolves a mode)" '"resolved_mode"' \
+  "$(curl -s "$BASE/admin/workspaces/$WS_ID/llm-billing-mode" "${ADMIN_AUTH[@]}")"
+PUTBM=$(curl -s -X PUT "$BASE/admin/workspaces/$WS_ID/llm-billing-mode" -H "Content-Type: application/json" "${ADMIN_AUTH[@]}" \
+  -d '{"mode":"byok"}')
+assert_contains "PUT llm-billing-mode byok (override set)" '"workspace_override":"byok"' "$PUTBM"
+assert_contains "GET llm-billing-mode (byok persisted)" '"workspace_override":"byok"' \
+  "$(curl -s "$BASE/admin/workspaces/$WS_ID/llm-billing-mode" "${ADMIN_AUTH[@]}")"
+# Clear the override (null) so we don't leave fixture state skewed.
+curl -s -X PUT "$BASE/admin/workspaces/$WS_ID/llm-billing-mode" -H "Content-Type: application/json" "${ADMIN_AUTH[@]}" \
+  -d '{"mode":null}' >/dev/null
+# Failure: malformed UUID → 400.
+assert_contains "PUT llm-billing-mode (bad UUID → 400)" "400" \
+  "$(http_code PUT "$BASE/admin/workspaces/not-a-uuid/llm-billing-mode" -H 'Content-Type: application/json' "${ADMIN_AUTH[@]}" -d '{"mode":"byok"}')"
+# Failure: missing 'mode' key → 400.
+assert_contains "PUT llm-billing-mode (missing mode → 400)" "400" \
+  "$(http_code PUT "$BASE/admin/workspaces/$WS_ID/llm-billing-mode" -H 'Content-Type: application/json' "${ADMIN_AUTH[@]}" -d '{}')"
+# Failure: unknown mode string → 400.
+assert_contains "PUT llm-billing-mode (unknown mode → 400)" "400" \
+  "$(http_code PUT "$BASE/admin/workspaces/$WS_ID/llm-billing-mode" -H 'Content-Type: application/json' "${ADMIN_AUTH[@]}" -d '{"mode":"bogus-mode"}')"
+
+# ===========================================================================
+# 10. Lifecycle — Pause → Resume + Hibernate (wsAuth)
+#     Pause works backend-agnostically (StopWorkspaceAuto no-ops on no backend)
+#     → status=paused. Resume re-provisions: 200 provisioning when a provisioner
+#     is wired (the e2e-api host has Docker), or 503 provisioner-not-available
+#     otherwise — both are valid contracts, so accept either. Failure modes:
+#     resume a non-paused ws → 404; hibernate a non-online ws → 404.
+# ===========================================================================
+echo "--- lifecycle (resume / hibernate) ---"
+# Pause the (online) fixture → status paused.
+PA=$(curl -s -X POST "$BASE/workspaces/$WS_ID/pause" "${AUTH[@]}")
+assert_contains "POST /pause (online → paused)" '"status":"paused"' "$PA"
+# Resume the paused fixture — accept 200 provisioning OR 503 (no provisioner).
+BC=$(body_and_code POST "$BASE/workspaces/$WS_ID/resume" "${AUTH[@]}")
+RSM_CODE=$(printf '%s' "$BC" | tail -n1)
+RSM_BODY=$(printf '%s' "$BC" | sed '$d')
+if [ "$RSM_CODE" = "200" ]; then
+  assert_contains "POST /resume (paused → provisioning)" '"status":"provisioning"' "$RSM_BODY"
+elif [ "$RSM_CODE" = "503" ]; then
+  assert_contains "POST /resume (no provisioner → 503 contract)" 'provisioner not available' "$RSM_BODY"
+else
+  fail "POST /resume (expected 200 or 503)" "got HTTP $RSM_CODE — $RSM_BODY"
+fi
+# Failure: resume a workspace that is NOT paused → 404.
+# (After the resume above it is provisioning/online, not paused.)
+assert_contains "POST /resume (not-paused → 404)" "404" \
+  "$(http_code POST "$BASE/workspaces/$WS_ID/resume" "${AUTH[@]}")"
+# Hibernate: bring the fixture back online first, then hibernate it.
+curl -s -X POST "$BASE/registry/register" -H "Content-Type: application/json" "${AUTH[@]}" \
+  -d "{\"id\":\"$WS_ID\",\"url\":\"https://example.com/keyless\",\"agent_card\":{\"name\":\"Keyless Fixture\",\"skills\":[{\"id\":\"noop\",\"name\":\"Noop\"}]}}" >/dev/null
+HB=$(curl -s -X POST "$BASE/workspaces/$WS_ID/hibernate" "${AUTH[@]}")
+assert_contains "POST /hibernate (online → hibernated)" '"status":"hibernated"' "$HB"
+# Failure: hibernate again (now hibernated, not online/degraded) → 404.
+assert_contains "POST /hibernate (not-hibernatable → 404)" "404" \
+  "$(http_code POST "$BASE/workspaces/$WS_ID/hibernate" "${AUTH[@]}")"
+# Failure: no bearer → 401.
+assert_contains "POST /resume (no auth → 401)" "401" "$(http_code POST "$BASE/workspaces/$WS_ID/resume")"
+
+# ---------------------------------------------------------------------------
+# Cleanup — delete the fixture (admin-gated DELETE + per-workspace bearer).
+# ---------------------------------------------------------------------------
+e2e_delete_workspace "$WS_ID" "Keyless Fixture" "${ADMIN_AUTH[@]}"
+
+echo ""
+echo "=== Results: $PASS passed, $FAIL failed ==="
+[ "$FAIL" -eq 0 ]
@@ -49,13 +49,13 @@ run_test "codex → slash-form fallback"                             codex
 run_test "claude-code → OAuth/default alias"                      claude-code "sonnet"

 got=$(unset E2E_MODEL_SLUG E2E_ANTHROPIC_API_KEY; E2E_MINIMAX_API_KEY="mx-test" pick_model_slug claude-code)
-assert_eq "claude-code + MiniMax key → MiniMax model"             "$got" "MiniMax-M2"
+assert_eq "claude-code + MiniMax key → MiniMax model"             "$got" "minimax:MiniMax-M2.7"

 got=$(unset E2E_MODEL_SLUG E2E_MINIMAX_API_KEY; E2E_ANTHROPIC_API_KEY="sk-ant-test" pick_model_slug claude-code)
 assert_eq "claude-code + Anthropic API key → Anthropic API model" "$got" "claude-sonnet-4-6"

 got=$(unset E2E_MODEL_SLUG; E2E_MINIMAX_API_KEY="mx-priority" E2E_ANTHROPIC_API_KEY="sk-ant-loser" pick_model_slug claude-code)
-assert_eq "claude-code + both keys → MiniMax priority"            "$got" "MiniMax-M2"
+assert_eq "claude-code + both keys → MiniMax priority"            "$got" "minimax:MiniMax-M2.7"

 # ── Fallback for unknown runtime ──
 # Picks slash-form (hermes-shaped) since hermes is the historical
@@ -65,6 +65,28 @@ assert_eq "claude-code + both keys → MiniMax priority"            "$got" "Mini
 run_test "unknown runtime → slash-form fallback"                   gemini      "openai/gpt-4o"
 run_test "empty runtime → slash-form fallback"                     ""          "openai/gpt-4o"

+# ── Platform-managed path (E2E_LLM_PATH=platform) ──
+# The moonshot/kimi NOT_CONFIGURED regression path (RFC#340 Fix A #2187).
+# Selects the slash-namespaced platform model (default moonshot/kimi-k2.6),
+# takes precedence over the per-key BYOK branches, and is itself overridden by
+# E2E_MODEL_SLUG. These pins guard the harness's ability to drive the platform
+# arm — the one the prod bug shipped on.
+echo
+echo "Test: pick_model_slug — platform-managed path (E2E_LLM_PATH=platform)"
+echo
+
+got=$(unset E2E_MODEL_SLUG E2E_DEFAULT_PLATFORM_MODEL; E2E_LLM_PATH=platform pick_model_slug claude-code)
+assert_eq "claude-code + platform path → headline kimi model"      "$got" "moonshot/kimi-k2.6"
+
+got=$(unset E2E_MODEL_SLUG E2E_DEFAULT_PLATFORM_MODEL; E2E_LLM_PATH=platform E2E_MINIMAX_API_KEY="mx-stray" pick_model_slug claude-code)
+assert_eq "platform path beats a stray BYOK key (no mask)"         "$got" "moonshot/kimi-k2.6"
+
+got=$(unset E2E_MODEL_SLUG; E2E_LLM_PATH=platform E2E_DEFAULT_PLATFORM_MODEL="minimax/MiniMax-M3" pick_model_slug claude-code)
+assert_eq "platform path honours E2E_DEFAULT_PLATFORM_MODEL"        "$got" "minimax/MiniMax-M3"
+
+got=$(unset E2E_DEFAULT_PLATFORM_MODEL; E2E_MODEL_SLUG="anthropic/claude-opus-4-7" E2E_LLM_PATH=platform pick_model_slug claude-code)
+assert_eq "E2E_MODEL_SLUG still wins over platform path"            "$got" "anthropic/claude-opus-4-7"
+
 # ── Override via E2E_MODEL_SLUG ──
 # When the operator sets E2E_MODEL_SLUG, the per-runtime dispatch is
 # bypassed. Used during workflow_dispatch to A/B specific slugs.
@@ -234,9 +234,30 @@ elif [ -n "${E2E_OPENAI_API_KEY:-}" ]; then
  SECRETS_JSON=$(python3 -c "import json,os;k=os.environ['E2E_OPENAI_API_KEY'];print(json.dumps({'OPENAI_API_KEY':k,'OPENAI_BASE_URL':'https://api.openai.com/v1','MODEL_PROVIDER':'openai:gpt-4o','HERMES_INFERENCE_PROVIDER':'custom','HERMES_CUSTOM_BASE_URL':'https://api.openai.com/v1','HERMES_CUSTOM_API_KEY':k,'HERMES_CUSTOM_API_MODE':'chat_completions'}))")
 fi

+# Workspace-create now enforces the MODEL_REQUIRED contract: there is NO
+# platform-side default model for a runtime (feedback_workspace_model_required_
+# no_platform_default). Every create MUST carry an explicit `model`, or the CP
+# rejects it with MODEL_REQUIRED before this gate's peer-visibility assertion
+# can run. We pick a PLATFORM-MANAGED id (Molecule owns billing — no tenant key
+# needed; this gate only needs the workspace to boot + list peers, not heavy
+# LLM work), validated against the controlplane providers SSOT
+# (internal/providers/providers.yaml runtimes.<rt>.providers[platform].models):
+#   claude-code → anthropic/claude-sonnet-4-6   (platform claude model)
+#   hermes/openclaw → moonshot/kimi-k2.6         (their only platform family)
+# E2E_MODEL_SLUG overrides for operator-dispatched runs.
+pv_platform_model_for_runtime() {
+  if [ -n "${E2E_MODEL_SLUG:-}" ]; then printf '%s' "$E2E_MODEL_SLUG"; return 0; fi
+  case "$1" in
+    claude-code) printf 'anthropic/claude-sonnet-4-6' ;;
+    hermes|openclaw) printf 'moonshot/kimi-k2.6' ;;
+    *) printf 'moonshot/kimi-k2.6' ;;
+  esac
+}
+
 log "4/6 provisioning parent (claude-code) + one sibling per runtime under test..."
+PARENT_MODEL=$(pv_platform_model_for_runtime claude-code)
 P_RESP=$(tenant_call POST /workspaces \
-  -d "{\"name\":\"pv-parent\",\"runtime\":\"claude-code\",\"tier\":3,\"secrets\":$SECRETS_JSON}")
+  -d "{\"name\":\"pv-parent\",\"runtime\":\"claude-code\",\"model\":\"$PARENT_MODEL\",\"tier\":3,\"secrets\":$SECRETS_JSON}")
 PARENT_ID=$(echo "$P_RESP" | python3 -c "import sys,json; print(json.load(sys.stdin).get('id',''))" 2>/dev/null)
 [ -n "$PARENT_ID" ] || fail "parent create failed: $(echo "$P_RESP" | head -c 300)"
 log "    PARENT_ID=$PARENT_ID"
@@ -245,8 +266,9 @@ log "    PARENT_ID=$PARENT_ID"
 declare -A WS_IDS WS_TOKENS
 ALL_WS_IDS="$PARENT_ID"
 for rt in $PV_RUNTIMES; do
+  RT_MODEL=$(pv_platform_model_for_runtime "$rt")
  R=$(tenant_call POST /workspaces \
-    -d "{\"name\":\"pv-$rt\",\"runtime\":\"$rt\",\"tier\":2,\"parent_id\":\"$PARENT_ID\",\"secrets\":$SECRETS_JSON}")
+    -d "{\"name\":\"pv-$rt\",\"runtime\":\"$rt\",\"model\":\"$RT_MODEL\",\"tier\":2,\"parent_id\":\"$PARENT_ID\",\"secrets\":$SECRETS_JSON}")
  WID=$(echo "$R" | python3 -c "import sys,json; print(json.load(sys.stdin).get('id',''))" 2>/dev/null)
  WTOK=$(echo "$R" | extract_auth_token)
  [ -n "$WID" ] || fail "$rt workspace create failed: $(printf '%s' "$R" | head -c 300)"
@@ -300,7 +300,14 @@ rows = json.load(sys.stdin)
 def text_of(r):
    body = r.get('request_body') or {}
    parts = (body.get('params') or {}).get('message', {}).get('parts') or []
-    return ''.join(p.get('text','') for p in parts if p.get('type')=='text')
+    # A2A v0.3 keys the Part discriminator on 'kind'; legacy senders used
+    # 'type'. ProxyA2A.normalizeA2APayload (#2251) rewrites 'type' -> 'kind'
+    # on ingest, so the stored request_body carries 'kind' even when the
+    # caller posted 'type'. Accept EITHER so this parser asserts on the text
+    # payload, not on which discriminator field the server happened to store.
+    def is_text(p):
+        return p.get('kind') == 'text' or p.get('type') == 'text'
+    return ''.join(p.get('text', '') for p in parts if is_text(p))
 if len(rows) < 2:
    print('NEED2_GOT_'+str(len(rows)))
 else:
@@ -309,6 +316,29 @@ else:
 check_eq "since_id feed orders ASC (oldest-new first, newest-new last)" \
  "hello-from-e2e-2|hello-from-e2e-3" "$ASC_FIRST"

+# Wire-contract gate (#2251): the caller posted parts with the LEGACY "type"
+# discriminator, but ProxyA2A.normalizeA2APayload rewrites "type" -> "kind"
+# (A2A v0.3) BEFORE the row is durably logged. Assert the stored request_body
+# carries "kind" and no longer carries "type", so a regression that drops the
+# rename — or a feed that stops storing the normalized body — fails loudly here
+# instead of silently feeding the polling agent an untagged Part. This is the
+# end-to-end half of the Go unit tests in a2a_proxy_test.go (which assert the
+# rename in isolation); this proves it survives the durable activity_logs path.
+DISC=$(echo "$ASC_RESP" | python3 -c "
+import json, sys
+rows = json.load(sys.stdin)
+kinds, types = [], []
+for r in rows:
+    body = r.get('request_body') or {}
+    parts = (body.get('params') or {}).get('message', {}).get('parts') or []
+    for p in parts:
+        if 'kind' in p: kinds.append(p['kind'])
+        if 'type' in p: types.append(p['type'])
+print(('kind' if kinds and not types else 'BAD') + ':' + ','.join(kinds) + '/' + ','.join(types))
+")
+check_eq "stored Part uses v0.3 'kind' discriminator, never legacy 'type' (#2251)" \
+  "kind:text,text/" "$DISC"
+
 # ---------- Phase 6: stale cursor returns 410 ----------
 echo ""
 echo "--- Phase 6: Stale / unknown cursor returns 410 ---"
@@ -0,0 +1,535 @@
+#!/usr/bin/env bash
+# Live staging E2E — the CP instance-state reconciler heals a terminated EC2.
+#
+# Real-infra complement to the deterministic unit tests for core#2261
+# (workspace-server/internal/registry/cp_instance_reconciler.go). Those unit
+# tests pin the reconcile logic against fakes; THIS script proves the loop
+# actually runs in a real tenant's workspace-server and drives the EXISTING
+# offline + auto-heal machinery against real AWS.
+#
+# Root regression (core#2247): a SaaS workspace whose EC2 is terminated out
+# from under the platform (manual AWS action, spot reclaim, CP reap) fell
+# through every existing liveness pass and kept reading status='online'
+# forever, pointing at a dead instance. The reconciler closes that gap with
+# CPProvisioner.IsRunning and feeds a clean "not running" into onOffline →
+# RestartByID (existing-volume reprovision).
+#
+# What this test does:
+#   1. Provision a fresh staging org + ONE workspace (same default
+#      runtime/model as the full-saas harness, so it actually boots).
+#   2. Poll the tenant API until the workspace is status=online; capture its
+#      instance_id.
+#   3. KILL it — terminate that exact EC2 via `aws ec2 terminate-instances`.
+#   4. Assert the reconciler heals it:
+#        PRIMARY (gate)      — within ~180s the workspace status LEAVES
+#                              'online' (the reconciler detected the dead
+#                              instance via IsRunning and flipped it). This
+#                              is the core regression guard: a dead instance
+#                              must NOT keep reading 'online'.
+#        SECONDARY (best-effort) — within ~10 min it auto-reprovisions:
+#                              status returns to 'online' with a NEW
+#                              instance_id (onOffline → RestartByID
+#                              existing-volume heal). If reprovision doesn't
+#                              finish in the bound we log it clearly but let
+#                              the PRIMARY assertion stand as the gate (see
+#                              the comment at the secondary block — a future
+#                              tightening that promotes this to a hard gate is
+#                              deliberately one edit away).
+#   5. Teardown ALWAYS (EXIT trap): delete the tenant + leak-sweep so no EC2
+#      is orphaned, even on a mid-test failure.
+#
+# Auth model + provisioning conventions are copied verbatim from
+# test_staging_full_saas.sh (single MOLECULE_ADMIN_TOKEN → CP admin; per-
+# tenant admin token + X-Molecule-Org-Id header for tenant API). The kill
+# primitive + leak sweep reuse lib/aws_leak_check.sh.
+#
+# Required env:
+#   MOLECULE_CP_URL        default: https://staging-api.moleculesai.app
+#   MOLECULE_ADMIN_TOKEN   CP admin bearer — Railway staging CP_ADMIN_API_TOKEN
+#
+# Optional env (mirrors the full-saas harness where they overlap):
+#   E2E_RUNTIME                        claude-code (default)
+#   E2E_PROVISION_TIMEOUT_SECS         default 900 (cold EC2 budget)
+#   E2E_WORKSPACE_ONLINE_TIMEOUT_SECS  default 900 (15min). A workspace that
+#                     cannot reach online in 15min is a staging/boot problem,
+#                     not slow cold-boot — fail fast so the trap tears down the
+#                     EC2 instead of hanging ~1h and leaking a running instance
+#                     (observed: run 216031 hung 32min with a live e2e-rec EC2).
+#   E2E_RECONCILE_OFFLINE_TIMEOUT_SECS default 180 (PRIMARY: leave 'online'.
+#                                      Reconciler cadence is 60s — 3 cycles +
+#                                      AWS terminate-visibility slack.)
+#   E2E_REPROVISION_TIMEOUT_SECS       default 600 (SECONDARY: back to online
+#                                      with a NEW instance_id)
+#   E2E_MINIMAX_API_KEY / E2E_ANTHROPIC_API_KEY / E2E_OPENAI_API_KEY
+#                                      LLM key (same priority chain as
+#                                      full-saas; needed so the FIRST boot
+#                                      reaches online). Empty → '{}' (the
+#                                      workspace still boots online; the LLM
+#                                      key only matters for a completion,
+#                                      which this test never makes).
+#   E2E_KEEP_ORG                       1 → skip teardown (debugging only)
+#   E2E_RUN_ID                         Slug suffix; CI: ${GITHUB_RUN_ID}
+#   E2E_AWS_LEAK_CHECK                 auto (default) | required | off
+#   E2E_AWS_TERMINATE_LEAKS            1 → terminate slug-tagged leaked EC2 at
+#                                      teardown
+#
+# Exit codes:
+#   0  happy path (PRIMARY assertion held; SECONDARY logged either way)
+#   1  generic failure (incl. PRIMARY assertion failed = regression)
+#   2  missing required env
+#   3  provisioning timed out
+#   4  teardown left orphan resources
+
+set -euo pipefail
+
+CP_URL="${MOLECULE_CP_URL:-https://staging-api.moleculesai.app}"
+ADMIN_TOKEN="${MOLECULE_ADMIN_TOKEN:?MOLECULE_ADMIN_TOKEN required — Railway staging CP_ADMIN_API_TOKEN}"
+RUNTIME="${E2E_RUNTIME:-claude-code}"
+PROVISION_TIMEOUT_SECS="${E2E_PROVISION_TIMEOUT_SECS:-900}"
+WORKSPACE_ONLINE_TIMEOUT_SECS="${E2E_WORKSPACE_ONLINE_TIMEOUT_SECS:-900}"
+# PRIMARY bound: the reconciler ticks every 60s; it needs one cycle to see
+# the dead instance after AWS makes the terminate visible to DescribeInstances
+# (typically seconds, but can lag). 180s = ~3 cycles + slack.
+RECONCILE_OFFLINE_TIMEOUT_SECS="${E2E_RECONCILE_OFFLINE_TIMEOUT_SECS:-180}"
+# SECONDARY bound: full existing-volume reprovision (new EC2 boot + agent
+# bootstrap) is a multi-minute cold path.
+REPROVISION_TIMEOUT_SECS="${E2E_REPROVISION_TIMEOUT_SECS:-600}"
+RUN_ID_SUFFIX="${E2E_RUN_ID:-$(date +%H%M%S)-$$}"
+
+# Slug MUST start with e2e- so sweep-stale-e2e-orgs.yml reaps any orphan this
+# run leaks (lint_cleanup_traps.sh enforces the e2e-/rt-e2e- prefix for any
+# staging tenant E2E; we honour it here too even though our filename isn't
+# *staging*).
+SLUG="e2e-rec-$(date +%Y%m%d)-${RUN_ID_SUFFIX}"
+SLUG=$(echo "$SLUG" | tr '[:upper:]' '[:lower:]' | tr -cd 'a-z0-9-' | head -c 32)
+
+log()  { echo "[$(date +%H:%M:%S)] $*"; }
+fail() { echo "[$(date +%H:%M:%S)] ❌ $*" >&2; exit 1; }
+ok()   { echo "[$(date +%H:%M:%S)] ✅ $*"; }
+
+# Per-runtime model slug dispatch — shared with the full-saas harness.
+# shellcheck disable=SC1091
+# shellcheck source=lib/model_slug.sh
+source "$(dirname "$0")/lib/model_slug.sh"
+# AWS kill primitive + leak sweep (e2e_aws_region / e2e_ec2_instances_for_slug /
+# e2e_terminate_instances / e2e_verify_no_ec2_leaks_for_slug).
+# shellcheck disable=SC1091
+# shellcheck source=lib/aws_leak_check.sh
+source "$(dirname "$0")/lib/aws_leak_check.sh"
+
+CURL_COMMON=(-sS --fail-with-body --max-time 30)
+
+# ─── cleanup trap ───────────────────────────────────────────────────────
+# Identical teardown contract to test_staging_full_saas.sh: delete the
+# tenant (synchronous GDPR cascade), poll for the org row to disappear, then
+# assert no slug-tagged EC2 survives. A leaked resource at teardown is a CI
+# failure (exit 4). The trap is installed UP-FRONT so a mid-test failure
+# (including a failed PRIMARY assertion) still cleans up.
+CLEANUP_DONE=0
+cleanup_org() {
+  # Capture upstream exit code IMMEDIATELY — must be the first statement in
+  # the trap, before any command (including the CLEANUP_DONE check) clobbers $?.
+  local entry_rc=$?
+
+  if [ "$CLEANUP_DONE" = "1" ]; then return 0; fi
+  CLEANUP_DONE=1
+
+  if [ "${E2E_KEEP_ORG:-0}" = "1" ]; then
+    log "E2E_KEEP_ORG=1 — skipping teardown. Manually delete $SLUG when done."
+    return 0
+  fi
+
+  log "🧹 Tearing down org $SLUG..."
+
+  # 120s curl budget for the synchronous DELETE cascade (EC2 terminate alone
+  # is 30-60s), then poll up to 60s for organizations.status='purged'/gone.
+  if curl "${CURL_COMMON[@]}" --max-time 120 -X DELETE "$CP_URL/cp/admin/tenants/$SLUG" \
+    -H "Authorization: Bearer $ADMIN_TOKEN" \
+    -H "Content-Type: application/json" \
+    -d "{\"confirm\":\"$SLUG\"}" >/dev/null 2>&1; then
+    ok "Teardown request accepted"
+  else
+    log "Teardown returned non-2xx (may already be gone)"
+  fi
+
+  local leak_count=1
+  local elapsed=0
+  while [ "$elapsed" -lt 60 ]; do
+    leak_count=$(curl "${CURL_COMMON[@]}" "$CP_URL/cp/admin/orgs" \
+      -H "Authorization: Bearer $ADMIN_TOKEN" 2>/dev/null \
+      | python3 -c "import json,sys; d=json.load(sys.stdin); print(sum(1 for o in d.get('orgs', []) if o.get('slug')=='$SLUG' and o.get('status') != 'purged'))" \
+      2>/dev/null || echo 1)
+    if [ "$leak_count" = "0" ]; then
+      break
+    fi
+    sleep 5
+    elapsed=$((elapsed + 5))
+  done
+
+  if [ "$leak_count" != "0" ]; then
+    echo "⚠️  LEAK: org $SLUG still present post-teardown after ${elapsed}s (count=$leak_count)" >&2
+    exit 4
+  fi
+  local aws_leak_rc=0
+  e2e_verify_no_ec2_leaks_for_slug "$SLUG" || aws_leak_rc=$?
+  if [ "$aws_leak_rc" != "0" ]; then
+    case "$aws_leak_rc" in
+      2) exit 2 ;;
+      *) exit 4 ;;
+    esac
+  fi
+  ok "Teardown clean — no orphan org or EC2 resources for $SLUG (${elapsed}s)"
+
+  # Normalize unexpected upstream exit codes to 1 — `set -e` propagates the
+  # raw exit code of the failing command (e.g. curl exits 22 under
+  # --fail-with-body), but this script's contract only emits {0,1,2,3,4}.
+  case "$entry_rc" in
+    0|1|2|3|4) ;;
+    *) exit 1 ;;
+  esac
+}
+trap cleanup_org EXIT INT TERM
+
+# ─── 0. Preflight ───────────────────────────────────────────────────────
+log "═══════════════════════════════════════════════════════════════════"
+log " Staging reconciler-heals-terminated-instance E2E (core#2261)"
+log "   CP:                 $CP_URL"
+log "   Slug:               $SLUG"
+log "   Runtime:            $RUNTIME"
+log "   Online timeout:     ${WORKSPACE_ONLINE_TIMEOUT_SECS}s"
+log "   PRIMARY (offline):  ${RECONCILE_OFFLINE_TIMEOUT_SECS}s"
+log "   SECONDARY (reprov): ${REPROVISION_TIMEOUT_SECS}s"
+log "═══════════════════════════════════════════════════════════════════"
+
+log "0/6 Preflight: CP reachable?"
+curl "${CURL_COMMON[@]}" "$CP_URL/health" >/dev/null || fail "CP health check failed"
+ok "CP reachable"
+
+admin_call() {
+  local method="$1"; shift
+  local path="$1"; shift
+  curl "${CURL_COMMON[@]}" -X "$method" "$CP_URL$path" \
+    -H "Authorization: Bearer $ADMIN_TOKEN" \
+    -H "Content-Type: application/json" \
+    "$@"
+}
+
+# ─── 1. Create org ──────────────────────────────────────────────────────
+log "1/6 Creating org $SLUG via /cp/admin/orgs..."
+CREATE_RESP=$(admin_call POST /cp/admin/orgs \
+  -d "{\"slug\":\"$SLUG\",\"name\":\"E2E $SLUG\",\"owner_user_id\":\"e2e-runner:$SLUG\"}")
+echo "$CREATE_RESP" | python3 -m json.tool >/dev/null || fail "Org create returned non-JSON: $CREATE_RESP"
+ORG_ID=$(echo "$CREATE_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('id',''))")
+[ -z "$ORG_ID" ] && fail "Org create response missing 'id': $CREATE_RESP"
+ok "Org created (id=$ORG_ID)"
+
+# ─── 2. Wait for tenant provisioning ────────────────────────────────────
+log "2/6 Waiting for tenant provisioning (up to ${PROVISION_TIMEOUT_SECS}s)..."
+DEADLINE=$(( $(date +%s) + PROVISION_TIMEOUT_SECS ))
+LAST_STATUS=""
+while true; do
+  if [ "$(date +%s)" -gt "$DEADLINE" ]; then
+    fail "Tenant provisioning timed out after ${PROVISION_TIMEOUT_SECS}s (last: $LAST_STATUS)"
+  fi
+  LIST_JSON=$(admin_call GET /cp/admin/orgs 2>/dev/null || echo '{"orgs":[]}')
+  # /cp/admin/orgs exposes 'instance_status' (org_instances.status), NOT 'status'.
+  STATUS=$(echo "$LIST_JSON" | python3 -c "
+import json, sys
+d = json.load(sys.stdin)
+for o in d.get('orgs', []):
+    if o.get('slug') == '$SLUG':
+        print(o.get('instance_status', ''))
+        sys.exit(0)
+print('')
+" 2>/dev/null || echo "")
+  if [ "$STATUS" != "$LAST_STATUS" ]; then
+    log "    status → $STATUS"
+    LAST_STATUS="$STATUS"
+  fi
+  case "$STATUS" in
+    running)  break ;;
+    failed)
+      log "── DIAGNOSTIC BURST (step 2 — tenant provisioning failed) ──"
+      echo "$LIST_JSON" | python3 -c "
+import json, sys
+d = json.load(sys.stdin)
+for o in d.get('orgs', []):
+    if o.get('slug') == '$SLUG':
+        print(json.dumps(o, indent=2))
+        sys.exit(0)
+print('(no org row found for slug=$SLUG — DB drift?)')
+" 2>&1 | sed 's/^/  /'
+      log "── END DIAGNOSTIC ──"
+      # Tenant provisioning failures are a CP-side fault, not a reconciler
+      # regression — exit 3 (provisioning) to keep the signal honest.
+      echo "[$(date +%H:%M:%S)] ❌ Tenant provisioning failed for $SLUG (see diagnostic above)" >&2
+      exit 3
+      ;;
+    *)        sleep 15 ;;
+  esac
+done
+ok "Tenant provisioning complete"
+
+# Derive tenant domain from CP hostname (same logic as the full-saas harness).
+CP_HOST=$(echo "$CP_URL" | sed -E 's#^https?://##; s#/.*$##')
+case "$CP_HOST" in
+  api.*)         DERIVED_DOMAIN="${CP_HOST#api.}" ;;
+  staging-api.*) DERIVED_DOMAIN="staging.${CP_HOST#staging-api.}" ;;
+  *)             DERIVED_DOMAIN="$CP_HOST" ;;
+esac
+TENANT_DOMAIN="${MOLECULE_TENANT_DOMAIN:-$DERIVED_DOMAIN}"
+TENANT_URL="https://$SLUG.$TENANT_DOMAIN"
+log "    TENANT_URL=$TENANT_URL"
+
+# ─── 3. Retrieve per-tenant admin token ────────────────────────────────
+log "3/6 Fetching per-tenant admin token..."
+TENANT_TOKEN_RESP=$(admin_call GET "/cp/admin/orgs/$SLUG/admin-token")
+TENANT_TOKEN=$(echo "$TENANT_TOKEN_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('admin_token',''))" 2>/dev/null || echo "")
+[ -z "$TENANT_TOKEN" ] && fail "Could not retrieve per-tenant admin token for $SLUG"
+ok "Tenant admin token retrieved (len=${#TENANT_TOKEN})"
+
+# Wait for tenant TLS / DNS propagation before any tenant API call.
+log "    Waiting for tenant TLS / DNS propagation..."
+TLS_DEADLINE=$(( $(date +%s) + 15 * 60 ))
+while true; do
+  if curl -sSfk --max-time 5 "$TENANT_URL/health" >/dev/null 2>&1; then
+    break
+  fi
+  if [ "$(date +%s)" -gt "$TLS_DEADLINE" ]; then
+    fail "Tenant URL never responded 2xx on /health within 15m"
+  fi
+  sleep 5
+done
+ok "Tenant reachable at $TENANT_URL"
+
+tenant_call() {
+  local method="$1"; shift
+  local path="$1"; shift
+  # X-Molecule-Org-Id is REQUIRED — the tenant guard 404s anything without it
+  # (it does NOT 403, to hide tenant existence from org scanners).
+  curl "${CURL_COMMON[@]}" -X "$method" "$TENANT_URL$path" \
+    -H "Authorization: Bearer $TENANT_TOKEN" \
+    -H "X-Molecule-Org-Id: $ORG_ID" \
+    "$@"
+}
+
+# Helper: read a single field off GET /workspaces/<id>. Echoes '' on any
+# error so callers can poll without `set -e` aborting on a transient blip.
+ws_field() {
+  local wid="$1"; local field="$2"
+  tenant_call GET "/workspaces/$wid" 2>/dev/null \
+    | python3 -c "import json,sys; print(json.load(sys.stdin).get('$field') or '')" 2>/dev/null \
+    || echo ""
+}
+
+# ─── 4. Provision ONE workspace ─────────────────────────────────────────
+# Same secrets-injection priority chain as the full-saas harness so the
+# FIRST boot reaches online. We never make a completion in this test (the
+# whole exercise is instance-state, not the LLM), so an absent key is
+# tolerable — but wiring the same keys keeps boot behaviour identical to the
+# sibling and avoids a config path that only this test would exercise.
+SECRETS_JSON='{}'
+# Platform-managed path (E2E_LLM_PATH=platform, the DEFAULT for this test):
+# the workspace boots on the CP LLM proxy with NO tenant key, model
+# moonshot/kimi-k2.6 — the exact create combo test_staging_full_saas.sh uses
+# successfully. This test only needs the workspace to reach status=online so
+# it can kill the EC2 and assert the reconciler heals it; it does NOT exercise
+# a real LLM completion, so the platform path is both sufficient and the one
+# proven to create cleanly. (The BYOK key paths below 400'd at create — see
+# the create-failure capture added below — which is why platform is default.)
+if [ "${E2E_LLM_PATH:-platform}" = "platform" ]; then
+  log "    LLM path: PLATFORM-MANAGED (no tenant key; moonshot/kimi-k2.6 via proxy)"
+  SECRETS_JSON='{}'
+elif [ -n "${E2E_MINIMAX_API_KEY:-}" ]; then
+  SECRETS_JSON=$(python3 -c "import json,os; print(json.dumps({'MINIMAX_API_KEY': os.environ['E2E_MINIMAX_API_KEY']}))")
+elif [ -n "${E2E_ANTHROPIC_API_KEY:-}" ]; then
+  SECRETS_JSON=$(python3 -c "import json,os; print(json.dumps({'ANTHROPIC_API_KEY': os.environ['E2E_ANTHROPIC_API_KEY']}))")
+elif [ -n "${E2E_OPENAI_API_KEY:-}" ]; then
+  SECRETS_JSON=$(python3 -c "
+import json, os
+k = os.environ['E2E_OPENAI_API_KEY']
+print(json.dumps({
+    'OPENAI_API_KEY': k,
+    'OPENAI_BASE_URL': 'https://api.openai.com/v1',
+    'MODEL_PROVIDER': 'openai:gpt-4o',
+    'HERMES_INFERENCE_PROVIDER': 'custom',
+    'HERMES_CUSTOM_BASE_URL': 'https://api.openai.com/v1',
+    'HERMES_CUSTOM_API_KEY': k,
+    'HERMES_CUSTOM_API_MODE': 'chat_completions',
+}))
+")
+fi
+
+E2E_LLM_PATH="${E2E_LLM_PATH:-platform}" MODEL_SLUG=$(E2E_LLM_PATH="${E2E_LLM_PATH:-platform}" pick_model_slug "$RUNTIME")
+log "    MODEL_SLUG=$MODEL_SLUG"
+
+log "4/6 Provisioning workspace (runtime=$RUNTIME)..."
+# --fail-with-body makes curl exit non-zero on a 4xx/5xx but STILL writes the
+# response body to stdout; the `|| { ... }` catches that so the body is printed
+# instead of `set -e` aborting the command-substitution silently (the old bug
+# that hid the real HTTP-400 reason). $WS_RESP holds the body either way.
+WS_RESP=$(tenant_call POST /workspaces \
+  -H "Content-Type: application/json" \
+  -d "{\"name\":\"E2E Reconciler\",\"runtime\":\"$RUNTIME\",\"tier\":2,\"model\":\"$MODEL_SLUG\",\"secrets\":$SECRETS_JSON}") || {
+  rc=$?
+  fail "Workspace create failed (curl rc=$rc, model=$MODEL_SLUG). Response body: $WS_RESP"
+}
+WS_ID=$(echo "$WS_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('id',''))" 2>/dev/null)
+[ -z "$WS_ID" ] && fail "Workspace create response missing 'id' (model=$MODEL_SLUG): $WS_RESP"
+log "    WS_ID=$WS_ID"
+
+# Wait for the workspace to reach status=online and capture its instance_id.
+log "    Waiting for workspace to reach status=online (up to $((WORKSPACE_ONLINE_TIMEOUT_SECS/60)) min)..."
+ONLINE_DEADLINE=$(( $(date +%s) + WORKSPACE_ONLINE_TIMEOUT_SECS ))
+ORIGINAL_INSTANCE_ID=""
+ONLINE_SINCE=""
+# Grace before falling back to the AWS workspace tag when the tenant API
+# does not surface instance_id (observed on staging).
+INSTANCE_ID_GRACE_SECS="${E2E_INSTANCE_ID_GRACE_SECS:-45}"
+WS_LAST_STATUS=""
+while true; do
+  if [ "$(date +%s)" -gt "$ONLINE_DEADLINE" ]; then
+    WS_LAST_ERR=$(ws_field "$WS_ID" "last_sample_error")
+    fail "Workspace $WS_ID never reached status=online within ${WORKSPACE_ONLINE_TIMEOUT_SECS}s (last status=$WS_LAST_STATUS, err=$WS_LAST_ERR)"
+  fi
+  WS_STATUS=$(ws_field "$WS_ID" "status")
+  if [ "$WS_STATUS" != "$WS_LAST_STATUS" ]; then
+    log "    $WS_ID → $WS_STATUS"
+    WS_LAST_STATUS="$WS_STATUS"
+  fi
+  if [ "$WS_STATUS" = "online" ]; then
+    [ -z "$ONLINE_SINCE" ] && ONLINE_SINCE=$(date +%s)
+    ORIGINAL_INSTANCE_ID=$(ws_field "$WS_ID" "instance_id")
+    if [ -n "$ORIGINAL_INSTANCE_ID" ]; then
+      break
+    fi
+    # The workspace is online but the tenant API does not surface instance_id
+    # (observed on staging — the DB has it, the API response omits it). After a
+    # short grace, fall back to the AWS workspace-instance tag so the kill step
+    # can proceed. The reconciler reads instance_id from the DB and acts on the
+    # real EC2 regardless of what the API surfaces, so the AWS-tag instance is
+    # the correct kill target. Without this fallback the loop spins to the online
+    # deadline and fails with a misleading "never reached online".
+    if [ $(( $(date +%s) - ONLINE_SINCE )) -ge "$INSTANCE_ID_GRACE_SECS" ]; then
+      # ws-tenant-<slug>-<wsid...> is the workspace EC2 (vs tenant-<slug>).
+      ORIGINAL_INSTANCE_ID=$(e2e_ec2_instances_for_slug "$SLUG" 2>/dev/null \
+        | awk '$2 ~ /^ws-tenant-/ {print $1}' | sort -u | head -1)
+      if [ -n "$ORIGINAL_INSTANCE_ID" ]; then
+        log "    instance_id not surfaced by API after ${INSTANCE_ID_GRACE_SECS}s — using AWS workspace tag: $ORIGINAL_INSTANCE_ID"
+        break
+      fi
+    fi
+    log "    $WS_ID online but instance_id not populated yet — waiting"
+  fi
+  # 'failed' is transient on cold boot (bootstrap-watcher deadline vs heartbeat
+  # recovery, cp#245). Keep polling; only the deadline hard-fails.
+  sleep 10
+done
+ok "Workspace online (instance_id=$ORIGINAL_INSTANCE_ID)"
+
+# ─── 5. Kill the EC2 ────────────────────────────────────────────────────
+# Terminate the EXACT instance the workspace reported. Prefer the captured
+# instance_id (precise — kills only this workspace's box); fall back to the
+# slug-tag describe if the API didn't surface an id (shouldn't happen — we
+# only break out of the online-wait once instance_id is non-empty).
+log "5/6 KILLING the workspace EC2 to simulate an out-of-band termination..."
+if ! e2e_aws_creds_available; then
+  fail "AWS CLI/creds unavailable — cannot terminate the EC2 to exercise the reconciler. Set AWS_ACCESS_KEY_ID/AWS_SECRET_ACCESS_KEY (the CI workflow wires these)."
+fi
+AWS_REGION_RESOLVED=$(e2e_aws_region)
+if [ -n "$ORIGINAL_INSTANCE_ID" ]; then
+  log "    Terminating $ORIGINAL_INSTANCE_ID in $AWS_REGION_RESOLVED (aws ec2 terminate-instances)..."
+  aws ec2 terminate-instances --region "$AWS_REGION_RESOLVED" --instance-ids "$ORIGINAL_INSTANCE_ID" >/dev/null \
+    || fail "aws ec2 terminate-instances failed for $ORIGINAL_INSTANCE_ID"
+  KILLED_IDS="$ORIGINAL_INSTANCE_ID"
+else
+  # Fallback path — find by slug tag and terminate.
+  log "    instance_id was empty — falling back to slug-tag describe ($SLUG)..."
+  ROWS=$(e2e_ec2_instances_for_slug "$SLUG" 2>/dev/null || echo "")
+  KILLED_IDS=$(echo "$ROWS" | awk 'NF {print $1}' | sort -u | tr '\n' ' ')
+  [ -n "$KILLED_IDS" ] || fail "No slug-tagged EC2 found for $SLUG — nothing to terminate"
+  log "    Terminating $KILLED_IDS in $AWS_REGION_RESOLVED..."
+  e2e_terminate_instances "$KILLED_IDS" || fail "terminate-instances failed for $KILLED_IDS"
+fi
+ok "Terminated EC2: $KILLED_IDS — reconciler should now detect the dead instance"
+
+# ─── 6a. PRIMARY assertion — workspace leaves 'online' ─────────────────
+# This is THE regression gate for core#2261/#2247. The reconciler runs every
+# 60s in the tenant's workspace-server; when CPProvisioner.IsRunning returns a
+# clean "not running" for the terminated EC2, onOffline flips the row off
+# 'online'. A dead instance that keeps reading 'online' is exactly the bug.
+log "6a/6 PRIMARY: asserting workspace leaves 'online' within ${RECONCILE_OFFLINE_TIMEOUT_SECS}s (reconciler heal-detection)..."
+OFFLINE_DEADLINE=$(( $(date +%s) + RECONCILE_OFFLINE_TIMEOUT_SECS ))
+LEFT_ONLINE=0
+REC_LAST_STATUS=""
+while true; do
+  if [ "$(date +%s)" -gt "$OFFLINE_DEADLINE" ]; then
+    break
+  fi
+  REC_STATUS=$(ws_field "$WS_ID" "status")
+  if [ "$REC_STATUS" != "$REC_LAST_STATUS" ]; then
+    log "    $WS_ID status → ${REC_STATUS:-<empty>}"
+    REC_LAST_STATUS="$REC_STATUS"
+  fi
+  # Any non-online status (offline/provisioning/awaiting_agent/restarting/…)
+  # proves the reconciler acted. We deliberately don't pin the exact target
+  # status: onOffline flips offline AND kicks RestartByID, so the row may race
+  # straight into a provisioning/restarting state — all of which are "no longer
+  # falsely online".
+  if [ -n "$REC_STATUS" ] && [ "$REC_STATUS" != "online" ]; then
+    LEFT_ONLINE=1
+    ok "PRIMARY held — workspace left 'online' (now '$REC_STATUS') after EC2 termination"
+    break
+  fi
+  sleep 10
+done
+
+if [ "$LEFT_ONLINE" != "1" ]; then
+  fail "PRIMARY FAILED (core#2261 regression): workspace $WS_ID still reads status=online ${RECONCILE_OFFLINE_TIMEOUT_SECS}s after its EC2 ($KILLED_IDS) was terminated. The reconciler did NOT detect the dead instance — a terminated EC2 is masquerading as a healthy workspace."
+fi
+
+# ─── 6b. SECONDARY assertion — auto-reprovision (best-effort) ──────────
+# The onOffline → RestartByID existing-volume heal should bring the workspace
+# back to 'online' on a NEW instance_id. This is best-effort: a full EC2 cold
+# reprovision is a multi-minute path that shares the same boot-flake surface
+# as the initial provision. If it doesn't finish within the bound we LOG it
+# clearly but DO NOT fail — the PRIMARY assertion above is the gate.
+#
+# FUTURE TIGHTENING (deliberately one edit away): once this reprovision path
+# is proven reliable on staging, promote the `log "SECONDARY ..."` soft-miss
+# below to a `fail ...` so a stuck reprovision becomes a hard gate.
+log "6b/6 SECONDARY (best-effort): asserting auto-reprovision to online with a NEW instance_id within ${REPROVISION_TIMEOUT_SECS}s..."
+REPROV_DEADLINE=$(( $(date +%s) + REPROVISION_TIMEOUT_SECS ))
+REPROV_OK=0
+REPROV_LAST_STATUS=""
+NEW_INSTANCE_ID=""
+while true; do
+  if [ "$(date +%s)" -gt "$REPROV_DEADLINE" ]; then
+    break
+  fi
+  RP_STATUS=$(ws_field "$WS_ID" "status")
+  if [ "$RP_STATUS" != "$REPROV_LAST_STATUS" ]; then
+    log "    $WS_ID status → ${RP_STATUS:-<empty>}"
+    REPROV_LAST_STATUS="$RP_STATUS"
+  fi
+  if [ "$RP_STATUS" = "online" ]; then
+    NEW_INSTANCE_ID=$(ws_field "$WS_ID" "instance_id")
+    if [ -n "$NEW_INSTANCE_ID" ] && [ "$NEW_INSTANCE_ID" != "$ORIGINAL_INSTANCE_ID" ]; then
+      REPROV_OK=1
+      break
+    fi
+    # online again but instance_id either not surfaced yet or still the old
+    # (terminated) id — keep polling until the reprovision swaps it.
+  fi
+  sleep 15
+done
+
+if [ "$REPROV_OK" = "1" ]; then
+  ok "SECONDARY held — auto-reprovisioned to online on NEW instance_id=$NEW_INSTANCE_ID (was $ORIGINAL_INSTANCE_ID)"
+else
+  # Soft-miss — see FUTURE TIGHTENING note above. PRIMARY is the gate.
+  log "⚠️  SECONDARY not satisfied within ${REPROVISION_TIMEOUT_SECS}s (status=${REPROV_LAST_STATUS:-<empty>}, instance_id=${NEW_INSTANCE_ID:-<none>}, original=$ORIGINAL_INSTANCE_ID). NOT failing — the PRIMARY heal-detection assertion is the gate; reprovision is a slower, flakier cold path. Promote this to a hard fail once it's proven reliable."
+fi
+
+ok "Reconciler live E2E PASSED — PRIMARY heal-detection held (SECONDARY: $([ "$REPROV_OK" = "1" ] && echo "held" || echo "soft-miss, logged"))"
+# Teardown runs via the EXIT trap.
@@ -0,0 +1,124 @@
+#!/usr/bin/env bash
+# Fail-direction / load-bearing proof for the E2E_REQUIRE_LIVE
+# fail-closed-on-skip guard in test_staging_full_saas.sh.
+#
+# WHY (harden/e2e-staging-saas-failclosed): the staging SaaS E2E is being
+# hardened to become a HARD merge-gate. A gate that can reach its final `ok`
+# WITHOUT having actually exercised a provision→online→A2A cycle is a
+# false-green — it would let a refactor that short-circuits the lifecycle
+# (or a skip path that swallows it) report PASS. require_live_or_die() is the
+# guard; this test proves it FAILS (exit 5) when milestones are missing and
+# PASSES when all fired — the watch-it-fail counterpart the dev-SOP requires.
+#
+# Runs entirely offline (no LLM, no network, no provisioning) — pure shell
+# logic — so it can run on every PR in the fast lane and locally via `bash`.
+set -uo pipefail
+
+# Scratch dir for the generated guard-runner stubs. EXIT trap guarantees
+# cleanup even when an assertion exits the test non-zero (lint_cleanup_traps).
+TMPDIR_E2E=$(mktemp -d -t require-live-guard-XXXXXX)
+trap 'rm -rf "$TMPDIR_E2E"' EXIT INT TERM
+
+PASS=0
+FAIL=0
+
+# Reproduce the EXACT guard logic from test_staging_full_saas.sh. Kept in
+# lockstep with the host script: if the host logic changes, this test must
+# change with it (and a divergence is itself a signal to re-prove the gate).
+make_guard_runner() {
+  cat <<'EOF'
+REQUIRE_LIVE="${E2E_REQUIRE_LIVE:-0}"
+LIVE_MILESTONES=""
+live_milestone() {
+  case " $LIVE_MILESTONES " in
+    *" $1 "*) ;;
+    *) LIVE_MILESTONES="$LIVE_MILESTONES $1" ;;
+  esac
+}
+require_live_or_die() {
+  [ "$REQUIRE_LIVE" = "1" ] || return 0
+  local required="provisioned tenant_online workspace_online a2a_roundtrip"
+  local m missing=""
+  for m in $required; do
+    case " $LIVE_MILESTONES " in
+      *" $m "*) ;;
+      *) missing="$missing $m" ;;
+    esac
+  done
+  if [ -n "$missing" ]; then
+    echo "MISSING:${missing}" >&2
+    exit 5
+  fi
+}
+EOF
+}
+
+# run_case <E2E_REQUIRE_LIVE value> <space-separated milestones to stamp>
+# echoes the observed exit code.
+run_case() {
+  local require_live="$1"; shift
+  local milestones="$1"; shift || true
+  local stub observed m
+  stub=$(mktemp "$TMPDIR_E2E/stub.XXXXXX")
+  {
+    echo "#!/usr/bin/env bash"
+    echo "set -uo pipefail"
+    make_guard_runner
+    for m in $milestones; do
+      echo "live_milestone $m"
+    done
+    echo "require_live_or_die"
+    echo 'echo REACHED_END'
+  } > "$stub"
+  E2E_REQUIRE_LIVE="$require_live" bash "$stub" >/dev/null 2>&1
+  observed=$?
+  rm -f "$stub"
+  echo "$observed"
+}
+
+assert_rc() {
+  local label="$1" require_live="$2" milestones="$3" expected="$4"
+  local observed
+  observed=$(run_case "$require_live" "$milestones")
+  if [ "$observed" = "$expected" ]; then
+    echo "  ✓ $label: REQUIRE_LIVE=$require_live milestones='$milestones' → rc=$observed"
+    PASS=$((PASS+1))
+  else
+    echo "  ✗ $label: REQUIRE_LIVE=$require_live milestones='$milestones' expected=$expected OBSERVED=$observed" >&2
+    FAIL=$((FAIL+1))
+  fi
+}
+
+echo "=== E2E_REQUIRE_LIVE fail-closed-on-skip guard proof ==="
+echo
+
+# DECISIVE (false-green trap): REQUIRE_LIVE=1 but NO lifecycle ran → exit 5.
+assert_rc "require-live, nothing ran → exit 5 (the false-green trap)" \
+  1 "" 5
+
+# REQUIRE_LIVE=1 with a partial lifecycle (provisioned but no A2A) → exit 5.
+assert_rc "require-live, partial lifecycle → exit 5" \
+  1 "provisioned tenant_online workspace_online" 5
+
+# REQUIRE_LIVE=1 with every required milestone → pass (rc=0).
+assert_rc "require-live, full lifecycle → pass" \
+  1 "provisioned tenant_online workspace_online a2a_roundtrip" 0
+
+# Idempotency: duplicate stamps don't break membership; full set still passes.
+assert_rc "require-live, duplicate stamps still pass" \
+  1 "provisioned provisioned tenant_online workspace_online a2a_roundtrip a2a_roundtrip" 0
+
+# Guard is a no-op when CI did not demand a live run: a non-live local run
+# with nothing stamped must NOT exit 5 (we don't break local/debug runs).
+assert_rc "no require-live, nothing ran → pass (guard is opt-in)" \
+  0 "" 0
+assert_rc "require-live unset-equivalent (0), partial → pass" \
+  0 "provisioned" 0
+
+# Extra unknown milestone is harmless as long as required set is present.
+assert_rc "require-live, extra milestone tolerated" \
+  1 "provisioned tenant_online workspace_online a2a_roundtrip extra_thing" 0
+
+echo
+echo "=== Results: $PASS passed, $FAIL failed ==="
+[ "$FAIL" -eq 0 ]
@@ -40,9 +40,25 @@
 #   E2E_INTENTIONAL_FAILURE     1 → break a step on purpose to verify
 #                               the EXIT trap still tears down (mirrors
 #                               the full-saas harness's safety net).
+#   E2E_REQUIRE_LIVE            1 → fail-closed if the harness exits 0
+#                               WITHOUT having driven all four
+#                               awaiting_agent transitions. CI sets this
+#                               so a future skip / early-return can never
+#                               masquerade as a green run. Mirrors CP
+#                               serving-e2e SERVING_E2E_REQUIRE_LIVE.
+#   E2E_STALE_POLL_DEADLINE_SECS  default 240. Upper bound for the
+#                               heartbeat-staleness READINESS poll (step
+#                               6). Replaces the old fixed sleep+one-shot
+#                               assert that raced the sweep cadence.
+#   E2E_TRANSIENT_RETRIES      default 8. Bounded retries for register /
+#                               re-register against transient edge errors
+#                               (502/503/504 from Caddy during cold TLS /
+#                               agent boot). Mirrors the full-saas
+#                               cold-start retry loop — NOT a bare sleep.
 #
 # Exit codes: 0 happy, 1 generic, 2 missing env, 3 provision timeout,
-# 4 teardown leak.
+# 4 teardown leak, 5 REQUIRE_LIVE violation (exited 0 having validated
+# nothing).

 set -euo pipefail

@@ -51,6 +67,13 @@ ADMIN_TOKEN="${MOLECULE_ADMIN_TOKEN:?MOLECULE_ADMIN_TOKEN required — Railway s
 PROVISION_TIMEOUT_SECS="${E2E_PROVISION_TIMEOUT_SECS:-900}"
 RUN_ID_SUFFIX="${E2E_RUN_ID:-$(date +%H%M%S)-$$}"
 STALE_WAIT_SECS="${E2E_STALE_WAIT_SECS:-180}"
+# Readiness-poll deadline for the sweep transition (step 6). Must exceed
+# STALE_WAIT_SECS (the no-heartbeat window) by at least one sweep
+# interval so a slightly-late sweep tick is polled-for, not misread as a
+# stuck 'online'. 240 = 180s window + 60s sweep-cadence headroom.
+STALE_POLL_DEADLINE_SECS="${E2E_STALE_POLL_DEADLINE_SECS:-240}"
+TRANSIENT_RETRIES="${E2E_TRANSIENT_RETRIES:-8}"
+REQUIRE_LIVE="${E2E_REQUIRE_LIVE:-0}"

 SLUG="e2e-ext-$(date +%Y%m%d)-${RUN_ID_SUFFIX}"
 SLUG=$(echo "$SLUG" | tr '[:upper:]' '[:lower:]' | tr -cd 'a-z0-9-' | head -c 32)
@@ -59,6 +82,66 @@ log()  { echo "[$(date +%H:%M:%S)] $*"; }
 fail() { echo "[$(date +%H:%M:%S)] ❌ $*" >&2; exit 1; }
 ok()   { echo "[$(date +%H:%M:%S)] ✅ $*"; }

+# REQUIRE_LIVE bookkeeping: count the four awaiting_agent transitions the
+# test is contracted to prove. The EXIT trap fails-closed (exit 5) if the
+# script reaches a clean exit without all four — so a silent skip, an
+# early `return 0`, or a refactor that drops a step can never show green.
+TRANSITIONS_VERIFIED=0
+EXPECTED_TRANSITIONS=4
+require_transition() {  # $1 = human label
+  TRANSITIONS_VERIFIED=$((TRANSITIONS_VERIFIED + 1))
+  log "    [require-live] transition ${TRANSITIONS_VERIFIED}/${EXPECTED_TRANSITIONS} proven: $1"
+}
+
+# Redact bearer tokens from any HTTP body before logging (mirrors the
+# full-saas sanitize_http_body so transient-error logs never leak creds).
+sanitize_http_body() {
+  sed -E 's/(Bearer|token)[[:space:]]+[A-Za-z0-9._-]+/\1 REDACTED/g'
+}
+
+# Bounded retry-on-transient for POST /registry/register. The tenant edge
+# (Caddy) returns 502/503/504 with an identifiable body while TLS / the
+# workspace agent finishes cold-booting — a single shot here was the
+# un-named flake (a transient edge error misread as a register failure).
+# This mirrors the full-saas cold-start loop (test_staging_full_saas.sh
+# ~L780-816): retry ONLY on a transient TRANSPORT class (5xx + body
+# match), bounded by TRANSIENT_RETRIES, and FAIL CLOSED (non-zero) once
+# the budget is spent. It deliberately does NOT retry on a 4xx — that's a
+# real contract bug (e.g. wrong payload field) and must stay red.
+# Sets REGISTER_RESP (body + trailing "HTTP_CODE=NNN" line) on success;
+# returns non-zero (caller `fail`s) when the bounded budget is exhausted.
+register_with_retry() {  # $1 = step label, $2 = request body
+  local label="$1" body="$2"
+  local attempt code resp safe
+  for attempt in $(seq 1 "$TRANSIENT_RETRIES"); do
+    set +e
+    resp=$(curl -sS --max-time 30 -w "\nHTTP_CODE=%{http_code}" -X POST \
+      "$TENANT_URL/registry/register" \
+      -H "Authorization: Bearer $WS_AUTH_TOKEN" \
+      -H "X-Molecule-Org-Id: $ORG_ID" \
+      -H "Content-Type: application/json" \
+      -d "$body")
+    set -e
+    code=$(printf '%s' "$resp" | sed -n 's/^HTTP_CODE=//p' | tail -n1)
+    code=${code:-000}
+    if [ "$code" = "200" ]; then
+      REGISTER_RESP="$resp"
+      return 0
+    fi
+    safe=$(printf '%s' "$resp" | sanitize_http_body | head -c 300)
+    # Retry ONLY on a transient transport class; a 4xx is a real bug.
+    if echo "$code" | grep -Eq '^(502|503|504)$' \
+       && echo "$safe" | grep -Eqi 'Service Unavailable|Bad Gateway|Gateway Timeout|error code: 502|error code: 504|workspace agent unreachable|connection refused|no healthy upstream'; then
+      log "    ${label} transient $code attempt ${attempt}/${TRANSIENT_RETRIES}: $safe"
+      [ "$attempt" -lt "$TRANSIENT_RETRIES" ] && { sleep 10; continue; }
+    fi
+    # Non-transient (4xx, or unrecognized 5xx body): stop and fail closed.
+    REGISTER_RESP="$resp"
+    return 1
+  done
+  return 1
+}
+
 CURL_COMMON=(-sS --fail-with-body --max-time 30)

 # ─── cleanup trap (mirrors full-saas) ────────────────────────────────────
@@ -98,8 +181,19 @@ cleanup_org() {
  fi
  ok "Teardown clean — no orphan resources for $SLUG (${elapsed}s)"

+  # REQUIRE_LIVE fail-closed gate. Only meaningful on an OTHERWISE-CLEAN
+  # exit (entry_rc==0): a script that completed all steps but somehow did
+  # not register all four transitions (a skip, an early return, a dropped
+  # assertion in a refactor) must NOT report success. A non-zero entry_rc
+  # already carries its own failure semantics — don't mask it with 5.
+  if [ "$entry_rc" = "0" ] && [ "${REQUIRE_LIVE}" = "1" ] \
+     && [ "$TRANSITIONS_VERIFIED" -lt "$EXPECTED_TRANSITIONS" ]; then
+    echo "❌ REQUIRE_LIVE: exited 0 but only ${TRANSITIONS_VERIFIED}/${EXPECTED_TRANSITIONS} awaiting_agent transitions were proven — refusing to report green." >&2
+    exit 5
+  fi
+
  case "$entry_rc" in
-    0|1|2|3|4) ;;
+    0|1|2|3|4|5) ;;
    *) exit 1 ;;
  esac
 }
@@ -248,6 +342,7 @@ GET_RESP=$(tenant_call GET "/workspaces/$WS_ID")
 DB_STATUS=$(echo "$GET_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('status',''))")
 [ "$DB_STATUS" != "awaiting_agent" ] && fail "DB row status=$DB_STATUS (expected awaiting_agent — migration 046 likely not applied)"
 ok "DB row stored as awaiting_agent (proof migration 046 applied)"
+require_transition "create: provisioning → awaiting_agent (DB-verified)"

 # ─── 5. Register the workspace (transitions to online) ──────────────────
 # Pre-fix this path was actually fine because it writes 'online', a value
@@ -277,20 +372,20 @@ log "5/8 Registering workspace via /registry/register..."
 #   url           — accepted but not dispatched-to in poll mode, so
 #                   example.invalid is a valid sentinel.
 REGISTER_BODY=$(printf '{"id":"%s","url":"https://example.invalid:443","delivery_mode":"poll","agent_card":{"name":"e2e-ext","skills":[{"id":"echo","name":"Echo"}]}}' "$WS_ID")
-# Disable --fail-with-body for this one call so a 4xx surfaces the response
-# body (the bare CURL_COMMON would `set -e`-kill before we could log it).
-REGISTER_RESP=$(curl -sS --max-time 30 -w "\nHTTP_CODE=%{http_code}" -X POST "$TENANT_URL/registry/register" \
-  -H "Authorization: Bearer $WS_AUTH_TOKEN" \
-  -H "X-Molecule-Org-Id: $ORG_ID" \
-  -H "Content-Type: application/json" \
-  -d "$REGISTER_BODY") || true
-log "    register response: $(echo "$REGISTER_RESP" | head -c 300)"
-echo "$REGISTER_RESP" | grep -q "HTTP_CODE=200" || fail "register returned non-200 — see body above"
+# Bounded retry-on-transient (see register_with_retry). The previous
+# single-shot here would `fail` on a cold-boot 502 from the tenant edge —
+# an un-named transient misread as a register break. The helper retries
+# ONLY that class and fails closed on a real 4xx or an exhausted budget.
+REGISTER_RESP=""
+register_with_retry "register" "$REGISTER_BODY" \
+  || fail "register returned non-200 after bounded retries — body: $(printf '%s' "$REGISTER_RESP" | sanitize_http_body | head -c 300)"
+log "    register response: $(echo "$REGISTER_RESP" | sanitize_http_body | head -c 300)"

 GET_RESP=$(tenant_call GET "/workspaces/$WS_ID")
 ONLINE_STATUS=$(echo "$GET_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('status',''))")
 [ "$ONLINE_STATUS" != "online" ] && fail "Expected online after register, got $ONLINE_STATUS"
 ok "Workspace transitioned to online"
+require_transition "register: awaiting_agent → online"

 # Confirm the register handler echoed back delivery_mode=poll. We read
 # this from the register RESPONSE, not the workspace GET response, because
@@ -310,38 +405,63 @@ fi
 # This is the SECOND silent-failure path (registry/healthsweep.go's
 # sweepStaleRemoteWorkspaces). Pre-migration-046 the heartbeat-staleness
 # UPDATE silently failed and the workspace stuck on 'online' forever
-# even though no agent was alive. We wait the full window + a sweep
-# interval and assert the row transitions back to 'awaiting_agent'.
-log "6/8 Waiting ${STALE_WAIT_SECS}s for heartbeat-staleness sweep (no heartbeat sent)..."
+# even though no agent was alive.
+#
+# FLAKE FIX (named: sweep-cadence race). The old code did a FIXED
+# `sleep $STALE_WAIT_SECS` then a SINGLE assert. The staleness sweep is a
+# periodic tick (REMOTE_LIVENESS_STALE_AFTER + a sweep interval); if the
+# tick that flips the row lands even one second after the fixed sleep, the
+# one-shot GET reads 'online' and the test fails — a real transition,
+# misread as a flake because the assert was racing the sweep cadence.
+# Replace with: sleep through the mandatory no-heartbeat window ONCE (the
+# sweep cannot fire before the window elapses, so polling earlier is
+# pointless), then READINESS-POLL for the awaiting_agent transition up to
+# STALE_POLL_DEADLINE_SECS, hard-failing with a clear message at the
+# deadline. Deterministic: a slow-but-working sweep passes; a genuinely
+# stuck 'online' still fails (now with how long we actually waited).
+log "6/8 Waiting ${STALE_WAIT_SECS}s no-heartbeat window, then polling for sweep (up to ${STALE_POLL_DEADLINE_SECS}s total)..."
+[ "$STALE_POLL_DEADLINE_SECS" -le "$STALE_WAIT_SECS" ] && \
+  fail "Misconfigured: STALE_POLL_DEADLINE_SECS ($STALE_POLL_DEADLINE_SECS) must exceed STALE_WAIT_SECS ($STALE_WAIT_SECS) by at least one sweep interval"
 sleep "$STALE_WAIT_SECS"

-GET_RESP=$(tenant_call GET "/workspaces/$WS_ID")
-STALE_STATUS=$(echo "$GET_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('status',''))")
-[ "$STALE_STATUS" != "awaiting_agent" ] && \
-  fail "After ${STALE_WAIT_SECS}s with no heartbeat, expected status=awaiting_agent (sweep transition), got $STALE_STATUS — migration 046 likely not applied OR sweep not running"
+STALE_DEADLINE=$(( $(date +%s) + (STALE_POLL_DEADLINE_SECS - STALE_WAIT_SECS) ))
+STALE_STATUS=""
+while true; do
+  GET_RESP=$(tenant_call GET "/workspaces/$WS_ID")
+  STALE_STATUS=$(echo "$GET_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('status',''))")
+  [ "$STALE_STATUS" = "awaiting_agent" ] && break
+  if [ "$(date +%s)" -gt "$STALE_DEADLINE" ]; then
+    fail "After ${STALE_POLL_DEADLINE_SECS}s with no heartbeat, status still '$STALE_STATUS' (expected awaiting_agent sweep transition) — migration 046 likely not applied OR sweep not running"
+  fi
+  sleep 10
+done
 ok "Heartbeat-staleness sweep transitioned online → awaiting_agent (proof healthsweep.go fix working)"
+require_transition "sweep: online → awaiting_agent (no heartbeat)"

 # ─── 7. Re-register and confirm we can come back online ─────────────────
 # This proves the awaiting_agent state is recoverable (re-registrable),
 # which is the whole point of using it instead of 'offline'.
 log "7/8 Re-registering after stale → confirming recovery to online..."
 # Same payload contract as step 5 (id + agent_card both required). See note
-# there for why workspace_id would 400.
-REREG_RESP=$(curl -sS --max-time 30 -w "\nHTTP_CODE=%{http_code}" -X POST "$TENANT_URL/registry/register" \
-  -H "Authorization: Bearer $WS_AUTH_TOKEN" \
-  -H "X-Molecule-Org-Id: $ORG_ID" \
-  -H "Content-Type: application/json" \
-  -d "$REGISTER_BODY") || true
-log "    re-register response: $(echo "$REREG_RESP" | head -c 300)"
-echo "$REREG_RESP" | grep -q "HTTP_CODE=200" || fail "re-register returned non-200 — see body above"
+# there for why workspace_id would 400. Same bounded retry-on-transient.
+REGISTER_RESP=""
+register_with_retry "re-register" "$REGISTER_BODY" \
+  || fail "re-register returned non-200 after bounded retries — body: $(printf '%s' "$REGISTER_RESP" | sanitize_http_body | head -c 300)"
+log "    re-register response: $(echo "$REGISTER_RESP" | sanitize_http_body | head -c 300)"

 GET_RESP=$(tenant_call GET "/workspaces/$WS_ID")
 RECOVERED_STATUS=$(echo "$GET_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('status',''))")
 [ "$RECOVERED_STATUS" != "online" ] && \
  fail "Expected re-register to return workspace to online, got $RECOVERED_STATUS"
 ok "Re-register succeeded — awaiting_agent → online (operator-recoverable)"
+require_transition "re-register: awaiting_agent → online (recovery)"

 # ─── 8. Done — cleanup runs in the EXIT trap ───────────────────────────
+# REQUIRE_LIVE belt-and-braces: assert here too (in addition to the EXIT
+# trap) so the failure surfaces in step order, not only post-teardown.
+if [ "${REQUIRE_LIVE}" = "1" ] && [ "$TRANSITIONS_VERIFIED" -lt "$EXPECTED_TRANSITIONS" ]; then
+  fail "REQUIRE_LIVE: only ${TRANSITIONS_VERIFIED}/${EXPECTED_TRANSITIONS} transitions proven at end of run"
+fi
 log "8/8 All four awaiting_agent transitions verified."
 log "═══════════════════════════════════════════════════════════════════"
 ok "External-runtime E2E PASSED on $SLUG"
@@ -47,6 +47,15 @@
 #                                tear down cleanly (and exit 4 on leak).
 #                                Used by a dedicated sanity workflow
 #                                that verifies the safety net.
+#   E2E_REQUIRE_LIVE             1 → fail-closed-on-skip guard (CI sets this).
+#                                When set, the run MUST actually complete
+#                                ≥1 full provision→online→A2A cycle. A run
+#                                that reaches the end without having proven
+#                                a real round-trip (e.g. a future refactor
+#                                short-circuits a stage, or a skip path
+#                                swallows the lifecycle) exits 5 rather than
+#                                reporting a false green. Mirrors CP
+#                                serving-e2e's SERVING_E2E_REQUIRE_LIVE.
 #
 # Exit codes:
 #   0  happy path
@@ -54,6 +63,37 @@
 #   2  missing required env
 #   3  provisioning timed out
 #   4  teardown left orphan resources
+#   5  E2E_REQUIRE_LIVE set but the run validated no real lifecycle (no
+#      false-green-on-skip)
+#
+# ─────────────────────────────────────────────────────────────────────────
+# PROMOTION-READINESS (harden/e2e-staging-saas-failclosed):
+#   This harness is being hardened so `E2E Staging SaaS` + `E2E Staging
+#   Platform Boot` can become HARD merge-gates. continue-on-error is NOT
+#   flipped here — that promotion is the CTO's irreversible branch-protection
+#   call. What this branch makes fail-closed (was false-green / un-named
+#   flake before):
+#     • Provision/online waits are bounded readiness-POLLS, not fixed sleeps;
+#       each hard-fails with a named mechanism + last-seen signal on deadline,
+#       never a silent timeout (cp#245 boot-timeout class).
+#     • Peer-discovery (9b) asserts a real 2xx, not just "not 404" — a 5xx /
+#       000 / empty no longer reads as "reachable".
+#     • Activity-log (9b) is ASSERTED reachable (2xx + parseable), not
+#       logged-and-ignored behind `|| echo '[]'`.
+#     • Child activity provenance (10) is asserted (was soft-logged).
+#     • E2E_REQUIRE_LIVE=1 (CI) makes the run exit 5 if it reached the end
+#       without proving a real provision→online→A2A round-trip — no
+#       false-green-on-skip.
+#   STILL BLOCKS making it REQUIRED (must clear before the CTO flips
+#   continue-on-error→false in .gitea/workflows/e2e-staging-saas.yml):
+#     • De-flake window: N consecutive green runs on main for BOTH jobs
+#       (platform-boot shares the cp#245 boot surface — #2187 tracks its
+#       flip). This harness removes the harness-side flake mechanisms; the
+#       remaining surface is real-infra (EC2 cold boot, CF DNS) latency,
+#       already bounded by the readiness polls above.
+#     • Branch-protection required-context wiring is a repo-settings change,
+#       not a code change in this PR.
+# ─────────────────────────────────────────────────────────────────────────

 set -euo pipefail

@@ -90,6 +130,41 @@ log()  { echo "[$(date +%H:%M:%S)] $*"; }
 fail() { echo "[$(date +%H:%M:%S)] ❌ $*" >&2; exit 1; }
 ok()   { echo "[$(date +%H:%M:%S)] ✅ $*"; }

+# ─── fail-closed-on-skip live-lifecycle guard ───────────────────────────
+# E2E_REQUIRE_LIVE=1 (set by CI) asserts this run ACTUALLY exercised a full
+# provision→online→A2A cycle. Each load-bearing lifecycle stage stamps a
+# milestone via live_milestone(); at the very end, require_live_or_die()
+# checks every required milestone fired. Mechanism: without this, a future
+# refactor that short-circuits a stage — or a skip/early-return path that
+# swallows the lifecycle — would let the script reach its final `ok` and
+# report GREEN having validated nothing. Mirrors CP serving-e2e's
+# SERVING_E2E_REQUIRE_LIVE (skip-if-absent must be LOUD, never silent green).
+REQUIRE_LIVE="${E2E_REQUIRE_LIVE:-0}"
+LIVE_MILESTONES=""
+live_milestone() {
+  # Idempotent set-membership append. Space-delimited; names are tokens.
+  case " $LIVE_MILESTONES " in
+    *" $1 "*) ;;
+    *) LIVE_MILESTONES="$LIVE_MILESTONES $1" ;;
+  esac
+}
+require_live_or_die() {
+  # No-op unless CI demanded a live run.
+  [ "$REQUIRE_LIVE" = "1" ] || return 0
+  local required="provisioned tenant_online workspace_online a2a_roundtrip"
+  local m missing=""
+  for m in $required; do
+    case " $LIVE_MILESTONES " in
+      *" $m "*) ;;
+      *) missing="$missing $m" ;;
+    esac
+  done
+  if [ -n "$missing" ]; then
+    echo "[$(date +%H:%M:%S)] ❌ E2E_REQUIRE_LIVE=1 but the run did NOT prove a full live lifecycle — missing milestone(s):${missing}. Reached:${LIVE_MILESTONES:-<none>}. This is a false-green-on-skip guard: a run that validates no real provision→online→A2A cycle MUST NOT report green." >&2
+    exit 5
+  fi
+}
+
 # Per-runtime model slug dispatch — see lib/model_slug.sh for the rationale.
 # Extracted so unit tests (tests/e2e/test_model_slug.sh) can pin every branch
 # without booting the full 11-step lifecycle.
@@ -197,7 +272,7 @@ cleanup_org() {
  # case statement, and opens a false-positive priority-high
  # "safety net broken" issue (#2159, 2026-04-27).
  case "$entry_rc" in
-    0|1|2|3|4) ;;          # contracted codes — let bash use entry_rc
+    0|1|2|3|4|5) ;;        # contracted codes — let bash use entry_rc
    *) exit 1 ;;            # anything else is a generic failure
  esac
 }
@@ -295,6 +370,7 @@ print('(no org row found for slug=$SLUG — DB drift?)')
  esac
 done
 ok "Tenant provisioning complete"
+live_milestone provisioned

 # Derive tenant domain from CP hostname so the same harness works in
 # both prod (api.moleculesai.app → moleculesai.app) and staging
@@ -351,6 +427,7 @@ while true; do
  sleep 5
 done
 ok "Tenant reachable at $TENANT_URL"
+live_milestone tenant_online

 # Sanity-test path: once the tenant is provisioned, poisoning the
 # tenant token proves the EXIT trap + leak assertion still fire.
@@ -476,7 +553,19 @@ wait_workspaces_online_routable() {
 # All empty → '{}' (workspace will fail at first turn with an
 # expected, actionable auth error rather than masking the test).
 SECRETS_JSON='{}'
-if [ -n "${E2E_MINIMAX_API_KEY:-}" ]; then
+# Platform-managed path (E2E_LLM_PATH=platform) — the moonshot/kimi
+# NOT_CONFIGURED regression (RFC#340 Fix A #2187). Molecule owns billing via the
+# CP LLM proxy, so the workspace needs NO tenant key: provision with empty
+# secrets and let the workspace boot purely on (a) the proxy env the control
+# plane injects + (b) the manifest-derived `provider: platform` Fix A stamps into
+# the generated config.yaml. This is the path that booted NOT_CONFIGURED in prod
+# precisely because the BYOK branches below never exercise it. We deliberately
+# skip the key-injection branches so a stray E2E_*_API_KEY in the runner env
+# cannot silently convert this into a BYOK run and mask the regression.
+if [ "${E2E_LLM_PATH:-}" = "platform" ]; then
+  log "    LLM path: PLATFORM-MANAGED (no tenant key; proxy + Fix A provider stamp)"
+  SECRETS_JSON='{}'
+elif [ -n "${E2E_MINIMAX_API_KEY:-}" ]; then
  SECRETS_JSON=$(python3 -c "
 import json, os
 k = os.environ['E2E_MINIMAX_API_KEY']
@@ -558,6 +647,7 @@ fi
 WS_TO_CHECK=("$PARENT_ID")
 [ -n "$CHILD_ID" ] && WS_TO_CHECK+=("$CHILD_ID")
 wait_workspaces_online_routable "7/11 Waiting for workspace(s) to reach status=online (up to $((WORKSPACE_ONLINE_TIMEOUT_SECS/60)) min — hermes cold boot)..." "${WS_TO_CHECK[@]}"
+live_milestone workspace_online

 # ─── 7a. Real chat image upload/download round-trip ───────────────────
 # This deliberately uses the production workflow: tenant admin/session auth
@@ -858,6 +948,24 @@ fi
 if echo "$AGENT_TEXT" | grep -qiE "exceeded your current quota|insufficient_quota"; then
  fail "A2A — PROVIDER QUOTA EXHAUSTED (NOT a platform regression). Operator action: top up MOLECULE_STAGING_OPENAI_API_KEY billing or rotate to a higher-quota org at Settings → Secrets and Variables → Actions. Tracked in #2578. Raw: $AGENT_TEXT"
 fi
+# Empty-completion class — the agent runtime reached the LLM and got a
+# 2xx back, but the assistant turn carried NO text part (empty content,
+# or tool_calls/reasoning-only with no surfaced text), so the runtime
+# returns the literal "Error: message contained no text content." as its
+# reply text. Steps 0-7 passing means the platform is healthy (CP up,
+# tenant provisioned, workspace online + routable, A2A delivery e2e); the
+# break is the configured completion BACKEND returning an empty turn — a
+# model/provider-side regression, NOT a workspace-server or harness bug,
+# and NOT NOT_CONFIGURED (that fails earlier, at boot). Name it explicitly
+# so the canary alert points at the model, not the platform: a generic
+# "error-shaped response" misdirects triage to workspace-server. Observed
+# 2026-06-03/04 across every staging canary on MODEL_SLUG=MiniMax-M2 (the
+# canary default since #2710) — 100% on the parent's first cold turn,
+# identical on main's scheduled synthetic E2E and on PRs (so it is an
+# environmental backend regression, never PR-introduced).
+if echo "$AGENT_TEXT" | grep -qiF "message contained no text content"; then
+  fail "A2A — EMPTY COMPLETION (backend regression, NOT a platform/workspace-server bug). The configured model (MODEL_SLUG=${MODEL_SLUG:-?}) returned a 2xx completion with no text part; the runtime surfaced 'message contained no text content.'. Operator action: check the staging LLM backend / proxy for the canary model (the claude-code default is minimax:MiniMax-M2.7 since #2263; was bare MiniMax-M2 #2710) — empty assistant turns, not an auth/quota/boot fault. Raw: $AGENT_TEXT"
+fi
 # Generic catch-all — falls through if none of the known regressions hit.
 if echo "$AGENT_TEXT" | grep -qiE "error|exception"; then
  fail "A2A returned an error-shaped response: $AGENT_TEXT"
@@ -922,7 +1030,14 @@ for KA_ATTEMPT in $(seq 1 6); do
  KA_SAFE_BODY=$(printf '%s' "$KA_RESP" | sanitize_http_body)
  # Retry ONLY on transient transport errors — never on an agent-level
  # error (those must surface and fail the gate).
-  if echo "$KA_CODE" | grep -Eq '^(502|503|504)$' && echo "$KA_SAFE_BODY" | grep -Eqi 'Service Unavailable|Bad Gateway|Gateway Timeout|workspace agent unreachable|connection refused|no healthy upstream|workspace agent busy|native_session'; then
+  # #2263: include the Cloudflare-shaped literal `error code: 502/504` token so a
+  # bare edge/gateway 502 (no "Bad Gateway" body) is retried here the same way the
+  # cold-start PONG probe (line ~800) and the delegation loop (line ~1234) already
+  # do. Without it, a single un-retried edge 502 right after a healthy round-trip
+  # fell through to break and failed the gate on the first attempt (Platform Boot
+  # job, task 268859). Bounded by the existing 6-attempt / sleep-10 loop — no new
+  # sleep-as-fix; this only widens the transient-match to the sibling pattern.
+  if echo "$KA_CODE" | grep -Eq '^(502|503|504)$' && echo "$KA_SAFE_BODY" | grep -Eqi 'Service Unavailable|Bad Gateway|Gateway Timeout|error code: 502|error code: 504|workspace agent unreachable|connection refused|no healthy upstream|workspace agent busy|native_session'; then
    log "    known-answer A2A transient $KA_CODE attempt $KA_ATTEMPT/6: $KA_SAFE_BODY"
    if [ "$KA_ATTEMPT" -lt 6 ]; then sleep 10; continue; fi
  fi
@@ -944,6 +1059,11 @@ except Exception:
 " 2>/dev/null || echo "")
 # CORE GATE: contains PINEAPPLE (real round-trip) AND no error-as-text.
 a2a_assert_real_completion "$KA_TEXT" "PINEAPPLE" "A2A known-answer (parent, $RUNTIME/$MODEL_SLUG)"
+# Real, deterministic LLM round-trip proven — the load-bearing milestone for
+# the fail-closed-on-skip guard. Stamped AFTER a2a_assert_real_completion (not
+# after the looser PONG check) so the milestone means a verified completion,
+# not just a 2xx-with-text.
+live_milestone a2a_roundtrip

 # ─── 8c. byok-routing regression guard (#1994) ─────────────────────────
 # The parent was provisioned with the customer's OWN vendor key
@@ -1011,7 +1131,7 @@ print(json.dumps({
            'messageId': f'e2e-{uuid.uuid4().hex[:8]}',
            'parts': [{'kind': 'text', 'text': 'Reply with exactly: ok'}],
        },
-        'configuration': {'max_tokens': 4}
+        'configuration': {'max_tokens': 32}
    }
 }))
 ")
@@ -1069,18 +1189,50 @@ print(json.dumps({
  ok "HMA memory write+read roundtripped"

  log "9b.  Peer discovery + activity log smoke..."
+  # FAIL-CLOSED: assert a real 2xx, not merely "not 404". The previous
+  # `[ "$PEERS_CODE" = "404" ] && fail` only caught the route-missing case —
+  # a 5xx, 000 (connection failure), or empty capture ALL fell through to
+  # "reachable" (false-green: a broken-but-present route read as healthy).
+  # Mechanism: route the http_code into its own tempfile (no stderr capture,
+  # which the old `2>&1 | head -1` could pollute with a curl error line) and
+  # require 2xx explicitly.
+  PEERS_TMP=$(e2e_tmp /tmp/e2e_peers.XXXXXX)
  set +e
-  tenant_call GET "/registry/$PARENT_ID/peers" -o /dev/null -w "%{http_code}\n" 2>&1 | head -1 > /tmp/peers_code.txt
+  PEERS_CODE=$(tenant_call GET "/registry/$PARENT_ID/peers" \
+    -o "$PEERS_TMP" -w "%{http_code}" 2>/dev/null)
+  PEERS_RC=$?
  set -e
-  PEERS_CODE=$(cat /tmp/peers_code.txt)
-  [ "$PEERS_CODE" = "404" ] && fail "Peers endpoint missing (404) — route regression"
+  PEERS_CODE=${PEERS_CODE:-000}
+  if [ "$PEERS_CODE" = "404" ]; then
+    fail "Peers endpoint missing (404) — route regression. /registry/$PARENT_ID/peers"
+  fi
+  if [ "$PEERS_RC" != "0" ] || [ "$PEERS_CODE" -lt 200 ] || [ "$PEERS_CODE" -ge 300 ]; then
+    fail "Peers endpoint unhealthy (curl_rc=$PEERS_RC, http=$PEERS_CODE) — not a clean 2xx, so 'reachable' would be a false-green. Body: $(head -c 200 "$PEERS_TMP" 2>/dev/null | sanitize_http_body)"
+  fi
  ok "Peers endpoint reachable (HTTP $PEERS_CODE)"

-  ACTIVITY=$(tenant_call GET "/activity?workspace_id=$PARENT_ID&limit=5" 2>/dev/null || echo '[]')
-  ACTIVITY_COUNT=$(echo "$ACTIVITY" | python3 -c "import json,sys
-d=json.load(sys.stdin)
-print(len(d if isinstance(d, list) else d.get('events', [])))" 2>/dev/null || echo 0)
-  log "    Activity events observed: $ACTIVITY_COUNT"
+  # FAIL-CLOSED: the activity-log read was `|| echo '[]'` then the count was
+  # only LOGGED, never asserted — a 5xx / network failure silently became an
+  # empty list and the step exited 0 having validated nothing (false-green:
+  # "validated nothing" class). Assert the endpoint returns a 2xx and a
+  # parseable activity shape. We do NOT assert count>0 (the parent may
+  # legitimately have 0 events this early — that's a real, valid state), but
+  # we DO require the call to have actually succeeded and returned valid JSON.
+  ACTIVITY_TMP=$(e2e_tmp /tmp/e2e_activity.XXXXXX)
+  set +e
+  ACTIVITY_CODE=$(tenant_call GET "/activity?workspace_id=$PARENT_ID&limit=5" \
+    -o "$ACTIVITY_TMP" -w "%{http_code}" 2>/dev/null)
+  ACTIVITY_RC=$?
+  set -e
+  ACTIVITY_CODE=${ACTIVITY_CODE:-000}
+  if [ "$ACTIVITY_RC" != "0" ] || [ "$ACTIVITY_CODE" -lt 200 ] || [ "$ACTIVITY_CODE" -ge 300 ]; then
+    fail "Activity-log endpoint unhealthy (curl_rc=$ACTIVITY_RC, http=$ACTIVITY_CODE) — was previously swallowed by '|| echo []' and reported as 0 events (false-green). Body: $(head -c 200 "$ACTIVITY_TMP" 2>/dev/null | sanitize_http_body)"
+  fi
+  ACTIVITY_COUNT=$(python3 -c "import json,sys
+d=json.load(open(sys.argv[1]))
+print(len(d if isinstance(d, list) else d.get('events', [])))" "$ACTIVITY_TMP" 2>/dev/null) \
+    || fail "Activity-log returned HTTP $ACTIVITY_CODE but body was not parseable JSON (events array / {events:[...]}). Body: $(head -c 200 "$ACTIVITY_TMP" 2>/dev/null | sanitize_http_body)"
+  log "    Activity events observed: $ACTIVITY_COUNT (endpoint 2xx + parseable ✓)"

  # ─── 9c. Workspace KV memory Edit round-trip ─────────────────────────
  # Pins the Edit affordance added to the canvas Memory tab. The UI calls
@@ -1231,14 +1383,44 @@ except Exception:
  [ -z "$DELEG_TEXT" ] && fail "Delegation returned no text. Raw: ${DELEG_RESP:0:200}"
  ok "Delegation proxy works (child responded: \"${DELEG_TEXT:0:60}\")"

-  CHILD_ACT=$(tenant_call GET "/activity?workspace_id=$CHILD_ID&limit=20" 2>/dev/null || echo '[]')
-  if echo "$CHILD_ACT" | grep -q "$PARENT_ID"; then
+  # FAIL-CLOSED via bounded readiness-POLL (was soft-logged false-green).
+  # The activity pipeline is async, so an immediate single read can miss the
+  # parent reference — but "did not reference parent" was previously just
+  # LOGGED and the step passed regardless, so a genuinely broken provenance
+  # pipeline (parent never recorded as source) read as success. Mechanism:
+  # poll the child activity log for the parent id for a bounded window
+  # (E2E_CHILD_ACTIVITY_TIMEOUT_SECS, default 60s) — this is the real
+  # readiness signal (provenance row materialised), not a fixed sleep — and
+  # hard-fail with a named mechanism if it never appears.
+  CHILD_ACT_DEADLINE=$(( $(date +%s) + ${E2E_CHILD_ACTIVITY_TIMEOUT_SECS:-60} ))
+  CHILD_ACT_SEEN=0
+  CHILD_ACT_LASTCODE="000"
+  while true; do
+    CHILD_ACT_TMP=$(e2e_tmp /tmp/e2e_child_act.XXXXXX)
+    set +e
+    CHILD_ACT_CODE=$(tenant_call GET "/activity?workspace_id=$CHILD_ID&limit=20" \
+      -o "$CHILD_ACT_TMP" -w "%{http_code}" 2>/dev/null)
+    set -e
+    CHILD_ACT_LASTCODE=${CHILD_ACT_CODE:-000}
+    if grep -q "$PARENT_ID" "$CHILD_ACT_TMP" 2>/dev/null; then
+      CHILD_ACT_SEEN=1
+      break
+    fi
+    [ "$(date +%s)" -ge "$CHILD_ACT_DEADLINE" ] && break
+    sleep 5
+  done
+  if [ "$CHILD_ACT_SEEN" = "1" ]; then
    ok "Child activity log records parent as source"
  else
-    log "Child activity log did not reference parent (pipeline may be async)"
+    fail "Child activity log never referenced parent $PARENT_ID within ${E2E_CHILD_ACTIVITY_TIMEOUT_SECS:-60}s (last http=$CHILD_ACT_LASTCODE) — delegation-provenance pipeline regression (parent not recorded as source). Previously soft-logged → false-green."
  fi
 fi

 # ─── 11. Teardown runs via trap ────────────────────────────────────────
+# Fail-closed-on-skip: before declaring PASS, assert (when CI demanded a live
+# run) that every load-bearing lifecycle milestone actually fired. A run that
+# reaches here without provision→online→A2A having truly happened exits 5
+# instead of reporting green. Teardown still runs (EXIT trap) on that exit.
+require_live_or_die
 log "11/11 All checks passed. Teardown runs via EXIT trap."
 ok "═══ STAGING $MODE-SAAS E2E PASSED ═══"
@@ -18,6 +18,7 @@ No network. No live Gitea calls.
 from __future__ import annotations

 import importlib.util
+import json
 import os
 import textwrap
 from pathlib import Path
@@ -117,6 +118,31 @@ def _write_audit_yaml(tmp_path: Path, required_checks: list[str]) -> Path:
    return p


+def _write_audit_yaml_json(tmp_path: Path, required_checks_json: dict) -> Path:
+    """Write a synthetic audit-force-merge.yml with REQUIRED_CHECKS_JSON env."""
+    block = json.dumps(required_checks_json, indent=2)
+    text = textwrap.dedent(
+        f"""\
+        name: audit-force-merge
+        on:
+          schedule:
+            - cron: '*/30 * * * *'
+        jobs:
+          audit:
+            runs-on: ubuntu-latest
+            steps:
+              - name: Run audit
+                env:
+                  REQUIRED_CHECKS_JSON: |
+                    {block.replace(chr(10), chr(10) + '                    ')}
+                run: bash .gitea/scripts/audit-force-merge.sh
+        """
+    )
+    p = tmp_path / "audit-force-merge.yml"
+    p.write_text(text, encoding="utf-8")
+    return p
+
+
 def _make_stub_api(responses: dict):
    """Build a fake `api()` callable.

@@ -363,6 +389,107 @@ def test_happy_path_no_drift(drift_module, tmp_path, monkeypatch):
    assert findings == [], findings


+# --------------------------------------------------------------------------
+# REQUIRED_CHECKS_JSON variant drift tests
+# --------------------------------------------------------------------------
+def test_f3a_env_wider_than_protection_json_variant(drift_module, tmp_path, monkeypatch):
+    """F3a: REQUIRED_CHECKS_JSON env has a context NOT in protection."""
+    ci = _write_ci_yaml(
+        tmp_path,
+        jobs={"build": {"runs-on": "ubuntu-latest"}},
+        sentinel_needs=["build"],
+    )
+    audit = _write_audit_yaml_json(
+        tmp_path,
+        {"main": ["ci / build (pull_request)", "ci / ghost (pull_request)"]},
+    )
+    _patch_paths(drift_module, monkeypatch, ci, audit)
+
+    stub = _make_stub_api({
+        ("GET", "/repos/owner/repo/branch_protections/main"): (
+            200,
+            {"status_check_contexts": ["ci / build (pull_request)"]},
+        ),
+    })
+    monkeypatch.setattr(drift_module, "api", stub)
+
+    findings, _ = drift_module.detect_drift("main")
+    assert any("F3a" in f and "ghost" in f for f in findings), findings
+
+
+def test_f3b_protection_wider_than_env_json_variant(drift_module, tmp_path, monkeypatch):
+    """F3b: protection has a context NOT in REQUIRED_CHECKS_JSON env."""
+    ci = _write_ci_yaml(
+        tmp_path,
+        jobs={
+            "build": {"runs-on": "ubuntu-latest"},
+            "test": {"runs-on": "ubuntu-latest"},
+        },
+        sentinel_needs=["build", "test"],
+    )
+    audit = _write_audit_yaml_json(
+        tmp_path,
+        {"main": ["ci / build (pull_request)"]},
+    )
+    _patch_paths(drift_module, monkeypatch, ci, audit)
+
+    stub = _make_stub_api({
+        ("GET", "/repos/owner/repo/branch_protections/main"): (
+            200,
+            {
+                "status_check_contexts": [
+                    "ci / build (pull_request)",
+                    "ci / test (pull_request)",
+                ]
+            },
+        ),
+    })
+    monkeypatch.setattr(drift_module, "api", stub)
+
+    findings, _ = drift_module.detect_drift("main")
+    assert any("F3b" in f and "ci / test (pull_request)" in f for f in findings), findings
+
+
+def test_happy_path_no_drift_json_variant(drift_module, tmp_path, monkeypatch):
+    """Happy path with REQUIRED_CHECKS_JSON: all aligned."""
+    ci = _write_ci_yaml(
+        tmp_path,
+        jobs={
+            "build": {"runs-on": "ubuntu-latest"},
+            "test": {"runs-on": "ubuntu-latest"},
+        },
+        sentinel_needs=["build", "test"],
+    )
+    audit = _write_audit_yaml_json(
+        tmp_path,
+        {
+            "main": [
+                "ci / build (pull_request)",
+                "ci / test (pull_request)",
+                "ci / all-required (pull_request)",
+            ]
+        },
+    )
+    _patch_paths(drift_module, monkeypatch, ci, audit)
+
+    stub = _make_stub_api({
+        ("GET", "/repos/owner/repo/branch_protections/main"): (
+            200,
+            {
+                "status_check_contexts": [
+                    "ci / build (pull_request)",
+                    "ci / test (pull_request)",
+                    "ci / all-required (pull_request)",
+                ]
+            },
+        ),
+    })
+    monkeypatch.setattr(drift_module, "api", stub)
+
+    findings, _ = drift_module.detect_drift("main")
+    assert findings == [], findings
+
+
 # --------------------------------------------------------------------------
 # MUST-FIX 1: find_open_issue must raise on transient HTTP errors
 # --------------------------------------------------------------------------
@@ -26,11 +26,12 @@ import (
 //     the update cycle — no ssh, no re-provision, no ops toil.
 //
 // Contract (paired with cp-side GET /cp/tenants/config):
-//   Request:  GET {MOLECULE_CP_URL or https://api.moleculesai.app}/cp/tenants/config
-//             Authorization: Bearer <ADMIN_TOKEN>
-//             X-Molecule-Org-Id: <MOLECULE_ORG_ID>
-//   Response: 200 {"MOLECULE_CP_SHARED_SECRET":"…","MOLECULE_CP_URL":"…", …}
-//             401 on bearer mismatch or unknown org
+//
+//	Request:  GET {MOLECULE_CP_URL or https://api.moleculesai.app}/cp/tenants/config
+//	          Authorization: Bearer <ADMIN_TOKEN>
+//	          X-Molecule-Org-Id: <MOLECULE_ORG_ID>
+//	Response: 200 {"MOLECULE_CP_SHARED_SECRET":"…","MOLECULE_CP_URL":"…", …}
+//	          401 on bearer mismatch or unknown org
 //
 // Best-effort: any failure logs and returns — main() keeps booting.
 // Self-hosted deploys without MOLECULE_ORG_ID or ADMIN_TOKEN set
@@ -105,3 +106,53 @@ func refreshEnvFromCP() error {
 	log.Printf("CP env refresh: applied %d values from %s/cp/tenants/config", applied, base)
 	return nil
 }
+
+// requiredLLMEnvVars is the set of LLM proxy env vars a managed SaaS
+// tenant must have populated after refreshEnvFromCP. cp#469 (tenant
+// proxy-env delivery) — guaranteed CP-delivered creds reach the
+// tenant process env on boot. Per Researcher Task #37 / Spec 2 and
+// Task #46 (watch-fail-first test).
+//
+// Key set byte-matched against Researcher's verified emission in
+// controlplane tenant_config.go:140-144 (Researcher REQUEST_CHANGES
+// iterate body, 3987f59c). The four keys below ARE the LLM-proxy
+// subset of the 8 CP-emitted keys; OPENAI_BASE_URL / OPENAI_API_KEY /
+// ANTHROPIC_BASE_URL / ANTHROPIC_API_KEY are out of scope for cp#469
+// (different feature surfaces — direct-to-provider fallbacks, not
+// the proxy). v2 fix: MOLECULE_LLM_USAGE_TOKEN, MOLECULE_LLM_USAGE_URL,
+// MOLECULE_LLM_BASE_URL, MOLECULE_LLM_ANTHROPIC_BASE_URL — note the
+// 4th key is namespaced MOLECULE_LLM_ANTHROPIC_BASE_URL, NOT bare
+// ANTHROPIC_BASE_URL. Bare ANTHROPIC_BASE_URL is a separate CP-emitted
+// key for direct-provider use, not the LLM proxy.
+var requiredLLMEnvVars = []string{
+	"MOLECULE_LLM_USAGE_TOKEN",
+	"MOLECULE_LLM_USAGE_URL", // CRITICAL fix v2: was MOLECULE_LLM_URL in v1
+	"MOLECULE_LLM_BASE_URL",
+	"MOLECULE_LLM_ANTHROPIC_BASE_URL", // CRITICAL fix v3: was ANTHROPIC_BASE_URL in v2 (different key!)
+}
+
+// assertManagedTenantHasLLMEnv verifies that, when running as a
+// managed SaaS tenant (MOLECULE_ORG_ID + ADMIN_TOKEN both set), all
+// required LLM proxy env vars are populated after refreshEnvFromCP.
+//
+// Self-hosted (no orgID/adminToken) is exempt — dev must not be
+// blocked here. Managed tenants with missing LLM keys fail with
+// MISSING_CP_LLM_ENV so they do not silently boot with broken proxy
+// creds. Caller in main.go decides whether to log and continue or
+// log.Fatalf depending on deployment context.
+func assertManagedTenantHasLLMEnv() error {
+	if os.Getenv("MOLECULE_ORG_ID") == "" || os.Getenv("ADMIN_TOKEN") == "" {
+		// Self-hosted dev / not yet provisioned — not a managed tenant.
+		return nil
+	}
+	var missing []string
+	for _, k := range requiredLLMEnvVars {
+		if os.Getenv(k) == "" {
+			missing = append(missing, k)
+		}
+	}
+	if len(missing) > 0 {
+		return fmt.Errorf("MISSING_CP_LLM_ENV: required LLM proxy keys not set after refreshEnvFromCP: %v", missing)
+	}
+	return nil
+}
@@ -5,6 +5,7 @@ import (
 	"net/http"
 	"net/http/httptest"
 	"os"
+	"strings"
 	"testing"
 )

@@ -59,6 +60,138 @@ func TestRefreshEnvFromCP_AppliesCPResponse(t *testing.T) {
 	}
 }

+// TestRefreshEnvFromCP_ManagedTenantRequiresLLMKeys: watch-fail-first
+// per Researcher Task #46. When running as a managed tenant
+// (MOLECULE_ORG_ID + ADMIN_TOKEN set), missing LLM proxy env vars
+// after refreshEnvFromCP MUST surface as MISSING_CP_LLM_ENV, not be
+// silently accepted. Without this guard, a CP that loses its LLM
+// creds (e.g. during an incident) would let a tenant boot and then
+// fail later at first LLM call — worse than a loud refusal here.
+func TestRefreshEnvFromCP_ManagedTenantRequiresLLMKeys(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		// Stub CP returns a CP response WITHOUT any of the required
+		// LLM keys — simulates the failure mode where the CP side
+		// dropped or never had the LLM creds for this org.
+		w.Header().Set("Content-Type", "application/json")
+		fmt.Fprint(w, `{"MOLECULE_CP_SHARED_SECRET":"x","MOLECULE_CP_URL":"https://api.moleculesai.app"}`)
+	}))
+	defer srv.Close()
+
+	t.Setenv("MOLECULE_ORG_ID", "org-managed-1")
+	t.Setenv("ADMIN_TOKEN", "admin-tok")
+	t.Setenv("MOLECULE_CP_URL", srv.URL)
+	// Clear all LLM keys to simulate the boot-without-LLM-env failure mode.
+	t.Setenv("MOLECULE_LLM_USAGE_TOKEN", "")
+	t.Setenv("MOLECULE_LLM_USAGE_URL", "")
+	t.Setenv("MOLECULE_LLM_BASE_URL", "")
+	t.Setenv("MOLECULE_LLM_ANTHROPIC_BASE_URL", "")
+
+	// refreshEnvFromCP itself should succeed — CP is reachable, returned 200.
+	if err := refreshEnvFromCP(); err != nil {
+		t.Fatalf("refreshEnvFromCP: %v", err)
+	}
+	// The boot assertion must catch the missing LLM keys.
+	err := assertManagedTenantHasLLMEnv()
+	if err == nil {
+		t.Fatal("expected MISSING_CP_LLM_ENV error for managed tenant without LLM keys, got nil")
+	}
+	if !strings.Contains(err.Error(), "MISSING_CP_LLM_ENV") {
+		t.Errorf("expected error to contain MISSING_CP_LLM_ENV, got: %v", err)
+	}
+}
+
+// TestRefreshEnvFromCP_ManagedTenantHappyPath: when the CP returns
+// all 4 LLM-proxy keys, the gate must PASS — no MISSING_CP_LLM_ENV
+// for a properly-configured managed tenant. Watch-fail counterpart
+// to TestRefreshEnvFromCP_ManagedTenantRequiresLLMKeys: if THIS test
+// ever fires MISSING_CP_LLM_ENV on the byte-correct key set, the
+// requiredLLMEnvVars list has drifted from the CP emission again.
+// Per Researcher REQUEST_CHANGES TEST ADEQUACY note.
+func TestRefreshEnvFromCP_ManagedTenantHappyPath(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.Header().Set("Content-Type", "application/json")
+		// Return ALL 4 LLM-proxy keys — names byte-matched to
+		// tenant_config.go:140-144 CP emission.
+		fmt.Fprint(w, `{"MOLECULE_LLM_USAGE_TOKEN":"tok-1","MOLECULE_LLM_USAGE_URL":"https://llm.example.com/usage","MOLECULE_LLM_BASE_URL":"https://llm.example.com","MOLECULE_LLM_ANTHROPIC_BASE_URL":"https://llm.example.com/anthropic"}`)
+	}))
+	defer srv.Close()
+
+	t.Setenv("MOLECULE_ORG_ID", "org-managed-happy")
+	t.Setenv("ADMIN_TOKEN", "admin-tok")
+	t.Setenv("MOLECULE_CP_URL", srv.URL)
+	// Pre-clear so we can verify the refresh actually populated them.
+	t.Setenv("MOLECULE_LLM_USAGE_TOKEN", "")
+	t.Setenv("MOLECULE_LLM_USAGE_URL", "")
+	t.Setenv("MOLECULE_LLM_BASE_URL", "")
+	t.Setenv("MOLECULE_LLM_ANTHROPIC_BASE_URL", "")
+
+	if err := refreshEnvFromCP(); err != nil {
+		t.Fatalf("refreshEnvFromCP: %v", err)
+	}
+	// Sanity: refresh actually applied the keys.
+	if got := os.Getenv("MOLECULE_LLM_USAGE_TOKEN"); got != "tok-1" {
+		t.Errorf("refresh did not apply USAGE_TOKEN: got %q", got)
+	}
+	// The boot assertion must pass — no MISSING_CP_LLM_ENV.
+	if err := assertManagedTenantHasLLMEnv(); err != nil {
+		t.Errorf("managed happy path must not MISSING_CP_LLM_ENV, got: %v", err)
+	}
+}
+
+// TestRefreshEnvFromCP_ManagedTenantPartialEnv: when the CP returns
+// 3 of 4 LLM-proxy keys (one missing), the gate must STILL catch it
+// and the error must name the missing key. Per Researcher
+// REQUEST_CHANGES TEST ADEQUACY note — partial-env coverage is
+// critical because the production failure mode is usually "one
+// key dropped" not "all keys dropped".
+func TestRefreshEnvFromCP_ManagedTenantPartialEnv(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.Header().Set("Content-Type", "application/json")
+		// 3 of 4 — MOLECULE_LLM_ANTHROPIC_BASE_URL is missing.
+		fmt.Fprint(w, `{"MOLECULE_LLM_USAGE_TOKEN":"tok-1","MOLECULE_LLM_USAGE_URL":"https://llm.example.com/usage","MOLECULE_LLM_BASE_URL":"https://llm.example.com"}`)
+	}))
+	defer srv.Close()
+
+	t.Setenv("MOLECULE_ORG_ID", "org-managed-partial")
+	t.Setenv("ADMIN_TOKEN", "admin-tok")
+	t.Setenv("MOLECULE_CP_URL", srv.URL)
+	// Pre-clear all 4 so the 3 that come back from CP are the only
+	// ones set; the 4th (MOLECULE_LLM_ANTHROPIC_BASE_URL) stays empty.
+	t.Setenv("MOLECULE_LLM_USAGE_TOKEN", "")
+	t.Setenv("MOLECULE_LLM_USAGE_URL", "")
+	t.Setenv("MOLECULE_LLM_BASE_URL", "")
+	t.Setenv("MOLECULE_LLM_ANTHROPIC_BASE_URL", "")
+
+	if err := refreshEnvFromCP(); err != nil {
+		t.Fatalf("refreshEnvFromCP: %v", err)
+	}
+	err := assertManagedTenantHasLLMEnv()
+	if err == nil {
+		t.Fatal("expected MISSING_CP_LLM_ENV for partial env (3 of 4 keys), got nil")
+	}
+	if !strings.Contains(err.Error(), "MISSING_CP_LLM_ENV") {
+		t.Errorf("expected error to contain MISSING_CP_LLM_ENV, got: %v", err)
+	}
+	if !strings.Contains(err.Error(), "MOLECULE_LLM_ANTHROPIC_BASE_URL") {
+		t.Errorf("expected error to name the missing key MOLECULE_LLM_ANTHROPIC_BASE_URL, got: %v", err)
+	}
+}
+
+// TestAssertManagedTenantHasLLMEnv_NotManagedIsNoop: self-hosted
+// (no orgID/adminToken) must NOT block on missing LLM keys — dev
+// ergonomics matter and the assertion's contract is "managed only".
+func TestAssertManagedTenantHasLLMEnv_NotManagedIsNoop(t *testing.T) {
+	t.Setenv("MOLECULE_ORG_ID", "")
+	t.Setenv("ADMIN_TOKEN", "")
+	t.Setenv("MOLECULE_LLM_USAGE_TOKEN", "")
+	t.Setenv("MOLECULE_LLM_USAGE_URL", "")
+	t.Setenv("MOLECULE_LLM_BASE_URL", "")
+	t.Setenv("MOLECULE_LLM_ANTHROPIC_BASE_URL", "")
+	if err := assertManagedTenantHasLLMEnv(); err != nil {
+		t.Errorf("self-hosted (not managed) must not block, got: %v", err)
+	}
+}
+
 // TestRefreshEnvFromCP_CPUnreachableDoesNotFailBoot: network errors must
 // return non-nil BUT main.go treats that as warn-and-continue. We assert
 // the function returns an error (not a panic) so the caller can log.
@@ -82,6 +82,16 @@ func main() {
 		log.Printf("CP env refresh: %v (continuing with baked-in env)", err)
 	}

+	// Managed-tenant boot assertion (cp#469 — tenant proxy-env delivery).
+	// If we're a managed SaaS tenant (orgID + adminToken set), all required
+	// LLM proxy env vars must be present after refresh. Missing keys block
+	// the tenant from booting with broken LLM creds — silent-fail is worse
+	// than a loud refusal. Self-hosted (no orgID/adminToken) short-circuits
+	// inside the assertion, so this never fires for dev.
+	if err := assertManagedTenantHasLLMEnv(); err != nil {
+		log.Fatalf("Managed tenant boot assertion: %v", err)
+	}
+
 	// Secrets encryption. In MOLECULE_ENV=prod, boot refuses to start
 	// without a valid SECRETS_ENCRYPTION_KEY (fail-secure — Top-5 #5).
 	// In any other environment, missing keys just log a warning and
@@ -327,6 +337,25 @@ func main() {
 		})
 	}

+	// CP-mode instance-state reconciler — authoritative EC2-liveness pass
+	// for SaaS workspaces (core#2261). Every other liveness sweep keys off
+	// a PROXY (Redis TTL, agent heartbeat, local Docker, or
+	// runtime='external'); a SaaS claude-code workspace whose EC2 was
+	// terminated/stopped falls through ALL of them and stays status='online'
+	// pointing at a dead instance_id forever (root cause: core#2247). This
+	// loop asks the ONE authoritative question the others lack —
+	// cpProv.IsRunning (CP DescribeInstances-equivalent) — for each online
+	// SaaS row, and on a CLEAN "not running" feeds it into the SAME
+	// onWorkspaceOffline closure the other sweeps use (status flip +
+	// RestartByID reprovision, existing volume). Fail-safe: IsRunning is
+	// (true, err) on any transient error, so a CP blip never flips a healthy
+	// workspace.
+	if cpProv != nil {
+		go supervised.RunWithRecover(ctx, "cp-instance-reconciler", func(c context.Context) {
+			registry.StartCPInstanceReconciler(c, cpProv, onWorkspaceOffline, 60*time.Second)
+		})
+	}
+
 	// Pending-uploads GC sweep — deletes acked rows past their retention
 	// window plus unacked rows past expires_at. Without this the
 	// pending_uploads table grows unbounded; even with the 24h hard TTL,
@@ -359,7 +388,6 @@ func main() {
 	// (WorkspaceHandler.BootstrapFailed) wires its own capture inline.
 	registry.BootFailureRescueHook = handlers.BootFailureRescueHook

-
 	// Provision-timeout sweep — flips workspaces that have been stuck in
 	// status='provisioning' past the timeout window to 'failed' and emits
 	// WORKSPACE_PROVISION_TIMEOUT. Without this the UI banner is cosmetic
@@ -149,9 +149,11 @@ func markFailed(ctx context.Context, wsID string, broadcaster *events.Broadcaste
 		models.StatusFailed, msg, wsID); dbErr != nil {
 		log.Printf("bundle import: failed to mark workspace %s as failed: %v", wsID, dbErr)
 	}
-	broadcaster.RecordAndBroadcast(ctx, string(events.EventWorkspaceProvisionFailed), wsID, map[string]interface{}{
+	if bcErr := broadcaster.RecordAndBroadcast(ctx, string(events.EventWorkspaceProvisionFailed), wsID, map[string]interface{}{
 		"error": msg,
-	})
+	}); bcErr != nil {
+		log.Printf("bundle import: failed to broadcast provision failed for %s: %v", wsID, bcErr)
+	}
 }

 func nilIfEmpty(s string) interface{} {
@@ -407,12 +407,14 @@ func (m *Manager) HandleInbound(ctx context.Context, ch ChannelRow, msg *Inbound

 	// Broadcast event
 	if m.broadcaster != nil {
-		m.broadcaster.RecordAndBroadcast(ctx, string(events.EventChannelMessage), ch.WorkspaceID, map[string]interface{}{
+		if err := m.broadcaster.RecordAndBroadcast(ctx, string(events.EventChannelMessage), ch.WorkspaceID, map[string]interface{}{
 			"channel_id":   ch.ID,
 			"channel_type": ch.ChannelType,
 			"username":     msg.Username,
 			"direction":    "inbound",
-		})
+		}); err != nil {
+			log.Printf("Channels: failed to broadcast inbound event: %v", err)
+		}
 	}

 	return nil
@@ -453,11 +455,13 @@ func (m *Manager) SendOutbound(ctx context.Context, channelID string, text strin
 	}

 	if m.broadcaster != nil {
-		m.broadcaster.RecordAndBroadcast(ctx, string(events.EventChannelMessage), ch.WorkspaceID, map[string]interface{}{
+		if err := m.broadcaster.RecordAndBroadcast(ctx, string(events.EventChannelMessage), ch.WorkspaceID, map[string]interface{}{
 			"channel_id":   ch.ID,
 			"channel_type": ch.ChannelType,
 			"direction":    "outbound",
-		})
+		}); err != nil {
+			log.Printf("Channels: failed to broadcast outbound event: %v", err)
+		}
 	}

 	return nil
@@ -517,7 +517,9 @@ func (t *TelegramAdapter) StartPolling(ctx context.Context, config map[string]in

 				// Acknowledge the button press (removes loading spinner)
 				ackCfg := tgbotapi.NewCallback(cb.ID, "Received")
-				bot.Send(ackCfg)
+				if _, err := bot.Send(ackCfg); err != nil {
+					log.Printf("telegram: failed to send callback ack: %v", err)
+				}

 				// Update the message to show what was clicked
 				decision := "approved"
@@ -529,7 +531,9 @@ func (t *TelegramAdapter) StartPolling(ctx context.Context, config map[string]in
 					cb.Message.MessageID,
 					cb.Message.Text+"\n\n✅ CEO "+decision,
 				)
-				bot.Send(editMsg)
+				if _, err := bot.Send(editMsg); err != nil {
+					log.Printf("telegram: failed to send edit message: %v", err)
+				}

 				// Route the decision as an inbound message to the agent
 				inbound := &InboundMessage{
@@ -0,0 +1,141 @@
+package handlers
+
+// a2a_outbound_envelope_test.go — outbound A2A `message/send` envelope
+// CONTRACT gate (issue #2251).
+//
+// #2251: an outbound A2A envelope shipped without `role` and with text
+// parts keyed `type` instead of the v0.3-canonical `kind`. The receiver's
+// a-2-a-sdk v0.3 Pydantic validator silently rejected the message
+// post-dispatch — the sender saw a happy 200/202 while the brief was
+// dropped (the same invisible-rejection failure class as the v0.2→v0.3
+// content bug pinned by a2a_corpus_test.go, but on the SEND side).
+//
+// The inbound corpus replay (a2a_corpus_test.go) proves normalizeA2APayload
+// produces `parts[].kind` + a non-empty messageId, but it does NOT assert
+// `role`, and it only covers what we RECEIVE. Nothing pins what core
+// EMITS. This file pins the emit contract at the helper that builds the
+// parts (buildA2AMessageParts, used by both delegate_task and
+// delegate_task_async) and asserts the canonical Part key is `kind`.
+//
+// Part-object schema (A2A v0.3): every Part MUST carry a `kind`
+// discriminator ("text" | "file" | "data"); there is NO `type` key. A
+// text Part is {"kind":"text","text":"..."}. Emitting `type` makes the
+// v0.3 validator drop the Part.
+
+import (
+	"encoding/json"
+	"testing"
+)
+
+// TestBuildA2AMessageParts_TextPartUsesKindNotType pins the v0.3 Part
+// discriminator for the text part emitted on every outbound A2A
+// delegation. RED before #2251's fix (the helper emitted
+// {"type":"text",...}); the receiver's v0.3 Pydantic validator drops a
+// Part keyed `type`, silently losing the task text.
+func TestBuildA2AMessageParts_TextPartUsesKindNotType(t *testing.T) {
+	parts := buildA2AMessageParts("do the work", nil)
+	if len(parts) == 0 {
+		t.Fatal("buildA2AMessageParts returned no parts for a non-empty task")
+	}
+	text := parts[0]
+
+	if _, hasType := text["type"]; hasType {
+		t.Errorf("text part uses forbidden v0.2 key `type` %v — A2A v0.3 Parts discriminate on `kind`; `type` is dropped by the receiver's validator (#2251)", text)
+	}
+	kind, ok := text["kind"].(string)
+	if !ok {
+		t.Fatalf("text part missing string `kind` discriminator; got %v", text)
+	}
+	if kind != "text" {
+		t.Errorf("text part kind = %q, want \"text\"", kind)
+	}
+	if text["text"] != "do the work" {
+		t.Errorf("text part text = %v, want \"do the work\"", text["text"])
+	}
+}
+
+// TestBuildA2AMessageParts_FilePartUsesKind guards the file-attachment
+// Part the same way. The file path was already correct (it used `kind`),
+// so this is a non-regression pin — it must STAY `kind` when the text
+// path is fixed (a careless "make them consistent" edit could flip both
+// to the wrong key).
+func TestBuildA2AMessageParts_FilePartUsesKind(t *testing.T) {
+	atts := []AgentMessageAttachment{
+		{URI: "https://example.com/a.png", MimeType: "image/png", Name: "a.png"},
+	}
+	parts := buildA2AMessageParts("caption", atts)
+	if len(parts) < 2 {
+		t.Fatalf("expected text + file parts, got %d", len(parts))
+	}
+	file := parts[1]
+	if _, hasType := file["type"]; hasType {
+		t.Errorf("file part uses forbidden `type` key: %v", file)
+	}
+	if _, hasKind := file["kind"]; !hasKind {
+		t.Errorf("file part missing `kind` discriminator: %v", file)
+	}
+}
+
+// TestDelegationOutboundEnvelope_RoleAndKind pins the FULL outbound
+// envelope contract — role + parts[].kind — on the canonical helper.
+// A v0.3 `message` MUST carry `role` ("user" for a delegation request)
+// and `parts` whose every entry discriminates on `kind`. This is the
+// shape the receiver's MessageSendParams validator accepts; an envelope
+// missing `role` or keyed `type` is silently rejected (#2251).
+//
+// Built from the same primitives delegation.go / mcp_tools.go assemble
+// (role:"user" + buildA2AMessageParts) so the round-trip through
+// json.Marshal proves the wire bytes are v0.3-valid.
+func TestDelegationOutboundEnvelope_RoleAndKind(t *testing.T) {
+	envelope := map[string]interface{}{
+		"method": "message/send",
+		"params": map[string]interface{}{
+			"message": map[string]interface{}{
+				"role":      "user",
+				"messageId": "deleg-1",
+				"parts":     buildA2AMessageParts("do the work", nil),
+			},
+		},
+	}
+	raw, err := json.Marshal(envelope)
+	if err != nil {
+		t.Fatalf("marshal envelope: %v", err)
+	}
+	var parsed map[string]interface{}
+	if err := json.Unmarshal(raw, &parsed); err != nil {
+		t.Fatalf("unmarshal envelope: %v", err)
+	}
+
+	params, _ := parsed["params"].(map[string]interface{})
+	if params == nil {
+		t.Fatal("envelope missing params")
+	}
+	msg, _ := params["message"].(map[string]interface{})
+	if msg == nil {
+		t.Fatal("envelope missing params.message")
+	}
+
+	// role is mandatory on a v0.3 message — the receiver rejects without it.
+	role, hasRole := msg["role"].(string)
+	if !hasRole || role == "" {
+		t.Errorf("params.message missing non-empty `role` — v0.3 requires it; omitting it is the other half of #2251")
+	}
+
+	parts, _ := msg["parts"].([]interface{})
+	if len(parts) == 0 {
+		t.Fatal("params.message.parts is empty")
+	}
+	for i, p := range parts {
+		pm, _ := p.(map[string]interface{})
+		if pm == nil {
+			t.Errorf("part %d is not an object: %v", i, p)
+			continue
+		}
+		if _, hasType := pm["type"]; hasType {
+			t.Errorf("part %d uses forbidden `type` key (must be `kind`): %v", i, pm)
+		}
+		if _, hasKind := pm["kind"]; !hasKind {
+			t.Errorf("part %d missing `kind` discriminator: %v", i, pm)
+		}
+	}
+}
@@ -801,6 +801,18 @@ func normalizeA2APayload(body []byte) ([]byte, string, *proxyA2AError) {
 			if _, hasID := msg["messageId"]; !hasID {
 				msg["messageId"] = uuid.New().String()
 			}
+			// #2251: default params.message.role to "user" when absent.
+			// The downstream a2a-sdk v0.3 Pydantic validator marks role a
+			// REQUIRED field; a role-less envelope fails parse with
+			// "params.message.role Field required". The Go builders
+			// (mcp_tools/delegation/scheduler/channels) already set it, but
+			// raw external/canvas POSTs to ProxyA2A may omit it — making this
+			// the single canonical choke that guarantees a schema-valid role.
+			// Mirror the messageId default exactly: inject only when missing,
+			// never overwrite a caller-supplied role (e.g. "agent").
+			if _, hasRole := msg["role"]; !hasRole {
+				msg["role"] = "user"
+			}
 			_, hasParts := msg["parts"]
 			rawContent, hasContent := msg["content"]
 			if !hasParts {
@@ -832,6 +844,27 @@ func normalizeA2APayload(body []byte) ([]byte, string, *proxyA2AError) {
 					}
 				}
 			}
+			// #2251: wire hygiene — the A2A v0.3 Part discriminator is
+			// "kind", but some builders/clients emit the legacy "type" key
+			// (e.g. delegation.go). The v0.3 Pydantic validator keys on
+			// "kind"; a stray "type" leaves the Part untagged. Rename
+			// "type" → "kind" on every Part that lacks an explicit "kind"
+			// so the discriminator is always present on the wire.
+			if parts, ok := msg["parts"].([]interface{}); ok {
+				for _, p := range parts {
+					part, ok := p.(map[string]interface{})
+					if !ok {
+						continue
+					}
+					if _, hasKind := part["kind"]; hasKind {
+						continue
+					}
+					if t, hasType := part["type"]; hasType {
+						part["kind"] = t
+						delete(part, "type")
+					}
+				}
+			}
 		}
 	}

@@ -383,23 +383,48 @@ func (h *WorkspaceHandler) logA2ASuccess(ctx context.Context, workspaceID, calle
 	}
 	summary := a2aMethod + " → " + wsNameForLog
 	toolTrace := extractToolTrace(respBody)
-	parent := ctx
-	h.goAsync(func() {
-		logCtx, cancel := context.WithTimeout(context.WithoutCancel(parent), 30*time.Second)
-		defer cancel()
-		LogActivity(logCtx, h.broadcaster, ActivityParams{
-			WorkspaceID:  workspaceID,
-			ActivityType: "a2a_receive",
-			SourceID:     nilIfEmpty(callerID),
-			TargetID:     &workspaceID,
-			Method:       &a2aMethod,
-			Summary:      &summary,
-			RequestBody:  json.RawMessage(body),
-			ResponseBody: json.RawMessage(respBody),
-			ToolTrace:    toolTrace,
-			DurationMs:   &durationMs,
-			Status:       logStatus,
-		})
+
+	// DATA-LOSS FIX (internal#470 / #1347 push-mode sibling): this
+	// a2a_receive row is the ONLY durable record of a push-mode chat
+	// round-trip — request_body carries the user's message, response_body
+	// carries the agent's reply, and chat-history hydration
+	// (messagestore.PostgresMessageStore) reads BOTH back to rebuild the
+	// transcript on canvas reopen / reload. It MUST be written
+	// SYNCHRONOUSLY, before proxyA2ARequest returns and ProxyA2A flushes
+	// the 200 to the canvas — otherwise the canvas sees the reply
+	// acknowledged (and rendered optimistically) while the row is still
+	// racing in a detached goroutine, and a reload (or a workspace-server
+	// restart / deploy / OOM) between the 200 and the goroutine's commit
+	// loses the message permanently on reopen.
+	//
+	// This mirrors the discipline already applied to the poll-mode ingest
+	// path (logA2AReceiveQueued / persistUserMessageAtIngest); the
+	// push-mode counterpart was left async, which the E2E Chat
+	// "history persists across reload" test surfaced as an intermittent
+	// red (the reload out-raced the INSERT).
+	//
+	//   - context.WithoutCancel: a client disconnect on chat-exit (which
+	//     cancels the inbound request ctx) MUST NOT abort this write.
+	//   - SYNCHRONOUS (no goAsync): the row must be durable before the 200.
+	//   - Best-effort: LogActivity logs+swallows INSERT errors internally,
+	//     so a DB hiccup never blocks or fails the user's send — behaviour
+	//     for that one request is never worse than the pre-fix async path.
+	//   - The post-commit ACTIVITY_LOGGED broadcast still fires inside
+	//     LogActivity; the durable row is the truth the canvas re-reads.
+	logCtx, cancel := context.WithTimeout(context.WithoutCancel(ctx), 30*time.Second)
+	defer cancel()
+	LogActivity(logCtx, h.broadcaster, ActivityParams{
+		WorkspaceID:  workspaceID,
+		ActivityType: "a2a_receive",
+		SourceID:     nilIfEmpty(callerID),
+		TargetID:     &workspaceID,
+		Method:       &a2aMethod,
+		Summary:      &summary,
+		RequestBody:  json.RawMessage(body),
+		ResponseBody: json.RawMessage(respBody),
+		ToolTrace:    toolTrace,
+		DurationMs:   &durationMs,
+		Status:       logStatus,
 	})

 	if callerID == "" && statusCode < 400 {
@@ -1514,6 +1514,142 @@ func TestNormalizeA2APayload_NoMessageNoCheck(t *testing.T) {
 	}
 }

+// --- #2251: role default + part-kind hygiene contract tests ---
+//
+// These assert normalizeA2APayload is the single canonical Go choke that
+// guarantees a schema-valid outbound message/send envelope: it injects a
+// default params.message.role="user" when the sender omitted role (the bug
+// that made delegate_task fail the peer's a2a Pydantic validator with
+// "params.message.role Field required" while reply_to_workspace worked), and
+// it renames the legacy Part discriminator "type"→"kind" for wire hygiene.
+
+// normMsg is a small helper that runs normalizeA2APayload and returns the
+// resolved params.message map, failing the test on any normalization error.
+func normMsg(t *testing.T, raw string) map[string]interface{} {
+	t.Helper()
+	out, _, perr := normalizeA2APayload([]byte(raw))
+	if perr != nil {
+		t.Fatalf("normalizeA2APayload returned error: %+v", perr)
+	}
+	var parsed map[string]interface{}
+	if err := json.Unmarshal(out, &parsed); err != nil {
+		t.Fatalf("output not valid JSON: %v", err)
+	}
+	params, ok := parsed["params"].(map[string]interface{})
+	if !ok {
+		t.Fatalf("output missing params object: %s", string(out))
+	}
+	msg, ok := params["message"].(map[string]interface{})
+	if !ok {
+		t.Fatalf("output missing params.message object: %s", string(out))
+	}
+	return msg
+}
+
+func TestNormalizeA2APayload_DefaultsRoleWhenMissing(t *testing.T) {
+	cases := []struct {
+		name string
+		raw  string
+	}{
+		{
+			name: "v0.3 parts, no role",
+			raw:  `{"method":"message/send","params":{"message":{"parts":[{"kind":"text","text":"hi"}]}}}`,
+		},
+		{
+			name: "v0.2 string content, no role",
+			raw:  `{"method":"message/send","params":{"message":{"content":"hi"}}}`,
+		},
+		{
+			name: "legacy type part, no role",
+			raw:  `{"method":"message/send","params":{"message":{"parts":[{"type":"text","text":"hi"}]}}}`,
+		},
+		{
+			name: "already wrapped jsonrpc, no role",
+			raw:  `{"jsonrpc":"2.0","id":"x","method":"message/send","params":{"message":{"parts":[{"kind":"text","text":"hi"}]}}}`,
+		},
+	}
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			msg := normMsg(t, tc.raw)
+			if msg["role"] != "user" {
+				t.Errorf("expected role defaulted to \"user\", got %v", msg["role"])
+			}
+			// Parts must remain valid (non-empty) after normalization.
+			parts, ok := msg["parts"].([]interface{})
+			if !ok || len(parts) == 0 {
+				t.Fatalf("expected non-empty parts after normalization, got %v", msg["parts"])
+			}
+			// Every part must carry the v0.3 "kind" discriminator.
+			for i, p := range parts {
+				part, ok := p.(map[string]interface{})
+				if !ok {
+					t.Fatalf("part %d is not an object: %v", i, p)
+				}
+				if _, hasKind := part["kind"]; !hasKind {
+					t.Errorf("part %d missing \"kind\" discriminator: %v", i, part)
+				}
+				if _, hasType := part["type"]; hasType {
+					t.Errorf("part %d still has legacy \"type\" key: %v", i, part)
+				}
+			}
+		})
+	}
+}
+
+func TestNormalizeA2APayload_PreservesExplicitRole(t *testing.T) {
+	// A caller-supplied role (e.g. "agent") must NOT be overwritten with "user".
+	msg := normMsg(t, `{"method":"message/send","params":{"message":{"role":"agent","parts":[{"kind":"text","text":"hi"}]}}}`)
+	if msg["role"] != "agent" {
+		t.Errorf("explicit role overwritten: expected \"agent\", got %v", msg["role"])
+	}
+}
+
+func TestNormalizeA2APayload_RenamesPartTypeToKind(t *testing.T) {
+	// Mirrors delegation.go's builder which emits {"type":"text",...}. After
+	// normalization the wire Part must be discriminated by "kind".
+	msg := normMsg(t, `{"method":"message/send","params":{"message":{"role":"user","parts":[{"type":"text","text":"a"},{"type":"file","uri":"workspace:/x"}]}}}`)
+	parts := msg["parts"].([]interface{})
+	if len(parts) != 2 {
+		t.Fatalf("expected 2 parts, got %d", len(parts))
+	}
+	wantKind := []string{"text", "file"}
+	for i, p := range parts {
+		part := p.(map[string]interface{})
+		if part["kind"] != wantKind[i] {
+			t.Errorf("part %d: expected kind=%q, got %v", i, wantKind[i], part["kind"])
+		}
+		if _, hasType := part["type"]; hasType {
+			t.Errorf("part %d still carries legacy \"type\": %v", i, part)
+		}
+	}
+}
+
+func TestNormalizeA2APayload_DoesNotClobberKindWithType(t *testing.T) {
+	// If a part has BOTH kind and type, kind wins and is left untouched.
+	msg := normMsg(t, `{"method":"message/send","params":{"message":{"role":"user","parts":[{"kind":"text","type":"ignored","text":"a"}]}}}`)
+	part := msg["parts"].([]interface{})[0].(map[string]interface{})
+	if part["kind"] != "text" {
+		t.Errorf("expected kind preserved as \"text\", got %v", part["kind"])
+	}
+}
+
+// TestNormalizeA2APayload_RoleDefault_ContractRegression documents the
+// pre-fix failure: without the role default, a role-less message/send body
+// emerged from normalization still missing params.message.role, which the
+// peer's a2a Pydantic validator rejects. This asserts the POST-fix invariant
+// (role present) directly; before the a2a_proxy.go change this assertion
+// fails (role is absent → msg["role"] == nil).
+func TestNormalizeA2APayload_RoleDefault_ContractRegression(t *testing.T) {
+	msg := normMsg(t, `{"method":"message/send","params":{"message":{"parts":[{"kind":"text","text":"delegate this"}]}}}`)
+	role, hasRole := msg["role"]
+	if !hasRole {
+		t.Fatal("REGRESSION (#2251): params.message.role absent after normalization — peer a2a validator will reject with 'role Field required'")
+	}
+	if role != "user" {
+		t.Errorf("expected default role \"user\", got %v", role)
+	}
+}
+
 // --- resolveAgentURL direct unit tests ---

 func TestResolveAgentURL_CacheHit(t *testing.T) {
@@ -60,10 +60,10 @@ func sanitizeErrorDetailForBroadcast(s string) string {
 }

 type ActivityHandler struct {
-	broadcaster *events.Broadcaster
+	broadcaster events.EventEmitter
 }

-func NewActivityHandler(b *events.Broadcaster) *ActivityHandler {
+func NewActivityHandler(b events.EventEmitter) *ActivityHandler {
 	return &ActivityHandler{broadcaster: b}
 }

@@ -152,7 +152,7 @@ func extractAttachmentsFromMessageParts(body map[string]interface{}) []map[strin
 		if kind == "" {
 			kind, _ = part["type"].(string)
 		}
-		if kind != "file" && kind != "image" && kind != "audio" {
+		if kind != "file" && kind != "image" && kind != "audio" && kind != "video" {
 			continue
 		}
 		// The file sub-object holds uri/mime_type/name. The a2a-sdk v1
@@ -380,12 +380,18 @@ func (h *ActivityHandler) List(c *gin.Context) {
 	// "row not found" — both indicate the cursor is no longer usable for
 	// this caller, no information leak.
 	var cursorTime time.Time
+	var cursorSeq int64
 	usingCursor := false
 	if sinceID != "" {
+		// Resolve BOTH ordering-key components of the cursor row. The feed is
+		// ordered by (created_at, seq), so the strictly-after filter below must
+		// compare the full tuple — comparing created_at alone silently drops a
+		// row written in the SAME microsecond as the cursor row (the boundary
+		// skip the since_id E2E intermittently tripped over).
 		err := db.DB.QueryRowContext(c.Request.Context(),
-			`SELECT created_at FROM activity_logs WHERE id = $1 AND workspace_id = $2`,
+			`SELECT created_at, seq FROM activity_logs WHERE id = $1 AND workspace_id = $2`,
 			sinceID, workspaceID,
-		).Scan(&cursorTime)
+		).Scan(&cursorTime, &cursorSeq)
 		if errors.Is(err, sql.ErrNoRows) {
 			c.JSON(http.StatusGone, gin.H{
 				"error": "since_id cursor not found (row may have been pruned or belongs to a different workspace); omit since_id to reset",
@@ -492,10 +498,20 @@ func (h *ActivityHandler) List(c *gin.Context) {
 		argIdx++
 	}
 	if usingCursor {
-		// Strictly after — never replay the cursor row itself.
-		query += fmt.Sprintf(" AND "+actCol+"created_at > $%d", argIdx)
-		args = append(args, cursorTime)
-		argIdx++
+		// Strictly after the cursor on the FULL ordering key (created_at, seq).
+		// Tuple comparison: a row is "after" the cursor if its created_at is
+		// later, OR it shares the cursor's created_at but has a higher seq.
+		// This (a) never replays the cursor row itself and (b) — unlike a bare
+		// `created_at > cursor` — never drops a row written in the same
+		// microsecond as the cursor row. Expressed as the expanded boolean
+		// rather than a row-value `(created_at, seq) > ($t, $s)` so it composes
+		// with the actCol qualifier prefix and the existing placeholder/arg
+		// builder cleanly.
+		query += fmt.Sprintf(
+			" AND ("+actCol+"created_at > $%d OR ("+actCol+"created_at = $%d AND "+actCol+"seq > $%d))",
+			argIdx, argIdx, argIdx+1)
+		args = append(args, cursorTime, cursorSeq)
+		argIdx += 2
 	}

 	// Polling clients (since_id) need oldest-first within the new window so
@@ -503,9 +519,13 @@ func (h *ActivityHandler) List(c *gin.Context) {
 	// since_id) keeps DESC — that's the canvas/UI shape and changing it
 	// would surprise existing callers.
 	if usingCursor {
-		query += fmt.Sprintf(" ORDER BY "+actCol+"created_at ASC LIMIT $%d", argIdx)
+		// (created_at, seq) ASC — seq is the deterministic tiebreaker for rows
+		// sharing a microsecond-collided created_at. Replays in recorded order.
+		query += fmt.Sprintf(" ORDER BY "+actCol+"created_at ASC, "+actCol+"seq ASC LIMIT $%d", argIdx)
 	} else {
-		query += fmt.Sprintf(" ORDER BY "+actCol+"created_at DESC LIMIT $%d", argIdx)
+		// (created_at, seq) DESC — same tiebreaker, newest-first for the
+		// canvas/recent-feed shape.
+		query += fmt.Sprintf(" ORDER BY "+actCol+"created_at DESC, "+actCol+"seq DESC LIMIT $%d", argIdx)
 	}
 	args = append(args, limit)

@@ -680,7 +700,8 @@ func buildSessionSearchQuery(workspaceID, query string, limit int) (string, []in
 				COALESCE(status, '') AS status,
 				request_body,
 				response_body,
-				created_at
+				created_at,
+				seq
 			FROM activity_logs
 			WHERE workspace_id = $1
 		)
@@ -702,7 +723,13 @@ func buildSessionSearchQuery(workspaceID, query string, limit int) (string, []in
 		args = append(args, "%"+query+"%")
 	}

-	sqlQuery += ` ORDER BY created_at DESC LIMIT $` + strconv.Itoa(len(args)+1)
+	// Deterministic order: created_at alone is not unique (same-microsecond
+	// rows), so tie-break on the monotonic seq — same fix as the since_id feed
+	// (§ No flakes: no unstable sorts, even on an unused surface). `seq` is
+	// projected through the session_items CTE above so this outer ORDER BY can
+	// reference it — the outer SELECT can only sort on the CTE's output columns,
+	// not on activity_logs directly.
+	sqlQuery += ` ORDER BY created_at DESC, seq DESC LIMIT $` + strconv.Itoa(len(args)+1)
 	args = append(args, limit)
 	return sqlQuery, args
 }
@@ -118,6 +118,23 @@ func TestExtractAttachmentsFromRequestBody_ImageAndAudio(t *testing.T) {
 	}
 }

+func TestExtractAttachmentsFromRequestBody_VideoPart(t *testing.T) {
+	// Video parts are accepted in message-parts envelope (issue #2222).
+	body := []byte(`{"jsonrpc":"2.0","method":"message/send","params":{"message":{"parts":[
+		{"kind":"video","file":{"uri":"workspace:clip.mp4","mime_type":"video/mp4","name":"clip.mp4"}}
+	]}}}`)
+	atts := extractAttachmentsFromRequestBody(body)
+	if len(atts) != 1 {
+		t.Fatalf("want 1 attachment, got %d", len(atts))
+	}
+	if atts[0]["kind"] != "video" {
+		t.Errorf("kind: want video, got %v", atts[0]["kind"])
+	}
+	if atts[0]["uri"] != "workspace:clip.mp4" {
+		t.Errorf("uri mismatch: %v", atts[0]["uri"])
+	}
+}
+
 func TestExtractAttachmentsFromRequestBody_LegacyV0TypeDiscriminator(t *testing.T) {
 	// Legacy v0 shape: type=file (not kind), inlined fields (no nested .file)
 	body := []byte(`{"jsonrpc":"2.0","method":"message/send","params":{"message":{"parts":[
@@ -0,0 +1,211 @@
+//go:build integration
+// +build integration
+
+// activity_seq_backfill_integration_test.go — REAL Postgres proof of the
+// invariant the 20260604000000_activity_logs_seq.up.sql migration guarantees:
+// every activity_logs row carries a NON-NULL `seq`, both for rows that existed
+// before the migration ran (assigned during the ALTER TABLE rewrite) and for
+// rows created afterward via the normal INSERT path (assigned by the IDENTITY
+// default). This is the coverage CR2 (#2339 review) correctly flagged as
+// missing on PR #2258.
+//
+// WHY THIS IS A SEPARATE TEST from activity_since_id_ordering_integration_test.go:
+// that test pins the *ordering* contract (same-microsecond rows come back in a
+// deterministic (created_at, seq) order). THIS test pins the *backfill* contract
+// — that `seq` is never NULL — and the consequence the reviewer doubted: a
+// pre-existing/backfilled row is usable as a since_id cursor because its seq is
+// non-null, so the tuple cursor `(created_at, seq)` the handler builds is well
+// defined for it.
+//
+// EMPIRICAL BASIS (PostgreSQL 16.13, the prod PG version):
+//   - `ALTER TABLE activity_logs ADD COLUMN seq BIGINT GENERATED BY DEFAULT AS
+//     IDENTITY` rewrites the table and assigns seq to EXISTING rows in physical
+//     table-scan order — they are NON-NULL, not left NULL as the review claimed.
+//   - The identity sequence then advances ABOVE max(seq), so the next INSERT
+//     that omits seq gets max+1 with no collision.
+// Run against any Postgres 15/16 the integration harness boots — the property
+// holds on both.
+//
+// Run with (same harness as activity_delegation_a2a_integration_test.go):
+//
+//	docker run --rm -d --name pg-integration \
+//	  -e POSTGRES_PASSWORD=test -e POSTGRES_DB=molecule \
+//	  -p 55432:5432 postgres:15-alpine
+//	sleep 4
+//	# apply migrations (incl. 20260604000000_activity_logs_seq.up.sql) then:
+//	INTEGRATION_DB_URL="postgres://postgres:test@localhost:55432/molecule?sslmode=disable" \
+//	  go test -tags=integration ./internal/handlers/ -run Integration_ActivityLogs_Seq
+//
+// WATCH-IT-FAIL: if `seq` were left nullable / un-backfilled (the failure mode
+// the reviewer hypothesized), the NULL-count assertion in _NoNull trips, and
+// the since_id-on-a-backfilled-row case in _SinceIDOnBackfilledRow trips because
+// the handler cannot read a non-null seq for the cursor row. With the migration
+// as written both are green every run.
+
+package handlers
+
+import (
+	"context"
+	"encoding/json"
+	"net/http"
+	"testing"
+	"time"
+
+	"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/db"
+	"github.com/gin-gonic/gin"
+)
+
+// TestIntegration_ActivityLogs_SeqBackfill_NoNull pins the core migration
+// invariant: AFTER migrations have run, NO activity_logs row may have a NULL
+// seq — neither rows that the seedActivityRowAt path inserts (IDENTITY default)
+// nor any row the schema carries. It also proves the IDENTITY sequence keeps
+// producing distinct, non-null seq for fresh inserts (no collision, no NULL).
+//
+// This is the assertion that would FAIL if the ALTER had left existing rows
+// with NULL seq (the reviewer's claim) — table-scan backfill makes it pass.
+func TestIntegration_ActivityLogs_SeqBackfill_NoNull(t *testing.T) {
+	conn := integrationDB_ActivityDelegationA2A(t)
+	_ = conn
+	wsID := seedWorkspace(t, conn, "test-2151-seq-backfill-nonull")
+
+	// Insert several rows via the normal path. seq is left to the IDENTITY
+	// default — exactly how production writes activity_logs.
+	t0 := time.Date(2026, 6, 4, 9, 0, 0, 0, time.UTC)
+	const n = 5
+	ids := make([]string, 0, n)
+	for i := 0; i < n; i++ {
+		ids = append(ids, seedActivityRowAt(t, wsID, "backfill-row", t0.Add(time.Duration(i)*time.Second)))
+	}
+
+	// (a) No row in this workspace may have a NULL seq. If the column were
+	// un-backfilled / nullable this is > 0 and the test fails.
+	var nullCount int
+	if err := db.DB.QueryRowContext(context.Background(),
+		`SELECT COUNT(*) FROM activity_logs WHERE workspace_id = $1 AND seq IS NULL`,
+		wsID,
+	).Scan(&nullCount); err != nil {
+		t.Fatalf("null-seq count query: %v", err)
+	}
+	if nullCount != 0 {
+		t.Fatalf("found %d activity_logs rows with NULL seq — migration did NOT backfill/assign seq", nullCount)
+	}
+
+	// Belt-and-suspenders: the GLOBAL invariant (no NULL seq anywhere in the
+	// table) is what the migration actually guarantees. Assert it too, so a
+	// regression that nulls seq for rows written by some other path is caught.
+	var globalNull int
+	if err := db.DB.QueryRowContext(context.Background(),
+		`SELECT COUNT(*) FROM activity_logs WHERE seq IS NULL`,
+	).Scan(&globalNull); err != nil {
+		t.Fatalf("global null-seq count query: %v", err)
+	}
+	if globalNull != 0 {
+		t.Fatalf("found %d activity_logs rows table-wide with NULL seq — seq must be non-null for every row", globalNull)
+	}
+
+	// (b) The IDENTITY sequence yields DISTINCT, monotonic, non-null seq for
+	// the rows we just inserted (proves the normal insert path gets a real seq,
+	// and that the sequence advanced past any backfilled max instead of
+	// colliding). We read them back in insert order and require strictly
+	// increasing, all-non-null seq.
+	rows, err := db.DB.QueryContext(context.Background(),
+		`SELECT seq FROM activity_logs WHERE workspace_id = $1 ORDER BY created_at ASC, seq ASC`,
+		wsID,
+	)
+	if err != nil {
+		t.Fatalf("read-back seq query: %v", err)
+	}
+	defer rows.Close()
+	var seqs []int64
+	for rows.Next() {
+		var s *int64 // pointer so a NULL would scan as nil rather than 0
+		if err := rows.Scan(&s); err != nil {
+			t.Fatalf("scan seq: %v", err)
+		}
+		if s == nil {
+			t.Fatal("a freshly-inserted activity_logs row has NULL seq — IDENTITY default did not fire")
+		}
+		seqs = append(seqs, *s)
+	}
+	if err := rows.Err(); err != nil {
+		t.Fatalf("rows err: %v", err)
+	}
+	if len(seqs) != n {
+		t.Fatalf("expected %d rows, read back %d", n, len(seqs))
+	}
+	for i := 1; i < len(seqs); i++ {
+		if seqs[i] <= seqs[i-1] {
+			t.Fatalf("seq not strictly increasing in insert order: %v (IDENTITY collision / reuse)", seqs)
+		}
+	}
+}
+
+// TestIntegration_ActivityLogs_SeqBackfill_SinceIDOnBackfilledRow pins the
+// consequence the reviewer doubted: a row whose seq came from the migration /
+// IDENTITY (i.e. NOT explicitly set by the caller) is usable as a since_id
+// cursor, and a SECOND row sharing its exact created_at microsecond is returned
+// (not dropped). This proves the handler's (created_at, seq) tuple cursor
+// resolves a same-timestamp boundary that a created_at-only cursor would drop,
+// AND that the cursor row's seq is non-null (else the handler could not build
+// the tuple at all).
+//
+// Distinct from _BoundaryRowSameMicrosecondNotSkipped in the ordering test:
+// here the explicit angle under test is "the cursor row's seq is a
+// migration/IDENTITY-assigned (backfilled-style) value, non-null, and the
+// handler uses it" — i.e. the backfill behavior is what makes the boundary
+// resolution work, pinned head-on.
+func TestIntegration_ActivityLogs_SeqBackfill_SinceIDOnBackfilledRow(t *testing.T) {
+	conn := integrationDB_ActivityDelegationA2A(t)
+	_ = conn
+	wsID := seedWorkspace(t, conn, "test-2151-seq-backfill-sinceid")
+
+	tSame := time.Date(2026, 6, 4, 10, 0, 0, 0, time.UTC)
+	// Cursor row: seq comes purely from the IDENTITY default (never set by
+	// the caller) — the same assignment mechanism the migration uses to
+	// backfill pre-existing rows. The "next" row shares the exact created_at
+	// microsecond and is inserted afterward, so it gets a strictly higher seq.
+	cursorID := seedActivityRowAt(t, wsID, "sinceid-cursor", tSame)
+	nextID := seedActivityRowAt(t, wsID, "sinceid-next-same-us", tSame)
+
+	// Prove the precondition the reviewer doubted: the cursor row's seq is
+	// NON-NULL, so the handler can read it to build the (created_at, seq)
+	// tuple. If it were NULL the handler's cursor lookup would yield a NULL
+	// seq and the strictly-after tuple comparison would mis-behave.
+	var cursorSeq *int64
+	if err := db.DB.QueryRowContext(context.Background(),
+		`SELECT seq FROM activity_logs WHERE id = $1`, cursorID,
+	).Scan(&cursorSeq); err != nil {
+		t.Fatalf("read cursor seq: %v", err)
+	}
+	if cursorSeq == nil {
+		t.Fatal("cursor row has NULL seq — a since_id cursor on a backfilled-style row would be unusable")
+	}
+
+	h := NewActivityHandler(nil)
+	c, w := newTestGinContext()
+	c.Params = gin.Params{{Key: "id", Value: wsID}}
+	q := c.Request.URL.Query()
+	q.Set("since_id", cursorID)
+	q.Set("type", "a2a_receive")
+	q.Set("limit", "10")
+	c.Request.URL.RawQuery = q.Encode()
+
+	h.List(c)
+	if w.Code != http.StatusOK {
+		t.Fatalf("List returned %d, want 200: %s", w.Code, w.Body.String())
+	}
+	var resp []map[string]interface{}
+	if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
+		t.Fatalf("unmarshal: %v", err)
+	}
+	// Exactly the one same-microsecond row after the cursor — present (not
+	// dropped by a strict created_at-only filter) and the cursor itself
+	// excluded (strictly-after on the full tuple).
+	if len(resp) != 1 {
+		t.Fatalf("same-microsecond row after backfilled-style cursor dropped: expected 1 row, got %d: %+v",
+			len(resp), resp)
+	}
+	if got, _ := resp[0]["id"].(string); got != nextID {
+		t.Fatalf("expected boundary row id %s, got %s", nextID, got)
+	}
+}
@@ -0,0 +1,162 @@
+//go:build integration
+// +build integration
+
+// activity_since_id_ordering_integration_test.go — REAL Postgres proof that
+// the poll-mode since_id activity feed (#2339) is DETERMINISTICALLY ordered
+// even when multiple rows collide on the same created_at microsecond.
+//
+// This is the test that the original bug report mis-labeled a "flake".
+// sqlmock cannot catch it: sqlmock returns rows in the order the test stuffs
+// them, so it can never reveal a non-deterministic ORDER BY. Only a real
+// planner over real same-created_at rows exposes it.
+//
+// Run with (same harness as activity_delegation_a2a_integration_test.go):
+//
+//	docker run --rm -d --name pg-integration \
+//	  -e POSTGRES_PASSWORD=test -e POSTGRES_DB=molecule \
+//	  -p 55432:5432 postgres:15-alpine
+//	sleep 4
+//	# apply migrations (incl. 20260604000000_activity_logs_seq.up.sql) then:
+//	INTEGRATION_DB_URL="postgres://postgres:test@localhost:55432/molecule?sslmode=disable" \
+//	  go test -tags=integration ./internal/handlers/ -run Integration_SinceID
+//
+// WATCH-IT-FAIL: against the pre-fix handler (ORDER BY created_at only, no
+// seq tiebreaker, and `created_at > cursor` strict) this test is unstable —
+// the equal-created_at rows come back in arbitrary planner order so the
+// ordered-id assertion fails intermittently, and the same-microsecond
+// boundary row is dropped so the count assertion fails. With the fix
+// (ORDER BY created_at, seq + tuple cursor) it is green every run.
+
+package handlers
+
+import (
+	"context"
+	"encoding/json"
+	"net/http"
+	"testing"
+	"time"
+
+	"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/db"
+	"github.com/gin-gonic/gin"
+)
+
+// seedActivityRowAt inserts one activity_logs row with an explicit created_at
+// (so the test can force microsecond-equal collisions) and a unique summary;
+// returns the generated id. seq is left to the IDENTITY default — Postgres
+// assigns it in INSERT order, which is the deterministic tiebreaker under test.
+// db.DB has been hot-swapped to the integration connection by
+// integrationDB_ActivityDelegationA2A(t) in the calling test.
+func seedActivityRowAt(t *testing.T, wsID, summary string, createdAt time.Time) string {
+	t.Helper()
+	var id string
+	err := db.DB.QueryRowContext(context.Background(), `
+		INSERT INTO activity_logs (workspace_id, activity_type, summary, status, created_at)
+		VALUES ($1, 'a2a_receive', $2, 'ok', $3)
+		RETURNING id
+	`, wsID, summary, createdAt).Scan(&id)
+	if err != nil {
+		t.Fatalf("seedActivityRowAt(%q): %v", summary, err)
+	}
+	return id
+}
+
+// TestIntegration_SinceID_StableOrderingSameMicrosecond proves the feed is
+// deterministic when rows share a created_at, AND that the same-microsecond
+// boundary row immediately after the cursor is NOT dropped.
+func TestIntegration_SinceID_StableOrderingSameMicrosecond(t *testing.T) {
+	conn := integrationDB_ActivityDelegationA2A(t)
+	_ = conn
+	wsID := seedWorkspace(t, conn, "test-2151-sinceid-ordering")
+
+	// One earlier row to serve as the cursor (the "last processed" row).
+	tCursor := time.Date(2026, 6, 4, 12, 0, 0, 0, time.UTC)
+	cursorID := seedActivityRowAt(t, wsID, "cursor-row", tCursor)
+
+	// Three rows that ALL collide on the exact same created_at microsecond,
+	// inserted in a known order. Pre-fix, ORDER BY created_at alone returns
+	// these in arbitrary planner order.
+	tEqual := time.Date(2026, 6, 4, 12, 0, 1, 0, time.UTC)
+	idA := seedActivityRowAt(t, wsID, "equal-A", tEqual)
+	idB := seedActivityRowAt(t, wsID, "equal-B", tEqual)
+	idCc := seedActivityRowAt(t, wsID, "equal-C", tEqual)
+	wantOrder := []string{idA, idB, idCc}
+
+	// Drive the handler exactly as a polling client would.
+	h := NewActivityHandler(nil)
+	c, w := newTestGinContext()
+	c.Params = gin.Params{{Key: "id", Value: wsID}}
+	q := c.Request.URL.Query()
+	q.Set("since_id", cursorID)
+	q.Set("type", "a2a_receive")
+	q.Set("limit", "10")
+	c.Request.URL.RawQuery = q.Encode()
+
+	h.List(c)
+	if w.Code != http.StatusOK {
+		t.Fatalf("List returned %d, want 200: %s", w.Code, w.Body.String())
+	}
+	var resp []map[string]interface{}
+	if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
+		t.Fatalf("unmarshal: %v", err)
+	}
+
+	// All three equal-created_at rows must be present (boundary not dropped)
+	// and the cursor row itself must be excluded (strictly-after).
+	if len(resp) != len(wantOrder) {
+		t.Fatalf("expected %d rows after cursor (the 3 equal-created_at rows), got %d: %+v",
+			len(wantOrder), len(resp), resp)
+	}
+
+	gotOrder := make([]string, len(resp))
+	for i, row := range resp {
+		idVal, _ := row["id"].(string)
+		gotOrder[i] = idVal
+	}
+	for i := range wantOrder {
+		if gotOrder[i] != wantOrder[i] {
+			t.Fatalf("non-deterministic ordering: got id order %v, want %v (seq tiebreaker not applied)",
+				gotOrder, wantOrder)
+		}
+	}
+}
+
+// TestIntegration_SinceID_BoundaryRowSameMicrosecondNotSkipped isolates the
+// cursor-boundary bug: a row written in the SAME microsecond as the cursor
+// row (but with a higher seq) must still be returned. Pre-fix the strict
+// `created_at > cursor` filter silently dropped it.
+func TestIntegration_SinceID_BoundaryRowSameMicrosecondNotSkipped(t *testing.T) {
+	conn := integrationDB_ActivityDelegationA2A(t)
+	_ = conn
+	wsID := seedWorkspace(t, conn, "test-2151-sinceid-boundary")
+
+	tSame := time.Date(2026, 6, 4, 13, 0, 0, 0, time.UTC)
+	// Cursor row and the next row share the exact same created_at; the next
+	// row is inserted afterwards so it gets a higher seq.
+	cursorID := seedActivityRowAt(t, wsID, "boundary-cursor", tSame)
+	nextID := seedActivityRowAt(t, wsID, "boundary-next-same-us", tSame)
+
+	h := NewActivityHandler(nil)
+	c, w := newTestGinContext()
+	c.Params = gin.Params{{Key: "id", Value: wsID}}
+	q := c.Request.URL.Query()
+	q.Set("since_id", cursorID)
+	q.Set("type", "a2a_receive")
+	q.Set("limit", "10")
+	c.Request.URL.RawQuery = q.Encode()
+
+	h.List(c)
+	if w.Code != http.StatusOK {
+		t.Fatalf("List returned %d, want 200: %s", w.Code, w.Body.String())
+	}
+	var resp []map[string]interface{}
+	if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
+		t.Fatalf("unmarshal: %v", err)
+	}
+	if len(resp) != 1 {
+		t.Fatalf("same-microsecond boundary row dropped: expected exactly the 1 next row, got %d rows: %+v",
+			len(resp), resp)
+	}
+	if got, _ := resp[0]["id"].(string); got != nextID {
+		t.Fatalf("expected boundary row id %s, got %s", nextID, got)
+	}
+}
@@ -26,17 +26,21 @@ func TestActivityHandler_SinceID_ReturnsNewerASC(t *testing.T) {

 	cursorID := "act-cursor-42"
 	cursorTime := time.Date(2026, 4, 30, 5, 0, 0, 0, time.UTC)
+	cursorSeq := int64(42)

 	// Step 1: cursor lookup — must include workspace_id scope so a UUID
-	// from another workspace can't be used.
-	mock.ExpectQuery(`SELECT created_at FROM activity_logs WHERE id = \$1 AND workspace_id = \$2`).
+	// from another workspace can't be used. Now resolves BOTH ordering-key
+	// components (created_at, seq) so the strictly-after filter can compare
+	// the full tuple.
+	mock.ExpectQuery(`SELECT created_at, seq FROM activity_logs WHERE id = \$1 AND workspace_id = \$2`).
 		WithArgs(cursorID, "ws-1").
-		WillReturnRows(sqlmock.NewRows([]string{"created_at"}).AddRow(cursorTime))
+		WillReturnRows(sqlmock.NewRows([]string{"created_at", "seq"}).AddRow(cursorTime, cursorSeq))

-	// Step 2: main query with the cursor's created_at as a > filter,
-	// ASC ordering. Args: workspace_id, cursorTime, limit.
+	// Step 2: main query with the cursor's (created_at, seq) as a tuple
+	// strictly-after filter, (created_at, seq) ASC ordering.
+	// Args: workspace_id, cursorTime, cursorSeq, limit.
 	mock.ExpectQuery("SELECT id, workspace_id, activity_type").
-		WithArgs("ws-1", cursorTime, 100).
+		WithArgs("ws-1", cursorTime, cursorSeq, 100).
 		WillReturnRows(newActivityRows())

 	broadcaster := newTestBroadcaster()
@@ -64,7 +68,7 @@ func TestActivityHandler_SinceID_ReturnsNewerASC(t *testing.T) {
 func TestActivityHandler_SinceID_CursorNotFound_410(t *testing.T) {
 	mock := setupTestDB(t)

-	mock.ExpectQuery(`SELECT created_at FROM activity_logs WHERE id = \$1 AND workspace_id = \$2`).
+	mock.ExpectQuery(`SELECT created_at, seq FROM activity_logs WHERE id = \$1 AND workspace_id = \$2`).
 		WithArgs("act-gone", "ws-1").
 		WillReturnError(sql.ErrNoRows)

@@ -96,7 +100,7 @@ func TestActivityHandler_SinceID_CrossWorkspaceCursor_410(t *testing.T) {

 	// Cursor exists in DB but the WHERE workspace_id = $2 filter excludes
 	// it — sqlmock returns no rows, which is what Postgres would do.
-	mock.ExpectQuery(`SELECT created_at FROM activity_logs WHERE id = \$1 AND workspace_id = \$2`).
+	mock.ExpectQuery(`SELECT created_at, seq FROM activity_logs WHERE id = \$1 AND workspace_id = \$2`).
 		WithArgs("act-other-ws", "ws-1").
 		WillReturnError(sql.ErrNoRows)

@@ -120,20 +124,23 @@ func TestActivityHandler_SinceID_CrossWorkspaceCursor_410(t *testing.T) {

 // TestActivityHandler_SinceID_CombinedWithSinceSecs: both filters apply
 // together (AND). Argument order in the main query: workspace_id,
-// since_secs, cursorTime, limit. Sanity-checks the placeholder index
-// arithmetic in the query builder.
+// since_secs, cursorTime, cursorSeq, limit. Sanity-checks the placeholder
+// index arithmetic in the query builder (the cursor now binds TWO args —
+// the (created_at, seq) tuple — so since_secs no longer shifts the tail by
+// one but by two).
 func TestActivityHandler_SinceID_CombinedWithSinceSecs(t *testing.T) {
 	mock := setupTestDB(t)

 	cursorID := "act-c"
 	cursorTime := time.Date(2026, 4, 30, 4, 0, 0, 0, time.UTC)
+	cursorSeq := int64(7)

-	mock.ExpectQuery(`SELECT created_at FROM activity_logs WHERE id = \$1 AND workspace_id = \$2`).
+	mock.ExpectQuery(`SELECT created_at, seq FROM activity_logs WHERE id = \$1 AND workspace_id = \$2`).
 		WithArgs(cursorID, "ws-1").
-		WillReturnRows(sqlmock.NewRows([]string{"created_at"}).AddRow(cursorTime))
+		WillReturnRows(sqlmock.NewRows([]string{"created_at", "seq"}).AddRow(cursorTime, cursorSeq))

 	mock.ExpectQuery("SELECT id, workspace_id, activity_type").
-		WithArgs("ws-1", 600, cursorTime, 100).
+		WithArgs("ws-1", 600, cursorTime, cursorSeq, 100).
 		WillReturnRows(newActivityRows())

 	broadcaster := newTestBroadcaster()
@@ -54,23 +54,29 @@ func (h *ApprovalsHandler) Create(c *gin.Context) {
 		return
 	}

-	h.broadcaster.RecordAndBroadcast(ctx, string(events.EventApprovalRequested), workspaceID, map[string]interface{}{
+	if err := h.broadcaster.RecordAndBroadcast(ctx, string(events.EventApprovalRequested), workspaceID, map[string]interface{}{
 		"approval_id": approvalID,
 		"action":      body.Action,
 		"reason":      body.Reason,
 		"task_id":     body.TaskID,
-	})
+	}); err != nil {
+		log.Printf("approvals: failed to broadcast approval requested: %v", err)
+	}

 	// Auto-escalate to parent
 	var parentID *string
-	db.DB.QueryRowContext(ctx, `SELECT parent_id FROM workspaces WHERE id = $1`, workspaceID).Scan(&parentID)
+	if err := db.DB.QueryRowContext(ctx, `SELECT parent_id FROM workspaces WHERE id = $1`, workspaceID).Scan(&parentID); err != nil {
+		log.Printf("approvals: failed to lookup parent for escalation: %v", err)
+	}
 	if parentID != nil {
-		h.broadcaster.RecordAndBroadcast(ctx, string(events.EventApprovalEscalated), *parentID, map[string]interface{}{
+		if err := h.broadcaster.RecordAndBroadcast(ctx, string(events.EventApprovalEscalated), *parentID, map[string]interface{}{
 			"approval_id":       approvalID,
 			"from_workspace_id": workspaceID,
 			"action":            body.Action,
 			"reason":            body.Reason,
-		})
+		}); err != nil {
+			log.Printf("approvals: failed to broadcast approval escalated: %v", err)
+		}
 	}

 	c.JSON(http.StatusCreated, gin.H{"approval_id": approvalID, "status": "pending"})
@@ -221,11 +227,13 @@ func (h *ApprovalsHandler) Decide(c *gin.Context) {
 		eventType = "APPROVAL_DENIED"
 	}

-	h.broadcaster.RecordAndBroadcast(ctx, eventType, workspaceID, map[string]interface{}{
+	if err := h.broadcaster.RecordAndBroadcast(ctx, eventType, workspaceID, map[string]interface{}{
 		"approval_id": approvalID,
 		"decision":    body.Decision,
 		"decided_by":  decidedBy,
-	})
+	}); err != nil {
+		log.Printf("approvals: failed to broadcast approval decision: %v", err)
+	}

 	c.JSON(http.StatusOK, gin.H{"status": body.Decision, "approval_id": approvalID})
 }
@@ -102,10 +102,10 @@ func pushDelegationResultToInbox(ctx context.Context, sourceID, delegationID, st
 // and the A2A request runs in the background.
 type DelegationHandler struct {
 	workspace   *WorkspaceHandler
-	broadcaster *events.Broadcaster
+	broadcaster events.EventEmitter
 }

-func NewDelegationHandler(wh *WorkspaceHandler, b *events.Broadcaster) *DelegationHandler {
+func NewDelegationHandler(wh *WorkspaceHandler, b events.EventEmitter) *DelegationHandler {
 	return &DelegationHandler{workspace: wh, broadcaster: b}
 }

@@ -179,8 +179,11 @@ func (h *DelegationHandler) Delegate(c *gin.Context) {
 			"message": map[string]interface{}{
 				"role":      "user",
 				"messageId": delegationID,
-				"parts":     []map[string]interface{}{{"type": "text", "text": body.Task}},
-				"metadata":  map[string]interface{}{"delegation_id": delegationID},
+				// A2A v0.3 Part discriminator is `kind`, NOT `type` (#2251) —
+				// a `type`-keyed Part is dropped by the receiver's v0.3
+				// validator, silently losing the delegated task.
+				"parts":    []map[string]interface{}{{"kind": "text", "text": body.Task}},
+				"metadata": map[string]interface{}{"delegation_id": delegationID},
 			},
 		},
 	})
@@ -36,7 +36,6 @@ package handlers
 import (
 	"context"
 	"database/sql"
-	"os"
 	"strings"
 	"testing"
 	"time"
@@ -57,10 +56,7 @@ import (
 // directly rather than going through the package global.
 func integrationDB(t *testing.T) *sql.DB {
 	t.Helper()
-	url := os.Getenv("INTEGRATION_DB_URL")
-	if url == "" {
-		t.Skip("INTEGRATION_DB_URL not set; skipping (local devs: see file header)")
-	}
+	url := requireIntegrationDBURL(t)
 	conn, err := sql.Open("postgres", url)
 	if err != nil {
 		t.Fatalf("open: %v", err)
@@ -0,0 +1,40 @@
+//go:build integration
+// +build integration
+
+// integration_helper_test.go — shared preflight for handler Postgres
+// integration tests. Extracted so the fail-open/skip logic is in ONE place
+// and can be tightened without editing every integration test file.
+//
+// See delegation_ledger_integration_test.go for the docker-postgres setup
+// incantation used by local devs.
+
+package handlers
+
+import (
+	"os"
+	"testing"
+)
+
+// requireIntegrationDBURL returns $INTEGRATION_DB_URL.
+//
+// In CI (CI, GITHUB_ACTIONS, or GITEA_ACTIONS env var is non-empty), an
+// empty URL is a fatal error — it means the workflow failed to export the
+// variable (postgres container did not start, bridge IP resolution failed,
+// or a regression in the workflow YAML). t.Fatalf keeps the test red so the
+// failure is visible; t.Skip would silently pass and mask the defect.
+//
+// Locally (none of the three CI markers set), an empty URL skips the test
+// so devs can run `go test ./...` without booting a Postgres container.
+func requireIntegrationDBURL(t *testing.T) string {
+	t.Helper()
+	url := os.Getenv("INTEGRATION_DB_URL")
+	if url == "" {
+		if os.Getenv("CI") != "" ||
+			os.Getenv("GITHUB_ACTIONS") != "" ||
+			os.Getenv("GITEA_ACTIONS") != "" {
+			t.Fatalf("INTEGRATION_DB_URL required in CI handler integration tests — check workflow env export")
+		}
+		t.Skip("INTEGRATION_DB_URL not set; skipping (local devs: see file header)")
+	}
+	return url
+}
@@ -126,6 +126,32 @@ var mcpAllTools = []mcpTool{
 					"type":        "string",
 					"description": "The task description to send to the target workspace",
 				},
+				"attachments": map[string]interface{}{
+					"type":        "array",
+					"description": "Optional files to send with the task. Each item must include uri and name; mimeType and size are optional.",
+					"items": map[string]interface{}{
+						"type": "object",
+						"properties": map[string]interface{}{
+							"uri": map[string]interface{}{
+								"type":        "string",
+								"description": "Workspace attachment URI, usually workspace:/absolute/path",
+							},
+							"name": map[string]interface{}{
+								"type":        "string",
+								"description": "Display filename",
+							},
+							"mimeType": map[string]interface{}{
+								"type":        "string",
+								"description": "Optional MIME type",
+							},
+							"size": map[string]interface{}{
+								"type":        "number",
+								"description": "Optional file size in bytes",
+							},
+						},
+						"required": []string{"uri", "name"},
+					},
+				},
 			},
 			"required": []string{"workspace_id", "task"},
 		},
@@ -144,6 +170,32 @@ var mcpAllTools = []mcpTool{
 					"type":        "string",
 					"description": "The task description to send to the target workspace",
 				},
+				"attachments": map[string]interface{}{
+					"type":        "array",
+					"description": "Optional files to send with the task. Each item must include uri and name; mimeType and size are optional.",
+					"items": map[string]interface{}{
+						"type": "object",
+						"properties": map[string]interface{}{
+							"uri": map[string]interface{}{
+								"type":        "string",
+								"description": "Workspace attachment URI, usually workspace:/absolute/path",
+							},
+							"name": map[string]interface{}{
+								"type":        "string",
+								"description": "Display filename",
+							},
+							"mimeType": map[string]interface{}{
+								"type":        "string",
+								"description": "Optional MIME type",
+							},
+							"size": map[string]interface{}{
+								"type":        "number",
+								"description": "Optional file size in bytes",
+							},
+						},
+						"required": []string{"uri", "name"},
+					},
+				},
 			},
 			"required": []string{"workspace_id", "task"},
 		},
@@ -285,6 +285,121 @@ func TestMCPHandler_DelegateTaskAsync_RoutesThroughPlatformA2AProxy(t *testing.T
 // goroutine returns early and never calls proxyA2ARequest with a nil/empty
 // body. Before the fix the goroutine logged the error and fell through,
 // dispatching a malformed A2A request.
+
+func TestMCPHandler_DelegateTask_WithAttachments(t *testing.T) {
+	h, mock := newMCPHandler(t)
+	callerID := "11111111-1111-1111-1111-111111111111"
+	targetID := "22222222-2222-2222-2222-222222222222"
+	parentID := "33333333-3333-3333-3333-333333333333"
+
+	expectCanCommunicateSiblings(mock, callerID, targetID, parentID)
+	mock.ExpectExec(`(?s)INSERT INTO activity_logs.*'delegation'.*'delegate'`).
+		WithArgs(callerID, callerID, targetID, "Delegating to "+targetID, sqlmock.AnyArg(), "pending").
+		WillReturnResult(sqlmock.NewResult(1, 1))
+	mock.ExpectExec(`UPDATE activity_logs`).
+		WithArgs("dispatched", "", callerID, sqlmock.AnyArg()).
+		WillReturnResult(sqlmock.NewResult(0, 1))
+
+	h.a2aProxy = func(ctx context.Context, workspaceID string, body []byte, proxyCallerID string, logActivity bool) (int, []byte, error) {
+		if workspaceID != targetID || proxyCallerID != callerID {
+			t.Fatalf("unexpected proxy route target=%q caller=%q", workspaceID, proxyCallerID)
+		}
+		bodyStr := string(body)
+		if !strings.Contains(bodyStr, `"text":"review this video"`) {
+			t.Fatalf("A2A body missing task text: %s", bodyStr)
+		}
+		if !strings.Contains(bodyStr, `"kind":"video"`) {
+			t.Fatalf("A2A body missing video attachment kind: %s", bodyStr)
+		}
+		if !strings.Contains(bodyStr, `"uri":"workspace:/tmp/clip.mp4"`) {
+			t.Fatalf("A2A body missing attachment uri: %s", bodyStr)
+		}
+		if !strings.Contains(bodyStr, `"mime_type":"video/mp4"`) {
+			t.Fatalf("A2A body missing attachment mime_type: %s", bodyStr)
+		}
+		return 200, []byte(`{"result":{"message":{"parts":[{"text":"done"}]}}}`), nil
+	}
+
+	out, err := h.toolDelegateTask(context.Background(), callerID, map[string]interface{}{
+		"workspace_id": targetID,
+		"task":         "review this video",
+		"attachments": []interface{}{
+			map[string]interface{}{
+				"uri":      "workspace:/tmp/clip.mp4",
+				"name":     "clip.mp4",
+				"mimeType": "video/mp4",
+				"size":     12345,
+			},
+		},
+	}, mcpCallTimeout)
+	if err != nil {
+		t.Fatalf("delegate_task returned error: %v", err)
+	}
+	if out != "done" {
+		t.Fatalf("delegate_task response = %q, want done", out)
+	}
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Fatalf("unmet expectations: %v", err)
+	}
+}
+
+func TestMCPHandler_DelegateTaskAsync_WithAttachments(t *testing.T) {
+	h, mock := newMCPHandler(t)
+	callerID := "11111111-1111-1111-1111-111111111111"
+	targetID := "22222222-2222-2222-2222-222222222222"
+	parentID := "33333333-3333-3333-3333-333333333333"
+
+	expectCanCommunicateSiblings(mock, callerID, targetID, parentID)
+	mock.ExpectExec(`(?s)INSERT INTO activity_logs.*'delegation'.*'delegate'`).
+		WithArgs(callerID, callerID, targetID, "Delegating to "+targetID, sqlmock.AnyArg(), "pending").
+		WillReturnResult(sqlmock.NewResult(1, 1))
+	mock.ExpectExec(`UPDATE activity_logs`).
+		WithArgs("dispatched", "", callerID, sqlmock.AnyArg()).
+		WillReturnResult(sqlmock.NewResult(0, 1))
+
+	called := make(chan []byte, 1)
+	h.a2aProxy = func(ctx context.Context, workspaceID string, body []byte, proxyCallerID string, logActivity bool) (int, []byte, error) {
+		if workspaceID != targetID || proxyCallerID != callerID {
+			t.Fatalf("unexpected proxy route target=%q caller=%q", workspaceID, proxyCallerID)
+		}
+		called <- body
+		return 200, []byte(`{"result":{"message":{"parts":[{"text":"accepted"}]}}}`), nil
+	}
+
+	out, err := h.toolDelegateTaskAsync(context.Background(), callerID, map[string]interface{}{
+		"workspace_id": targetID,
+		"task":         "async work with image",
+		"attachments": []interface{}{
+			map[string]interface{}{
+				"uri":      "workspace:/tmp/screenshot.png",
+				"name":     "screenshot.png",
+				"mimeType": "image/png",
+			},
+		},
+	})
+	if err != nil {
+		t.Fatalf("delegate_task_async returned error: %v", err)
+	}
+	if !strings.Contains(out, `"status":"dispatched"`) {
+		t.Fatalf("delegate_task_async response = %s", out)
+	}
+	waitGlobalAsyncForTest()
+	select {
+	case body := <-called:
+		bodyStr := string(body)
+		if !strings.Contains(bodyStr, `"kind":"image"`) {
+			t.Fatalf("A2A body missing image attachment kind: %s", bodyStr)
+		}
+		if !strings.Contains(bodyStr, `"uri":"workspace:/tmp/screenshot.png"`) {
+			t.Fatalf("A2A body missing attachment uri: %s", bodyStr)
+		}
+	default:
+		t.Fatal("async delegate did not call platform A2A proxy")
+	}
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Fatalf("unmet expectations: %v", err)
+	}
+}
 func TestMCPHandler_DelegateTaskAsync_MarshalFailureDoesNotCallProxy(t *testing.T) {
 	h, mock := newMCPHandler(t)
 	callerID := "11111111-1111-1111-1111-111111111111"
@@ -187,6 +187,32 @@ func (h *MCPHandler) toolGetWorkspaceInfo(ctx context.Context, workspaceID strin
 	return string(b), nil
 }

+// buildA2AMessageParts constructs the A2A message parts array from a task string
+// and optional attachments. The text part always comes first; attachment parts
+// follow in the order provided, with kind derived from MIME type.
+func buildA2AMessageParts(task string, attachments []AgentMessageAttachment) []map[string]interface{} {
+	parts := []map[string]interface{}{
+		// A2A v0.3 Part discriminator is `kind`, NOT `type` (#2251).
+		// The receiver's v0.3 Pydantic validator drops a Part keyed
+		// `type`, silently losing the task text — the file part below
+		// already uses `kind`, this is the matching fix for text.
+		{"kind": "text", "text": task},
+	}
+	for _, att := range attachments {
+		kind := kindFromMimeType(att.MimeType)
+		filePart := map[string]interface{}{
+			"kind": kind,
+			"file": map[string]interface{}{
+				"uri":       att.URI,
+				"mime_type": att.MimeType,
+				"name":      att.Name,
+			},
+		}
+		parts = append(parts, filePart)
+	}
+	return parts
+}
+
 func (h *MCPHandler) toolDelegateTask(ctx context.Context, callerID string, args map[string]interface{}, timeout time.Duration) (string, error) {
 	targetID, _ := args["workspace_id"].(string)
 	task, _ := args["task"].(string)
@@ -208,6 +234,8 @@ func (h *MCPHandler) toolDelegateTask(ctx context.Context, callerID string, args
 		// Non-fatal: still make the A2A call even if activity log write fails.
 	}

+	attachments, _ := parseAgentMessageAttachments(args["attachments"])
+
 	a2aBody, err := json.Marshal(map[string]interface{}{
 		"jsonrpc": "2.0",
 		"id":      uuid.New().String(),
@@ -215,7 +243,7 @@ func (h *MCPHandler) toolDelegateTask(ctx context.Context, callerID string, args
 		"params": map[string]interface{}{
 			"message": map[string]interface{}{
 				"role":      "user",
-				"parts":     []map[string]interface{}{{"type": "text", "text": task}},
+				"parts":     buildA2AMessageParts(task, attachments),
 				"messageId": uuid.New().String(),
 			},
 		},
@@ -275,6 +303,8 @@ func (h *MCPHandler) toolDelegateTaskAsync(ctx context.Context, callerID string,
 		bgCtx, cancel := context.WithTimeout(context.Background(), mcpAsyncCallTimeout)
 		defer cancel()

+		attachments, _ := parseAgentMessageAttachments(args["attachments"])
+
 		a2aBody, marshalErr := marshalA2ABody(map[string]interface{}{
 			"jsonrpc": "2.0",
 			"id":      delegationID,
@@ -282,7 +312,7 @@ func (h *MCPHandler) toolDelegateTaskAsync(ctx context.Context, callerID string,
 			"params": map[string]interface{}{
 				"message": map[string]interface{}{
 					"role":      "user",
-					"parts":     []map[string]interface{}{{"type": "text", "text": task}},
+					"parts":     buildA2AMessageParts(task, attachments),
 					"messageId": uuid.New().String(),
 				},
 			},
@@ -17,19 +17,30 @@ package handlers

 import (
 	"fmt"
+	"sort"
 	"strings"
 )

 // validateRegisteredModelForRuntime reports whether (runtime, model) is
 // selectable per the provider registry. Returns:
 //
-//	(true,  "")     — allowed: model is registered for this runtime, OR the
-//	                  runtime is not in the registry (fail-open), OR model=="".
-//	(false, reason) — rejected: the runtime IS registered but the model is not
-//	                  in its native ModelsForRuntime set.
+//	(true,  "")     — allowed: model is on the runtime's platform menu
+//	                  (ModelsForRuntime) OR DeriveProvider(runtime, model)
+//	                  RESOLVES a native provider (the cp#529 routability-aware
+//	                  BYOK path), OR the runtime is not in the registry
+//	                  (fail-open), OR model=="".
+//	(false, reason) — rejected: the runtime IS registered, the model is not on
+//	                  its platform menu, AND no native provider prefix-owns it
+//	                  (genuinely unroutable).
 //
 // model=="" is allowed here: the MODEL_REQUIRED gate owns the empty-model case,
 // so this validator must not double-reject it.
+//
+// ROUTABILITY-AWARE (cp#529, CTO Option C): the final predicate is an OR —
+// `model ∈ ModelsForRuntime(runtime)` OR `DeriveProvider(runtime, model, nil)`
+// resolves. The platform menu carries platform-billed ids; the DeriveProvider
+// path covers BYOK ids that prefix-match a name-only native arm (no platform
+// billing). The drift checker in molecule-controlplane mirrors this exact OR.
 func validateRegisteredModelForRuntime(runtime, model string) (bool, string) {
 	model = strings.TrimSpace(model)
 	if model == "" {
@@ -51,7 +62,117 @@ func validateRegisteredModelForRuntime(runtime, model string) (bool, string) {
 			return true, ""
 		}
 	}
+	// ROUTABILITY-AWARE allow path (cp#529, CTO-approved Option C). The model is
+	// NOT on the runtime's platform menu (ModelsForRuntime) — but a model can be
+	// legitimately SELECTABLE without being a platform-menu id: a BYOK id whose
+	// prefix matches one of the runtime's NATIVE provider arms (a name-only arm
+	// added in providers.yaml) resolves to a concrete provider via DeriveProvider
+	// even though it carries no platform billing. Allow it iff DeriveProvider
+	// resolves a provider for (runtime, model). A genuinely-unroutable id (no
+	// native provider prefix-owns it) still falls through to the 422 below.
+	//
+	// BILLING GUARDRAIL: only CONFIRMED-NON-PLATFORM (BYOK) providers are wired as
+	// name-only arms in providers.yaml (never platform/anthropic-*/openai-*/
+	// moonshot/minimax/google/vertex), so a DeriveProvider-resolved id reached by
+	// THIS path can never bill the platform's key for a customer's model. The
+	// platform-menu ids that DO carry platform billing are already allowed by the
+	// exact-membership loop above; this path only ever resolves to a BYOK arm.
+	if _, derr := m.DeriveProvider(runtime, model, nil); derr == nil {
+		return true, ""
+	}
 	return false, fmt.Sprintf(
 		"model %q is not a registered model for runtime %q; pick one of the runtime's registered models (provider-registry SSOT, internal#718)",
 		model, runtime)
 }
+
+// validateDerivedProviderInRegistry (issue #2172) is the provider-side companion
+// to validateRegisteredModelForRuntime. The model-side check asks "is this
+// (runtime, model) in the registry?"; the provider-side check asks "is the
+// provider this model DERIVES to — the same one the adapter will resolve at
+// boot — a known provider in providers.yaml?"
+//
+// Live trigger (adk-demo Assistant, 2026-06-03): workspace config
+// `model=moonshot/kimi-k2.6` (claude-code) → adapter derives `provider=moonshot`
+// → `ValueError: provider=moonshot not in providers registry` at BOOT. The
+// save was accepted (no validation at the API boundary), and the failure only
+// surfaced when the agent tried to register. CI never saw it. The drift gate
+// (RFC#580) validates TEMPLATES against the registry, NOT per-workspace
+// configs; the existing model-side check rejects a model the runtime doesn't
+// own but says nothing about the DERIVED provider's registry membership.
+//
+// Returns:
+//
+//	(true,  "")     — pass: model is empty (MODEL_REQUIRED owns it), the
+//	                  runtime is not in the registry (fail-open for
+//	                  federated / non-first-party runtimes — mirror of the
+//	                  model-side check's federation contract), the registry
+//	                  failed to load (build-time gate owns it), OR the
+//	                  derived provider name is a known provider in the
+//	                  registry's `providers:` list.
+//	(false, reason) — reject: a known (runtime, model) pair derives to a
+//	                  provider name absent from the providers list. This is
+//	                  the structural class the adk-demo boot failure belongs
+//	                  to — the registry's `runtimes:` block references a
+//	                  provider not declared in `providers:`, which by
+//	                  construction is a registry-data bug. Catching it at
+//	                  config-SAVE keeps it out of the agent-boot path.
+//
+// Defense-in-depth: by construction, a model in a runtime's native provider set
+// has a provider that IS in the catalog (the runtime ref names a provider from
+// the providers list). So the rejection path is primarily a registry-consistency
+// guard. The real value is the FAIL-LOUD semantics — any future drift between
+// `providers:` and `runtimes:` fails the create call with a clear pointer to
+// the missing provider, instead of silently wedging the agent at boot.
+func validateDerivedProviderInRegistry(runtime, model string) (bool, string) {
+	model = strings.TrimSpace(model)
+	if model == "" {
+		return true, "" // MODEL_REQUIRED owns this.
+	}
+	m, err := providerRegistry()
+	if err != nil || m == nil {
+		// Registry unavailable (build-time defect the gates catch). Fail open —
+		// do not block create on a registry-load failure.
+		return true, ""
+	}
+	// DeriveProvider is fail-closed for unknown runtimes. Mirror the
+	// model-side check's federation contract: a runtime the registry does
+	// NOT know (langgraph / external / kimi / mock / federated) is allowed
+	// to pass through. DeriveProvider's `unknown runtime` error IS that
+	// signal — treat it as fail-open, identical to ModelsForRuntime's
+	// not-found behavior above.
+	p, err := m.DeriveProvider(runtime, model, nil)
+	if err != nil {
+		// Either the runtime is unknown (fail-open by contract) OR the model
+		// is not native to the runtime (the model-side validator already
+		// rejected this — DeriveProvider's error here means
+		// validateRegisteredModelForRuntime should have caught it. Don't
+		// double-reject: pass through and let the model-side response own
+		// the message).
+		return true, ""
+	}
+	// Defense-in-depth: confirm the DERIVED provider is a known entry in the
+	// providers list. By construction it should be (DeriveProvider only
+	// returns a Provider that was looked up by name from `providers:`), but
+	// a future federation merge could introduce a runtime ref pointing at a
+	// contributed provider absent from the core catalog. Reject loudly here
+	// rather than letting the save reach the agent-boot path and wedge with
+	// "provider=X not in providers registry" (the original adk-demo class).
+	for _, candidate := range m.Providers {
+		if candidate.Name == p.Name {
+			return true, ""
+		}
+	}
+	// Build a sorted, comma-separated list of valid provider names so the
+	// operator/caller sees the actionable list (the boot-time error message
+	// the adk-demo class produced does NOT include this — the fix is to
+	// surface it at the API boundary, where the caller can fix the request
+	// without a stuck workspace + operator page).
+	valid := make([]string, 0, len(m.Providers))
+	for _, c := range m.Providers {
+		valid = append(valid, c.Name)
+	}
+	sort.Strings(valid)
+	return false, fmt.Sprintf(
+		"derived provider %q (for model %q on runtime %q) is not in the providers registry; pick a model whose derived provider is one of: %s",
+		p.Name, model, runtime, strings.Join(valid, ", "))
+}
@@ -6,8 +6,17 @@ package handlers
 // fail OPEN (allow) for a runtime the registry doesn't know yet (federation /
 // langgraph/etc. not in the first-party registry) so the existing knownRuntimes
 // gate stays authoritative there.
+//
+// TestValidateDerivedProviderInRegistry (issue #2172) is the provider-side
+// companion: once the model-side check passes, confirm the DERIVED provider
+// (the one the adapter will resolve at boot) is a known provider in
+// providers.yaml. Catches the adk-demo "provider=X not in providers registry"
+// class at config-SAVE time instead of letting it wedge the agent at boot.

-import "testing"
+import (
+	"strings"
+	"testing"
+)

 func TestValidateRegisteredModelForRuntime(t *testing.T) {
 	type tc struct {
@@ -70,6 +79,50 @@ func TestValidateRegisteredModelForRuntime(t *testing.T) {
 			model:   "",
 			wantOK:  true,
 		},
+		// ---- cp#529 routability-aware allow path -------------------------------
+		{
+			// BYOK passthrough id: NOT on hermes's platform menu, but the
+			// openrouter name-only native arm prefix-owns it → DeriveProvider
+			// resolves → ALLOWED (no platform billing — openrouter is BYOK).
+			name:    "byok_passthrough_routable_now_allowed",
+			runtime: "hermes",
+			model:   "openrouter/anthropic/claude-3.5-sonnet",
+			wantOK:  true,
+		},
+		{
+			// BYOK namespaced vendor id: deepseek's widened ^deepseek[-:/]
+			// matches the vendor/ form on a name-only hermes arm → allowed.
+			name:    "byok_namespaced_vendor_routable_now_allowed",
+			runtime: "hermes",
+			model:   "deepseek/deepseek-chat",
+			wantOK:  true,
+		},
+		{
+			// claude-code bare GLM- BYOK id: zai name-only arm + (?i)^(glm-|…)
+			// matches → DeriveProvider resolves → allowed.
+			name:    "claude_code_bare_glm_byok_routable_now_allowed",
+			runtime: "claude-code",
+			model:   "GLM-4.6",
+			wantOK:  true,
+		},
+		{
+			// Genuinely UNROUTABLE id: no native hermes arm prefix-owns bare
+			// gpt-4o (the platform-shared openai vendor is NOT wired into hermes
+			// — billing guardrail), so DeriveProvider errors → still 422.
+			name:    "genuinely_unroutable_still_rejected",
+			runtime: "hermes",
+			model:   "gpt-4o",
+			wantOK:  false,
+		},
+		{
+			// A namespaced vendor id NOW routable on hermes via the dedicated
+			// byok-openai provider (cp#529 BYOK-vendor arms): routes with the
+			// tenant's OPENAI_API_KEY → BYOK billing, never the platform key.
+			name:    "byok_openai_namespaced_routable_now_allowed",
+			runtime: "hermes",
+			model:   "openai/gpt-4o",
+			wantOK:  true,
+		},
 	}
 	for _, c := range cases {
 		t.Run(c.name, func(t *testing.T) {
@@ -80,3 +133,163 @@ func TestValidateRegisteredModelForRuntime(t *testing.T) {
 		})
 	}
 }
+
+func TestValidateDerivedProviderInRegistry(t *testing.T) {
+	type tc struct {
+		name    string
+		runtime string
+		model   string
+		wantOK  bool
+		// wantReasonContains: a substring the rejection reason must include
+		// (skipped for OK cases). Pins the actionable list / derivation pointer
+		// so the caller knows which provider was missing and what the valid
+		// set looks like — this is the fix that distinguishes the new gate
+		// from the boot-time "provider=X not in providers registry" string
+		// it replaces.
+		wantReasonContains string
+	}
+	cases := []tc{
+		// PASS — every native (runtime, model) in the catalog derives to a
+		// provider that IS in the providers list. These are the live corpus
+		// entries; the test pins the registry-consistency invariant.
+		{
+			name:    "claude_code_anthropic_api_native",
+			runtime: "claude-code",
+			model:   "claude-sonnet-4-6",
+			wantOK:  true,
+		},
+		{
+			name:    "claude_code_kimi_coding_native",
+			runtime: "claude-code",
+			model:   "kimi-for-coding",
+			wantOK:  true,
+		},
+		{
+			name:    "claude_code_minimax_native",
+			runtime: "claude-code",
+			model:   "MiniMax-M2.7",
+			wantOK:  true,
+		},
+		{
+			name:    "claude_code_platform_namespaced",
+			runtime: "claude-code",
+			model:   "moonshot/kimi-k2.6",
+			wantOK:  true,
+		},
+		{
+			name:    "codex_openai_subscription_default_arm",
+			runtime: "codex",
+			model:   "gpt-5.5",
+			wantOK:  true,
+		},
+		{
+			name:    "codex_platform_namespaced",
+			runtime: "codex",
+			model:   "openai/gpt-5.4-mini",
+			wantOK:  true,
+		},
+		{
+			name:    "hermes_kimi_coding",
+			runtime: "hermes",
+			model:   "kimi-coding/kimi-k2",
+			wantOK:  true,
+		},
+		{
+			name:    "hermes_platform_namespaced",
+			runtime: "hermes",
+			model:   "moonshot/kimi-k2.6",
+			wantOK:  true,
+		},
+		{
+			name:    "openclaw_kimi_coding",
+			runtime: "openclaw",
+			model:   "moonshot:kimi-k2.6",
+			wantOK:  true,
+		},
+		// FAIL — model-side validator catches this, but the provider-side
+		// gate is called AFTER it in Create and inherits the fail-open
+		// contract for "model is not native to runtime" (DeriveProvider
+		// errors → allow, letting the model-side response own the message).
+		// This is the deliberate "don't double-reject" decision.
+		{
+			name:    "unregistered_model_pass_through_to_model_side",
+			runtime: "claude-code",
+			model:   "totally-made-up-model-xyz",
+			wantOK:  true, // pass-through: model-side validator owns the rejection
+		},
+		// Federation contract — mirror of the model-side test above.
+		{
+			name:    "langgraph_runtime_failopen",
+			runtime: "langgraph",
+			model:   "anything-goes",
+			wantOK:  true,
+		},
+		{
+			name:    "external_runtime_failopen",
+			runtime: "external",
+			model:   "whatever",
+			wantOK:  true,
+		},
+		// Empty model — MODEL_REQUIRED owns it; allow.
+		{
+			name:    "empty_model_allowed_other_gate_owns_it",
+			runtime: "claude-code",
+			model:   "",
+			wantOK:  true,
+		},
+	}
+	for _, c := range cases {
+		t.Run(c.name, func(t *testing.T) {
+			ok, why := validateDerivedProviderInRegistry(c.runtime, c.model)
+			if ok != c.wantOK {
+				t.Errorf("validateDerivedProviderInRegistry(%q,%q) ok=%v want %v (reason=%q)",
+					c.runtime, c.model, ok, c.wantOK, why)
+			}
+			if !c.wantOK && c.wantReasonContains != "" && !strings.Contains(why, c.wantReasonContains) {
+				t.Errorf("rejection reason missing %q: got %q", c.wantReasonContains, why)
+			}
+		})
+	}
+}
+
+// TestRegistryConsistency_AllNativeModelsDeriveToKnownProvider walks every
+// (runtime, model) pair in the registry's native model sets and asserts each
+// one derives to a provider that IS in the providers list. This is the
+// static regression gate the issue calls for ("a CI test fails if any shipped
+// demo/template config references an unregistered provider") — generalized
+// to the catalog as a whole: if anyone edits providers.yaml such that a
+// `runtimes:` block names a provider absent from `providers:`, this test
+// fires before the bad config can reach a customer workspace.
+//
+// By construction this invariant should always hold (DeriveProvider only
+// returns a Provider that was looked up by name from `providers:`), so the
+// test primarily guards against future federation merges that introduce a
+// runtime ref pointing at a contributed provider absent from the core
+// catalog — exactly the failure shape the adk-demo Assistant wedge
+// belongs to.
+func TestRegistryConsistency_AllNativeModelsDeriveToKnownProvider(t *testing.T) {
+	m, err := providerRegistry()
+	if err != nil || m == nil {
+		t.Skipf("providerRegistry unavailable in test env (err=%v); skipping consistency walk", err)
+	}
+	providerNames := make(map[string]struct{}, len(m.Providers))
+	for _, p := range m.Providers {
+		providerNames[p.Name] = struct{}{}
+	}
+	for runtimeName, runtime := range m.Runtimes {
+		for _, ref := range runtime.Providers {
+			for _, modelID := range ref.Models {
+				p, err := m.DeriveProvider(runtimeName, modelID, nil)
+				if err != nil {
+					t.Errorf("catalog invariant broken: runtime=%q model=%q failed DeriveProvider: %v",
+						runtimeName, modelID, err)
+					continue
+				}
+				if _, ok := providerNames[p.Name]; !ok {
+					t.Errorf("catalog invariant broken: runtime=%q model=%q derives to provider %q which is not in the providers list (refs=%q)",
+						runtimeName, modelID, p.Name, ref.Name)
+				}
+			}
+		}
+	}
+}
@@ -43,7 +43,6 @@ package handlers
 import (
 	"context"
 	"database/sql"
-	"os"
 	"strings"
 	"testing"
 	"time"
@@ -63,10 +62,7 @@ import (
 // but kept separate so each table's wipe step is local to its tests.
 func integrationDB_PendingUploads(t *testing.T) *sql.DB {
 	t.Helper()
-	url := os.Getenv("INTEGRATION_DB_URL")
-	if url == "" {
-		t.Skip("INTEGRATION_DB_URL not set; skipping (local devs: see file header)")
-	}
+	url := requireIntegrationDBURL(t)
 	conn, err := sql.Open("postgres", url)
 	if err != nil {
 		t.Fatalf("open: %v", err)
@@ -0,0 +1,21 @@
+package handlers
+
+import (
+	"testing"
+
+	"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/providers"
+)
+
+// Proper-SSOT (task #65): required_env is DERIVED from the resolved provider's
+// serving classification (IsPlatform), not hand-authored — platform injects
+// creds server-side (none required), BYOK requires its auth_env.
+func TestRequiredEnvForRegistryProvider(t *testing.T) {
+	if got := requiredEnvForRegistryProvider(providers.Provider{Name: providers.PlatformProviderName}); got != nil {
+		t.Errorf("platform provider requiredEnv = %v; want nil (creds injected server-side)", got)
+	}
+	byok := providers.Provider{Name: "google", AuthEnv: []string{"GEMINI_API_KEY", "GOOGLE_API_KEY"}}
+	got := requiredEnvForRegistryProvider(byok)
+	if len(got) != 2 || got[0] != "GEMINI_API_KEY" {
+		t.Errorf("byok requiredEnv = %v; want its auth_env", got)
+	}
+}
@@ -176,6 +176,10 @@ func TestResolveAgentURLForRestartSignal_CacheMiss(t *testing.T) {
 // TestGracefulPreRestart_Success verifies that when the workspace returns 200,
 // the signal is logged as acknowledged without error.
 func TestGracefulPreRestart_Success(t *testing.T) {
+	hWrapper := &resolveURLTestWrapper{
+		WorkspaceHandler: newHandlerWithTestDeps(t),
+		testURL:          "http://fake-agent.example/agent",
+	}
 	_ = setupTestDB(t)

 	// httptest server simulating the workspace container's /signals/restart_pending
@@ -205,18 +209,15 @@ func TestGracefulPreRestart_Success(t *testing.T) {
 		})
 	}))
 	defer srv.Close()
+	hWrapper.testURL = srv.URL + "/agent"

 	// Pre-populate Redis cache with the test server URL
 	_ = setupTestRedisWithURL(t, srv.URL)

-	// Use a wrapper so gracefulPreRestart runs through the embedded handler.
-	hWrapper := &resolveURLTestWrapper{
-		WorkspaceHandler: newHandlerWithTestDeps(t),
-		testURL:          srv.URL + "/agent",
-	}
+	// gracefulPreRestart runs in a goroutine; wait for it before db.DB is restored.
+	// Must be registered AFTER setupTestDB (LIFO: async wait → db.DB restore).
+	waitForHandlerAsyncBeforeDBCleanup(t, hWrapper.WorkspaceHandler)

-	// gracefulPreRestart runs in a goroutine with its own timeout.
-	// We give it time to complete before the test ends.
 	hWrapper.gracefulPreRestart(context.Background(), "ws-ack-789")
 	time.Sleep(200 * time.Millisecond)
 }
@@ -224,19 +225,22 @@ func TestGracefulPreRestart_Success(t *testing.T) {
 // TestGracefulPreRestart_NotImplemented verifies that when the workspace returns
 // 404 (old SDK version), the platform proceeds gracefully (log + no error).
 func TestGracefulPreRestart_NotImplemented(t *testing.T) {
+	hWrapper := &resolveURLTestWrapper{
+		WorkspaceHandler: newHandlerWithTestDeps(t),
+		testURL:          "http://fake-agent.example/agent",
+	}
 	_ = setupTestDB(t)

 	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
 		w.WriteHeader(http.StatusNotFound)
 	}))
 	defer srv.Close()
+	hWrapper.testURL = srv.URL + "/agent"

 	_ = setupTestRedisWithURL(t, srv.URL)

-	hWrapper := &resolveURLTestWrapper{
-		WorkspaceHandler: newHandlerWithTestDeps(t),
-		testURL:          srv.URL + "/agent",
-	}
+	// Must be registered AFTER setupTestDB so LIFO order is: async wait → db.DB restore.
+	waitForHandlerAsyncBeforeDBCleanup(t, hWrapper.WorkspaceHandler)

 	hWrapper.gracefulPreRestart(context.Background(), "ws-noimpl-999")
 	time.Sleep(200 * time.Millisecond)
@@ -246,15 +250,18 @@ func TestGracefulPreRestart_NotImplemented(t *testing.T) {
 // TestGracefulPreRestart_ConnectionRefused verifies that when the workspace
 // is unreachable, the platform proceeds gracefully without error.
 func TestGracefulPreRestart_ConnectionRefused(t *testing.T) {
-	_ = setupTestDB(t)
-
-	mr := setupTestRedisWithURL(t, "http://localhost:19999/agent") // nothing listening on 19999
-	_ = mr
-
 	hWrapper := &resolveURLTestWrapper{
 		WorkspaceHandler: newHandlerWithTestDeps(t),
 		testURL:          "http://localhost:19999/agent",
 	}
+	_ = setupTestDB(t)
+
+	// Nothing listening on 19999 — deliberate connection failure.
+	mr := setupTestRedisWithURL(t, "http://localhost:19999/agent")
+	_ = mr
+
+	// Must be registered AFTER setupTestDB so LIFO order is: async wait → db.DB restore.
+	waitForHandlerAsyncBeforeDBCleanup(t, hWrapper.WorkspaceHandler)

 	hWrapper.gracefulPreRestart(context.Background(), "ws-unreachable-000")
 	time.Sleep(200 * time.Millisecond)
@@ -264,13 +271,17 @@ func TestGracefulPreRestart_ConnectionRefused(t *testing.T) {
 // TestGracefulPreRestart_URLResolutionError verifies that when URL resolution
 // fails, the platform proceeds gracefully without blocking the restart.
 func TestGracefulPreRestart_URLResolutionError(t *testing.T) {
-	_ = setupTestDB(t)
-	_ = setupTestRedis(t) // empty → URL resolution will fail in resolveAgentURLForRestartSignal
-
 	hWrapper := &resolveURLTestWrapper{
 		WorkspaceHandler: newHandlerWithTestDeps(t),
 		errToReturn:      context.DeadlineExceeded,
 	}
+	_ = setupTestDB(t)
+	_ = setupTestRedis(t) // empty → URL resolution will fail in resolveAgentURLForRestartSignal
+
+	// Must be registered AFTER setupTestDB so LIFO order is: async wait → db.DB restore.
+	// This ensures goroutines (which access both DB and Redis) are drained before
+	// any cleanup fires. setupTestRedis comes after newHandlerWithTestDeps
+	// so the handler holds the correct Redis client reference.
 	waitForHandlerAsyncBeforeDBCleanup(t, hWrapper.WorkspaceHandler)

 	hWrapper.gracefulPreRestart(context.Background(), "ws-url-err-111")
@@ -710,6 +710,44 @@ func (h *SecretsHandler) SetModel(c *gin.Context) {
 		return
 	}

+	// issue #2172: validate the model against the registry before persisting.
+	// Empty model clears the override — skip validation (MODEL_REQUIRED owns
+	// the empty case at create time; clearing is always allowed).
+	if body.Model != "" {
+		var runtime string
+		if err := db.DB.QueryRowContext(ctx,
+			`SELECT runtime FROM workspaces WHERE id = $1`, workspaceID,
+		).Scan(&runtime); err != nil {
+			if err == sql.ErrNoRows {
+				c.JSON(http.StatusNotFound, gin.H{"error": "workspace not found"})
+				return
+			}
+			log.Printf("SetModel: runtime lookup failed for %s: %v", workspaceID, err)
+			c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to read workspace runtime"})
+			return
+		}
+		if ok, why := validateRegisteredModelForRuntime(runtime, body.Model); !ok {
+			log.Printf("SetModel: 422 UNREGISTERED_MODEL_FOR_RUNTIME (runtime=%q model=%q): %s", runtime, body.Model, why)
+			c.JSON(http.StatusUnprocessableEntity, gin.H{
+				"error":   why,
+				"runtime": runtime,
+				"model":   body.Model,
+				"code":    "UNREGISTERED_MODEL_FOR_RUNTIME",
+			})
+			return
+		}
+		if ok, why := validateDerivedProviderInRegistry(runtime, body.Model); !ok {
+			log.Printf("SetModel: 422 DERIVED_PROVIDER_NOT_IN_REGISTRY (runtime=%q model=%q): %s", runtime, body.Model, why)
+			c.JSON(http.StatusUnprocessableEntity, gin.H{
+				"error":   why,
+				"runtime": runtime,
+				"model":   body.Model,
+				"code":    "DERIVED_PROVIDER_NOT_IN_REGISTRY",
+			})
+			return
+		}
+	}
+
 	if err := setModelSecret(ctx, workspaceID, body.Model); err != nil {
 		log.Printf("SetModel error: %v", err)
 		if body.Model == "" {
@@ -546,6 +546,11 @@ func TestSecretsSetModel_Upsert(t *testing.T) {
 	restartCalled := make(chan string, 1)
 	handler := NewSecretsHandler(func(id string) { restartCalled <- id })

+	// Runtime lookup (issue #2172) — model is non-empty so validation fires.
+	mock.ExpectQuery(`SELECT runtime FROM workspaces WHERE id = \$1`).
+		WithArgs("00000000-0000-0000-0000-000000000001").
+		WillReturnRows(sqlmock.NewRows([]string{"runtime"}).AddRow("claude-code"))
+
 	// Pin the literal 'MODEL' key in the SQL so a regression to the
 	// pre-2026-05-19 'MODEL_PROVIDER' column name shows up here.
 	mock.ExpectExec(`INSERT INTO workspace_secrets[\s\S]*'MODEL'`).
@@ -623,6 +628,99 @@ func TestSecretsSetModel_InvalidID(t *testing.T) {
 	}
 }

+// TestSecretsSetModel_UnregisteredModel_422 guards that a model not in the
+// runtime's native set is rejected at save (issue #2172 continuation).
+func TestSecretsSetModel_UnregisteredModel_422(t *testing.T) {
+	mock := setupTestDB(t)
+	setupTestRedis(t)
+	handler := NewSecretsHandler(nil)
+
+	mock.ExpectQuery(`SELECT runtime FROM workspaces WHERE id = \$1`).
+		WithArgs("00000000-0000-0000-0000-000000000003").
+		WillReturnRows(sqlmock.NewRows([]string{"runtime"}).AddRow("claude-code"))
+
+	w := httptest.NewRecorder()
+	c, _ := gin.CreateTestContext(w)
+	c.Params = gin.Params{{Key: "id", Value: "00000000-0000-0000-0000-000000000003"}}
+	c.Request = httptest.NewRequest("PUT", "/workspaces/00000000-0000-0000-0000-000000000003/model",
+		strings.NewReader(`{"model":"totally-made-up-model-xyz"}`))
+	c.Request.Header.Set("Content-Type", "application/json")
+
+	handler.SetModel(c)
+
+	if w.Code != http.StatusUnprocessableEntity {
+		t.Fatalf("expected 422, got %d: %s", w.Code, w.Body.String())
+	}
+	body := w.Body.String()
+	if !strings.Contains(body, "UNREGISTERED_MODEL_FOR_RUNTIME") {
+		t.Errorf("expected code UNREGISTERED_MODEL_FOR_RUNTIME in body, got: %s", body)
+	}
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Errorf("unmet sqlmock expectations: %v", err)
+	}
+}
+
+// TestSecretsSetModel_UnknownRuntimeFailOpen_200 verifies the federation
+// contract: a runtime absent from the registry (langgraph) passes through
+// without validation so non-first-party runtimes are not blocked.
+func TestSecretsSetModel_UnknownRuntimeFailOpen_200(t *testing.T) {
+	mock := setupTestDB(t)
+	setupTestRedis(t)
+	handler := NewSecretsHandler(nil)
+
+	mock.ExpectQuery(`SELECT runtime FROM workspaces WHERE id = \$1`).
+		WithArgs("00000000-0000-0000-0000-000000000004").
+		WillReturnRows(sqlmock.NewRows([]string{"runtime"}).AddRow("langgraph"))
+
+	mock.ExpectExec(`INSERT INTO workspace_secrets[\s\S]*'MODEL'`).
+		WithArgs("00000000-0000-0000-0000-000000000004", sqlmock.AnyArg(), sqlmock.AnyArg()).
+		WillReturnResult(sqlmock.NewResult(1, 1))
+
+	w := httptest.NewRecorder()
+	c, _ := gin.CreateTestContext(w)
+	c.Params = gin.Params{{Key: "id", Value: "00000000-0000-0000-0000-000000000004"}}
+	c.Request = httptest.NewRequest("PUT", "/workspaces/00000000-0000-0000-0000-000000000004/model",
+		strings.NewReader(`{"model":"any-arbitrary-model"}`))
+	c.Request.Header.Set("Content-Type", "application/json")
+
+	handler.SetModel(c)
+
+	if w.Code != http.StatusOK {
+		t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String())
+	}
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Errorf("unmet sqlmock expectations: %v", err)
+	}
+}
+
+// TestSecretsSetModel_WorkspaceNotFound_404 verifies 404 when the runtime
+// lookup finds no workspace row.
+func TestSecretsSetModel_WorkspaceNotFound_404(t *testing.T) {
+	mock := setupTestDB(t)
+	setupTestRedis(t)
+	handler := NewSecretsHandler(nil)
+
+	mock.ExpectQuery(`SELECT runtime FROM workspaces WHERE id = \$1`).
+		WithArgs("00000000-0000-0000-0000-000000000005").
+		WillReturnError(sql.ErrNoRows)
+
+	w := httptest.NewRecorder()
+	c, _ := gin.CreateTestContext(w)
+	c.Params = gin.Params{{Key: "id", Value: "00000000-0000-0000-0000-000000000005"}}
+	c.Request = httptest.NewRequest("PUT", "/workspaces/00000000-0000-0000-0000-000000000005/model",
+		strings.NewReader(`{"model":"claude-sonnet-4-6"}`))
+	c.Request.Header.Set("Content-Type", "application/json")
+
+	handler.SetModel(c)
+
+	if w.Code != http.StatusNotFound {
+		t.Fatalf("expected 404, got %d: %s", w.Code, w.Body.String())
+	}
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Errorf("unmet sqlmock expectations: %v", err)
+	}
+}
+
 // TestSecretsModel_RoundTrip_KeyIsMODELNotMODEL_PROVIDER pins the
 // 2026-05-19 rename: writes via SetModel land under workspace_secrets
 // key='MODEL', and reads via GetModel hit the same key. A regression
@@ -636,6 +734,10 @@ func TestSecretsModel_RoundTrip_KeyIsMODELNotMODEL_PROVIDER(t *testing.T) {
 	handler := NewSecretsHandler(func(string) {})

 	// 1. SetModel — must hit key='MODEL' in the INSERT.
+	// Runtime lookup (issue #2172) — model is non-empty so validation fires.
+	mock.ExpectQuery(`SELECT runtime FROM workspaces WHERE id = \$1`).
+		WithArgs("00000000-0000-0000-0000-000000000099").
+		WillReturnRows(sqlmock.NewRows([]string{"runtime"}).AddRow("codex"))
 	mock.ExpectExec(`INSERT INTO workspace_secrets[\s\S]*'MODEL'[\s\S]*ON CONFLICT`).
 		WithArgs("00000000-0000-0000-0000-000000000099", sqlmock.AnyArg(), sqlmock.AnyArg()).
 		WillReturnResult(sqlmock.NewResult(1, 1))
@@ -44,6 +44,20 @@ func billingModeForRegistryProvider(p providers.Provider) string {
 	return LLMBillingModeBYOK
 }

+// requiredEnvForRegistryProvider derives the env vars the USER must supply for
+// a model owned by the resolved provider — the proper-SSOT single fact behind
+// the canvas "Missing API Keys" preflight (task #65). The closed platform
+// provider injects credentials server-side (the metered proxy) -> nothing
+// required; BYOK providers require their auth_env. Derived from IsPlatform +
+// AuthEnv so a template can no longer hand-author a required_env that drifts
+// from the registry's serving classification.
+func requiredEnvForRegistryProvider(p providers.Provider) []string {
+	if p.IsPlatform() {
+		return nil
+	}
+	return p.AuthEnv
+}
+
 // enrichFromRegistry populates the registry-served fields on a templateSummary
 // when its runtime is known to the provider registry. It is a no-op (leaves
 // RegistryBacked=false and the registry slices nil) for a runtime the registry
@@ -98,6 +112,7 @@ func enrichFromRegistry(summary *templateSummary, runtime string) {
 		if derived, derr := m.DeriveProvider(runtime, id, nil); derr == nil {
 			ms.Provider = derived.Name
 			ms.BillingMode = billingModeForRegistryProvider(derived)
+			ms.RequiredEnv = requiredEnvForRegistryProvider(derived)
 		}
 		// If DeriveProvider errors (ambiguous/overlap — a manifest defect the
 		// loader's tests pin against), still serve the id without a provider
@@ -474,6 +474,32 @@ func (h *WorkspaceHandler) Create(c *gin.Context) {
 			})
 			return
 		}
+		// issue #2172 (provider-side companion): once the (runtime, model)
+		// pair is known to be registered, confirm the DERIVED provider
+		// (the one the adapter will resolve at boot) is a known provider
+		// in the providers.yaml catalog. Live trigger (adk-demo Assistant,
+		// 2026-06-03): the model-side check passed for a registry-resident
+		// model whose derived provider name was NOT in the providers list,
+		// so the save was accepted and the agent wedged at boot with
+		// "provider=X not in providers registry". This check is a
+		// defense-in-depth registry-consistency guard: by construction a
+		// model in a runtime's native set derives to a provider that IS in
+		// the catalog, so the rejection path is primarily a registry-data
+		// invariant — any future drift between `providers:` and `runtimes:`
+		// fails the create with a clear pointer to the missing provider
+		// rather than silently wedging the agent. Fails open for runtimes
+		// the registry doesn't know (langgraph/external/kimi/mock/federated
+		// — the federation contract the model-side check also honors).
+		if ok, why := validateDerivedProviderInRegistry(payload.Runtime, payload.Model); !ok {
+			log.Printf("Create: 422 DERIVED_PROVIDER_NOT_IN_REGISTRY (runtime=%q model=%q): %s [issue #2172 hard-reject]", payload.Runtime, payload.Model, why)
+			c.JSON(http.StatusUnprocessableEntity, gin.H{
+				"error":   why,
+				"runtime": payload.Runtime,
+				"model":   payload.Model,
+				"code":    "DERIVED_PROVIDER_NOT_IN_REGISTRY",
+			})
+			return
+		}
 	}

 	ctx := c.Request.Context()
@@ -56,7 +56,20 @@ func PatchAbilities(c *gin.Context) {
 		return
 	}

-	if body.BroadcastEnabled != nil {
+	// Atomic update: when both fields are supplied, apply them in one SQL
+	// statement so the request is all-or-nothing (#2131). A partial mutation
+	// (e.g. broadcast_enabled updated but talk_to_user_enabled failing) would
+	// leave the workspace in an ambiguous capability state.
+	if body.BroadcastEnabled != nil && body.TalkToUserEnabled != nil {
+		if _, err := db.DB.ExecContext(ctx,
+			`UPDATE workspaces SET broadcast_enabled = $2, talk_to_user_enabled = $3, updated_at = now() WHERE id = $1`,
+			id, *body.BroadcastEnabled, *body.TalkToUserEnabled,
+		); err != nil {
+			log.Printf("PatchAbilities both-fields for %s: %v", id, err)
+			c.JSON(http.StatusInternalServerError, gin.H{"error": "update failed"})
+			return
+		}
+	} else if body.BroadcastEnabled != nil {
 		if _, err := db.DB.ExecContext(ctx,
 			`UPDATE workspaces SET broadcast_enabled = $2, updated_at = now() WHERE id = $1`,
 			id, *body.BroadcastEnabled,
@@ -65,9 +78,7 @@ func PatchAbilities(c *gin.Context) {
 			c.JSON(http.StatusInternalServerError, gin.H{"error": "update failed"})
 			return
 		}
-	}
-
-	if body.TalkToUserEnabled != nil {
+	} else if body.TalkToUserEnabled != nil {
 		if _, err := db.DB.ExecContext(ctx,
 			`UPDATE workspaces SET talk_to_user_enabled = $2, updated_at = now() WHERE id = $1`,
 			id, *body.TalkToUserEnabled,
@@ -130,11 +130,8 @@ func TestPatchAbilities_BothFields(t *testing.T) {
 	mock.ExpectQuery(`SELECT EXISTS\(SELECT 1 FROM workspaces WHERE id = \$1 AND status != 'removed'\)`).
 		WithArgs(wsUUID1).
 		WillReturnRows(sqlmock.NewRows([]string{"exists"}).AddRow(true))
-	mock.ExpectExec(`UPDATE workspaces SET broadcast_enabled = \$2, updated_at = now\(\) WHERE id = \$1`).
-		WithArgs(wsUUID1, true).
-		WillReturnResult(sqlmock.NewResult(0, 1))
-	mock.ExpectExec(`UPDATE workspaces SET talk_to_user_enabled = \$2, updated_at = now\(\) WHERE id = \$1`).
-		WithArgs(wsUUID1, true).
+	mock.ExpectExec(`UPDATE workspaces SET broadcast_enabled = \$2, talk_to_user_enabled = \$3, updated_at = now\(\) WHERE id = \$1`).
+		WithArgs(wsUUID1, true, true).
 		WillReturnResult(sqlmock.NewResult(0, 1))

 	w := patchAbilitiesReq(t, wsUUID1, `{"broadcast_enabled":true,"talk_to_user_enabled":true}`)
@@ -182,19 +179,25 @@ func TestPatchAbilities_TalkToUserUpdateError(t *testing.T) {
 	}
 }

-func TestPatchAbilities_BothFields_BroadcastFails(t *testing.T) {
+// TestPatchAbilities_BothFields_UpdateError — regression for #2131. When
+// both fields are supplied the handler uses a single combined UPDATE. A
+// failure of that UPDATE must leave the workspace unchanged (atomic).
+func TestPatchAbilities_BothFields_UpdateError(t *testing.T) {
 	mock, cleanup := withMockDB(t)
 	defer cleanup()

 	mock.ExpectQuery(`SELECT EXISTS\(SELECT 1 FROM workspaces WHERE id = \$1 AND status != 'removed'\)`).
 		WithArgs(wsUUID1).
 		WillReturnRows(sqlmock.NewRows([]string{"exists"}).AddRow(true))
-	mock.ExpectExec(`UPDATE workspaces SET broadcast_enabled = \$2, updated_at = now\(\) WHERE id = \$1`).
-		WithArgs(wsUUID1, true).
+	mock.ExpectExec(`UPDATE workspaces SET broadcast_enabled = \$2, talk_to_user_enabled = \$3, updated_at = now\(\) WHERE id = \$1`).
+		WithArgs(wsUUID1, true, true).
 		WillReturnError(errors.New("disk full"))

 	w := patchAbilitiesReq(t, wsUUID1, `{"broadcast_enabled":true,"talk_to_user_enabled":true}`)
 	if w.Code != http.StatusInternalServerError {
 		t.Fatalf("expected 500, got %d: %s", w.Code, w.Body.String())
 	}
+	// Because only one UPDATE is issued, there is no partial-mutation
+	// path to assert against; sqlmock implicitly verifies no second
+	// exec occurred.
 }
@@ -95,6 +95,14 @@ func TestIntegration_BroadcastOrgRoot_NonRootSenderResolvesToRoot(t *testing.T)
 		}
 	})

+	// Pre-test hygiene: if a prior run crashed or was killed, its rows may
+	// still be in the shared integration DB. Remove them before inserting so
+	// the unique index workspaces_parent_name_uniq does not conflict.
+	if _, err := conn.ExecContext(ctx,
+		`DELETE FROM workspaces WHERE name LIKE $1`, prefix+"%"); err != nil {
+		t.Logf("pre-test cleanup (non-fatal): %v", err)
+	}
+
 	rootID := uuid.New().String()
 	midID := uuid.New().String()
 	leafID := uuid.New().String()
@@ -41,7 +41,6 @@ import (
 	"context"
 	"database/sql"
 	"fmt"
-	"os"
 	"testing"

 	"github.com/google/uuid"
@@ -59,10 +58,7 @@ import (
 // only those.
 func integrationDB_WorkspaceCreateName(t *testing.T) *sql.DB {
 	t.Helper()
-	url := os.Getenv("INTEGRATION_DB_URL")
-	if url == "" {
-		t.Skip("INTEGRATION_DB_URL not set; skipping (see file header)")
-	}
+	url := requireIntegrationDBURL(t)
 	conn, err := sql.Open("postgres", url)
 	if err != nil {
 		t.Fatalf("open: %v", err)
@@ -0,0 +1,331 @@
+package handlers
+
+import (
+	"context"
+	"database/sql"
+	"net/http"
+	"net/http/httptest"
+	"net/url"
+	"strings"
+	"testing"
+	"time"
+
+	"github.com/DATA-DOG/go-sqlmock"
+	"github.com/gin-gonic/gin"
+	"github.com/gorilla/websocket"
+)
+
+// rfbGreeting is the first frame a real websockify/RFB backend writes on
+// connect. The fake backend below sends these exact bytes so the positive
+// test can prove the upstream's first binary frame survives the reverse
+// proxy chain (the "WS 1006" regression surface from core#2247 was the
+// upgrade/handshake silently failing before any RFB byte reached the
+// browser).
+var rfbGreeting = []byte("RFB 003.008\n")
+
+// newFakeWebsockifyBackend stands up an httptest.NewServer that upgrades the
+// websocket, writes the RFB greeting as a binary frame, then echoes every
+// frame it receives back to the client. No EC2, noVNC, or SSH involved — it
+// is the stand-in for the on-instance :6080 websockify listener that
+// realDisplayForward would normally tunnel to.
+func newFakeWebsockifyBackend(t *testing.T) *httptest.Server {
+	t.Helper()
+	upgrader := websocket.Upgrader{
+		// The proxy rewrites Sec-WebSocket-Protocol to "binary"; accept any
+		// origin/subprotocol so the fake backend never rejects the handshake.
+		CheckOrigin:       func(*http.Request) bool { return true },
+		Subprotocols:      []string{"binary"},
+		HandshakeTimeout:  5 * time.Second,
+		EnableCompression: false,
+	}
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		conn, err := upgrader.Upgrade(w, r, nil)
+		if err != nil {
+			return
+		}
+		defer conn.Close()
+		if err := conn.WriteMessage(websocket.BinaryMessage, rfbGreeting); err != nil {
+			return
+		}
+		for {
+			mt, msg, err := conn.ReadMessage()
+			if err != nil {
+				return
+			}
+			if err := conn.WriteMessage(mt, msg); err != nil {
+				return
+			}
+		}
+	}))
+	t.Cleanup(srv.Close)
+	return srv
+}
+
+// wireDisplayForwardToBackend overrides the injectable displayForward package
+// var so DisplaySession proxies to the fake backend instead of opening an EIC
+// SSH tunnel. Restored via t.Cleanup. The returned *url.URL is the http://
+// backend address (the reverse proxy upgrades it to ws:// natively under
+// Go 1.25's ReverseProxy WebSocket support).
+func wireDisplayForwardToBackend(t *testing.T, backendURL string) {
+	t.Helper()
+	target, err := url.Parse(backendURL)
+	if err != nil {
+		t.Fatalf("parse backend URL %q: %v", backendURL, err)
+	}
+	prev := displayForward
+	displayForward = func(_ context.Context, _ string, fn func(target *url.URL) error) error {
+		return fn(target)
+	}
+	t.Cleanup(func() { displayForward = prev })
+}
+
+// newDisplaySessionTestServer mounts DisplaySession on a gin router behind an
+// httptest.NewServer so a real websocket client can dial the route end-to-end.
+// It returns the base ws:// URL for the websockify route.
+func newDisplaySessionTestServer(t *testing.T, handler *WorkspaceHandler) *httptest.Server {
+	t.Helper()
+	r := gin.New()
+	// Mirror the production registration in internal/router/router.go:
+	//   GET /workspaces/:id/display/session/*proxyPath -> wh.DisplaySession
+	r.GET("/workspaces/:id/display/session/*proxyPath", handler.DisplaySession)
+	srv := httptest.NewServer(r)
+	t.Cleanup(srv.Close)
+	return srv
+}
+
+const (
+	displayProxyWorkspaceID  = "ws-display"
+	displayProxyInstanceID   = "i-0fakedeadbeef00001"
+	displayProxyControlledBy = "admin-token"
+)
+
+// expectDisplaySessionTargetRow mocks loadWorkspaceDisplaySessionTarget's
+// workspaces SELECT. mode "desktop-control" + a non-empty instance_id is the
+// "display enabled, tunnel available" shape. (Note: the compute validator
+// accepts modes none/desktop-control/gpu-desktop-control and protocols
+// dcv/novnc — "novnc" is a *protocol*, not a mode, so the enabled rows use
+// mode=desktop-control,protocol=novnc.)
+func expectDisplaySessionTargetRow(mock sqlmock.Sqlmock, computeJSON, instanceID string) {
+	mock.ExpectQuery(`SELECT COALESCE\(compute, '\{\}'::jsonb\), COALESCE\(instance_id, ''\) FROM workspaces WHERE id = \$1`).
+		WithArgs(displayProxyWorkspaceID).
+		WillReturnRows(sqlmock.NewRows([]string{"compute", "instance_id"}).AddRow(computeJSON, instanceID))
+}
+
+// expectActiveDisplayControlRow mocks loadActiveDisplayControl's locks SELECT
+// returning an active lock owned by controlledBy expiring at expiresAt.
+func expectActiveDisplayControlRow(mock sqlmock.Sqlmock, controlledBy string, expiresAt time.Time) {
+	mock.ExpectQuery(`SELECT controller, controlled_by, expires_at FROM workspace_display_control_locks WHERE workspace_id = \$1 AND expires_at > now\(\)`).
+		WithArgs(displayProxyWorkspaceID).
+		WillReturnRows(sqlmock.NewRows([]string{"controller", "controlled_by", "expires_at"}).
+			AddRow("user", controlledBy, expiresAt))
+}
+
+const enabledComputeJSON = `{"display":{"mode":"desktop-control","protocol":"novnc","width":1280,"height":800}}`
+
+// dialDisplaySession dials the websockify route on the given test server with
+// the supplied Sec-WebSocket-Protocol values. It returns the conn (nil on
+// failure), the HTTP response, and the dial error.
+func dialDisplaySession(t *testing.T, srv *httptest.Server, subprotocols []string) (*websocket.Conn, *http.Response, error) {
+	t.Helper()
+	wsURL := "ws" + strings.TrimPrefix(srv.URL, "http") + "/workspaces/" + displayProxyWorkspaceID + "/display/session/websockify"
+	dialer := websocket.Dialer{
+		HandshakeTimeout: 5 * time.Second,
+		Subprotocols:     subprotocols,
+	}
+	return dialer.Dial(wsURL, nil)
+}
+
+// TestDisplaySessionProxy_Positive proves the full take-control WS-proxy path
+// without any network/EC2: a valid signed token + active lock + enabled
+// display upgrades successfully (HTTP 101), the backend's RFB greeting arrives
+// through the proxy, and a client->server byte round-trips back (bidirectional
+// proxy chain). This is the direct regression guard for the "WS 1006" failure
+// class in core#2247.
+func TestDisplaySessionProxy_Positive(t *testing.T) {
+	t.Setenv("DISPLAY_SESSION_SIGNING_SECRET", "test-secret")
+	mock := setupTestDB(t)
+	backend := newFakeWebsockifyBackend(t)
+	wireDisplayForwardToBackend(t, backend.URL)
+
+	handler := NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir())
+	srv := newDisplaySessionTestServer(t, handler)
+
+	expiresAt := time.Now().Add(5 * time.Minute)
+	expectDisplaySessionTargetRow(mock, enabledComputeJSON, displayProxyInstanceID)
+	expectActiveDisplayControlRow(mock, displayProxyControlledBy, expiresAt)
+
+	token := signDisplaySessionToken(displayProxyWorkspaceID, displayProxyControlledBy, expiresAt)
+	if token == "" {
+		t.Fatal("signDisplaySessionToken returned empty token")
+	}
+
+	conn, resp, err := dialDisplaySession(t, srv, []string{"binary", displaySessionTokenProtocolPrefix + token})
+	if err != nil {
+		body := ""
+		if resp != nil {
+			body = resp.Status
+		}
+		t.Fatalf("websocket dial failed: %v (resp=%s)", err, body)
+	}
+	t.Cleanup(func() { conn.Close() })
+	if resp.StatusCode != http.StatusSwitchingProtocols {
+		t.Fatalf("expected 101 Switching Protocols, got %d", resp.StatusCode)
+	}
+
+	// 1. The backend's RFB greeting must arrive through the proxy.
+	conn.SetReadDeadline(time.Now().Add(5 * time.Second))
+	mt, msg, err := conn.ReadMessage()
+	if err != nil {
+		t.Fatalf("read greeting through proxy failed: %v", err)
+	}
+	if mt != websocket.BinaryMessage || string(msg) != string(rfbGreeting) {
+		t.Fatalf("greeting = %q (type %d), want %q binary", msg, mt, rfbGreeting)
+	}
+
+	// 2. A client->server byte must echo back (bidirectional chain).
+	probe := []byte{0x13, 0x37, 0x00, 0xff}
+	if err := conn.WriteMessage(websocket.BinaryMessage, probe); err != nil {
+		t.Fatalf("write probe through proxy failed: %v", err)
+	}
+	conn.SetReadDeadline(time.Now().Add(5 * time.Second))
+	_, echo, err := conn.ReadMessage()
+	if err != nil {
+		t.Fatalf("read echo through proxy failed: %v", err)
+	}
+	if string(echo) != string(probe) {
+		t.Fatalf("echo = %q, want %q", echo, probe)
+	}
+
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Errorf("unmet sqlmock expectations: %v", err)
+	}
+}
+
+// TestDisplaySessionProxy_Rejections is table-driven over the failure surface.
+// Each case asserts the WS upgrade does NOT happen (dial errors / no 101) and
+// the right HTTP status is returned, WITHOUT ever reaching the fake backend.
+func TestDisplaySessionProxy_Rejections(t *testing.T) {
+	t.Setenv("DISPLAY_SESSION_SIGNING_SECRET", "test-secret")
+	pastExpiry := time.Now().Add(-5 * time.Minute)
+	futureExpiry := time.Now().Add(5 * time.Minute)
+
+	cases := []struct {
+		name string
+		// expect wires the sqlmock rows that the handler will actually read
+		// for this case (the locks SELECT is only reached for token cases).
+		expect func(mock sqlmock.Sqlmock)
+		// subprotocols sent on the dial (token header, if any).
+		subprotocols []string
+		// proxyPath overrides the default "/websockify" route segment.
+		proxyPath  string
+		wantStatus int
+	}{
+		{
+			name: "missing token -> 403",
+			expect: func(m sqlmock.Sqlmock) {
+				expectDisplaySessionTargetRow(m, enabledComputeJSON, displayProxyInstanceID)
+				expectActiveDisplayControlRow(m, displayProxyControlledBy, futureExpiry)
+			},
+			subprotocols: []string{"binary"},
+			wantStatus:   http.StatusForbidden,
+		},
+		{
+			name: "tampered token -> 403",
+			expect: func(m sqlmock.Sqlmock) {
+				expectDisplaySessionTargetRow(m, enabledComputeJSON, displayProxyInstanceID)
+				expectActiveDisplayControlRow(m, displayProxyControlledBy, futureExpiry)
+			},
+			subprotocols: []string{"binary", displaySessionTokenProtocolPrefix + "garbage.not-a-valid-mac"},
+			wantStatus:   http.StatusForbidden,
+		},
+		{
+			name: "expired lock -> 403",
+			expect: func(m sqlmock.Sqlmock) {
+				expectDisplaySessionTargetRow(m, enabledComputeJSON, displayProxyInstanceID)
+				// Active-lock query filters expires_at > now(), so an
+				// expired lock returns no rows -> found=false -> 403.
+				m.ExpectQuery(`SELECT controller, controlled_by, expires_at FROM workspace_display_control_locks WHERE workspace_id = \$1 AND expires_at > now\(\)`).
+					WithArgs(displayProxyWorkspaceID).
+					WillReturnError(sql.ErrNoRows)
+			},
+			// Token signed against the past expiry would also fail validation
+			// even if a stale lock row were returned.
+			subprotocols: []string{"binary", displaySessionTokenProtocolPrefix +
+				signDisplaySessionToken(displayProxyWorkspaceID, displayProxyControlledBy, pastExpiry)},
+			wantStatus: http.StatusForbidden,
+		},
+		{
+			name: "display mode none -> 404",
+			expect: func(m sqlmock.Sqlmock) {
+				expectDisplaySessionTargetRow(m, `{"display":{"mode":"none"}}`, displayProxyInstanceID)
+			},
+			subprotocols: []string{"binary"},
+			wantStatus:   http.StatusNotFound,
+		},
+		{
+			name: "empty instance_id -> 503",
+			expect: func(m sqlmock.Sqlmock) {
+				expectDisplaySessionTargetRow(m, enabledComputeJSON, "")
+			},
+			subprotocols: []string{"binary"},
+			wantStatus:   http.StatusServiceUnavailable,
+		},
+		{
+			name: "wrong proxyPath -> 404",
+			expect: func(m sqlmock.Sqlmock) {
+				expectDisplaySessionTargetRow(m, enabledComputeJSON, displayProxyInstanceID)
+			},
+			subprotocols: []string{"binary"},
+			proxyPath:    "/frames",
+			wantStatus:   http.StatusNotFound,
+		},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			mock := setupTestDB(t)
+			// A backend that fatals if it is ever reached — proves these
+			// rejections happen strictly before any proxy dial.
+			reached := false
+			backend := httptest.NewServer(http.HandlerFunc(func(http.ResponseWriter, *http.Request) {
+				reached = true
+			}))
+			t.Cleanup(backend.Close)
+			wireDisplayForwardToBackend(t, backend.URL)
+
+			handler := NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir())
+			srv := newDisplaySessionTestServer(t, handler)
+			tc.expect(mock)
+
+			proxyPath := tc.proxyPath
+			if proxyPath == "" {
+				proxyPath = "/websockify"
+			}
+			wsURL := "ws" + strings.TrimPrefix(srv.URL, "http") +
+				"/workspaces/" + displayProxyWorkspaceID + "/display/session" + proxyPath
+			dialer := websocket.Dialer{HandshakeTimeout: 5 * time.Second, Subprotocols: tc.subprotocols}
+			conn, resp, err := dialer.Dial(wsURL, nil)
+			if conn != nil {
+				conn.Close()
+			}
+			if err == nil {
+				t.Fatalf("expected WS upgrade to fail, but dial succeeded")
+			}
+			if resp == nil {
+				t.Fatalf("expected an HTTP response on rejected upgrade, got nil (err=%v)", err)
+			}
+			if resp.StatusCode != tc.wantStatus {
+				t.Fatalf("status = %d, want %d", resp.StatusCode, tc.wantStatus)
+			}
+			if resp.StatusCode == http.StatusSwitchingProtocols {
+				t.Fatalf("upgrade unexpectedly succeeded (101)")
+			}
+			if reached {
+				t.Fatalf("rejection leaked to the upstream backend")
+			}
+			if err := mock.ExpectationsWereMet(); err != nil {
+				t.Errorf("unmet sqlmock expectations: %v", err)
+			}
+		})
+	}
+}
@@ -613,6 +613,32 @@ func (h *WorkspaceHandler) ensureDefaultConfig(workspaceID string, payload model
 	if model == "" {
 		log.Printf("ensureDefaultConfig: workspace %s reached provisioning with empty model — Create handler should have rejected this; rendering empty model: \"\" in config.yaml (workspace will boot not_configured)", workspaceID)
 	}
+
+	// Derive the provider from the providers manifest and stamp it into the
+	// generated config BEFORE claude-code model normalization strips the
+	// slash-prefix. DeriveProvider needs the FULL, un-normalized model id
+	// (e.g. "moonshot/kimi-k2.6") for the exact-id match that resolves the
+	// canvas claude-code case to provider=platform — normalizing to
+	// "kimi-k2.6" first would lose that match.
+	//
+	// Why this exists (RFC#340 Fix A): a canvas-created claude-code workspace
+	// with model "moonshot/kimi-k2.6" booted NOT_CONFIGURED — the adapter
+	// derived provider="moonshot" (slash-split of the model id) which is not
+	// in the providers registry. CP bakes `provider: platform` via heredoc,
+	// but the cp#329 config-bundle fetch overwrites /configs/config.yaml with
+	// THIS (previously providerless) bundle version, so molecule-runtime
+	// config.py re-derived the wrong provider. Stamping the manifest-derived
+	// provider here (mirroring CP's buildModelProviderYAML shape) makes the
+	// config the adapter reads carry the canonical provider.
+	//
+	// Reuses the SAME manifest path the config-SAVE validators use
+	// (providerRegistry() + Manifest.DeriveProvider; see
+	// model_registry_validation.go). On a derive MISS (unknown/unregistered
+	// model, or registry unavailable) provider is left empty and the field is
+	// omitted below — preserving today's behavior; never fail provisioning on
+	// a derive miss.
+	derivedProvider := deriveDefaultConfigProvider(runtime, model)
+
 	if runtime == "claude-code" {
 		model = normalizeClaudeCodeModel(model)
 	}
@@ -640,6 +666,14 @@ func (h *WorkspaceHandler) ensureDefaultConfig(workspaceID string, payload model
 	// Model always at top level — config.py reads raw["model"] for all runtimes.
 	configYAML += fmt.Sprintf("model: %s\n", quoteModel)

+	// Stamp the manifest-derived provider at top level (mirroring CP's
+	// buildModelProviderYAML). Omitted entirely on a derive miss so the prior
+	// behavior — no `provider:` key, runtime re-derives — is preserved for
+	// unregistered models (requirement #3).
+	if derivedProvider != "" {
+		configYAML += fmt.Sprintf("provider: '%s'\n", yamlEscapeSingleQuotedProvider(derivedProvider))
+	}
+
 	// Add runtime_config. required_env is intentionally omitted — the
 	// platform injects secrets at container-start time via the secrets API,
 	// and preflight already validates that the env vars are present before
@@ -649,6 +683,10 @@ func (h *WorkspaceHandler) ensureDefaultConfig(workspaceID string, payload model
 	if runtime == "claude-code" {
 		configYAML += fmt.Sprintf("  model: %s\n", quoteModel)
 	}
+	// Mirror the top-level provider under runtime_config (CP writes both).
+	if derivedProvider != "" {
+		configYAML += fmt.Sprintf("  provider: '%s'\n", yamlEscapeSingleQuotedProvider(derivedProvider))
+	}
 	configYAML += "  timeout: 0\n"

 	files["config.yaml"] = []byte(configYAML)
@@ -657,6 +695,48 @@ func (h *WorkspaceHandler) ensureDefaultConfig(workspaceID string, payload model
 	return files
 }

+// deriveDefaultConfigProvider resolves the provider name the adapter should
+// see for (runtime, model) using the SAME providers manifest the config-SAVE
+// validators use (providerRegistry() + Manifest.DeriveProvider; see
+// model_registry_validation.go). It is intentionally fail-OPEN: any miss
+// (empty model, registry unavailable, unknown runtime, or a model the runtime
+// does not own) returns "" so the caller omits the `provider:` field and the
+// generated config keeps its pre-fix shape. It NEVER fails provisioning.
+//
+// `model` must be the FULL, un-normalized id (e.g. "moonshot/kimi-k2.6") so
+// DeriveProvider's exact-id match resolves the canvas claude-code case to
+// provider=platform. The availableAuthEnv arg is nil here — config-generation
+// has no per-workspace auth context yet (secrets are injected at container
+// start), matching the validators' nil call.
+func deriveDefaultConfigProvider(runtime, model string) string {
+	if strings.TrimSpace(model) == "" {
+		return ""
+	}
+	m, err := providerRegistry()
+	if err != nil || m == nil {
+		// Registry unavailable (a build-time defect the gen/sync gates catch).
+		// Fail open — do not stamp a provider, do not block provisioning.
+		return ""
+	}
+	p, err := m.DeriveProvider(runtime, model, nil)
+	if err != nil {
+		// Unknown runtime (federation / non-first-party) or a model the
+		// runtime does not own. Either way, omit the provider and let the
+		// runtime fall back to its prior derivation — preserving today's
+		// behavior for unregistered models.
+		return ""
+	}
+	return p.Name
+}
+
+// yamlEscapeSingleQuotedProvider escapes a value for a YAML single-quoted
+// scalar, mirroring CP's buildModelProviderYAML (a literal single quote is
+// doubled). Provider names are registry-controlled identifiers, so this is a
+// defense-in-depth measure rather than a hot path.
+func yamlEscapeSingleQuotedProvider(v string) string {
+	return strings.ReplaceAll(v, "'", "''")
+}
+
 func normalizeClaudeCodeModel(model string) string {
 	model = strings.TrimSpace(model)
 	if before, after, ok := strings.Cut(model, "/"); ok && before != "" && after != "" {
@@ -0,0 +1,196 @@
+package handlers
+
+// workspace_provision_platform_boot_test.go — the deterministic, SSOT-driven
+// regression suite for the class of bug behind the moonshot/kimi
+// "canvas-created claude-code workspace boots NOT_CONFIGURED" production
+// incident (RFC#340 Fix A #2187, canvas Fix C #2188).
+//
+// THE BUG (what shipped to prod):
+//   A claude-code workspace created via the canvas with provider=Platform +
+//   model="moonshot/kimi-k2.6" booted NOT_CONFIGURED. Unit tests passed; the
+//   REAL boot path was broken. ensureDefaultConfig generated a config.yaml that
+//   carried NO derived `provider:` key, so the cp#329 config-bundle the adapter
+//   actually reads left molecule-runtime config.py to slash-split the model id
+//   "moonshot/kimi-k2.6" -> provider="moonshot", which is NOT in the providers
+//   registry -> NOT_CONFIGURED.
+//
+// THE FIX A INVARIANT (this file pins it, and pins it for the WHOLE class):
+//   ensureDefaultConfig MUST stamp the manifest-derived provider into the
+//   generated config.yaml — at BOTH the top level and under runtime_config —
+//   for every (runtime, model) the providers SSOT maps to a platform provider.
+//   The single-combo pin (TestEnsureDefaultConfig_StampsDerivedProvider in
+//   workspace_provision_test.go) proves the headline case. THIS file closes the
+//   gap that single pin leaves: it is PARAMETRIZED OVER THE SSOT, so when a NEW
+//   platform model is added to providers.yaml for claude-code (or any runtime
+//   with a platform arm), the new id is automatically covered — a future
+//   platform model that fails to derive `provider: platform` fails THIS test at
+//   build time, before it can ship a NOT_CONFIGURED boot.
+//
+// WHY SSOT-DRIVEN AND NOT A HAND-MAINTAINED LIST:
+//   The original bug was a divergence between "what the canvas offers"
+//   (providers.yaml platform arm) and "what the config generator stamps". A
+//   hardcoded test model list would itself drift from the SSOT and re-open the
+//   same divergence gap. By enumerating the platform model set directly from the
+//   loaded providers.Manifest (the SAME manifest ensureDefaultConfig's
+//   deriveDefaultConfigProvider resolves against), this test cannot fall behind
+//   the offered set: add a platform model, get a test case for free; the test
+//   only passes if the generator actually stamps it.
+//
+// SCOPE: deterministic, no live infra. The REAL-boot complement (provision a
+// staging workspace and assert status=online + a completion returns 200 for the
+// SAME combo) is the bash staging harness — see
+// tests/e2e/test_staging_full_saas.sh (E2E_LLM_PATH=platform) and the
+// e2e-staging-platform-boot job in .gitea/workflows/e2e-staging-saas.yml. That
+// asserts the REAL artifact (booted status / completion); THIS asserts the
+// deterministic config-generation invariant the real boot depends on.
+
+import (
+	"testing"
+
+	"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/models"
+	"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/providers"
+	"gopkg.in/yaml.v3"
+)
+
+// platformModelsForRuntime returns the exact model ids the providers SSOT lists
+// under runtime rt's `platform` native provider arm — the set the canvas offers
+// as provider=Platform and the set ensureDefaultConfig MUST stamp
+// `provider: platform` for. Reads the SAME embedded manifest the config
+// generator derives against (providers.LoadManifest), so it can never drift from
+// the offered set. Returns nil when the runtime has no platform arm.
+func platformModelsForRuntime(t *testing.T, rt string) []string {
+	t.Helper()
+	m, err := providers.LoadManifest()
+	if err != nil {
+		t.Fatalf("LoadManifest: %v", err)
+	}
+	native, ok := m.Runtimes[rt]
+	if !ok {
+		t.Fatalf("providers SSOT has no runtimes entry for %q", rt)
+	}
+	for _, ref := range native.Providers {
+		if ref.Name == "platform" {
+			return ref.Models
+		}
+	}
+	return nil
+}
+
+// TestEnsureDefaultConfig_StampsProviderForEverySSOTPlatformModel is the
+// class-level regression for the moonshot/kimi NOT_CONFIGURED incident. For
+// EVERY model the providers SSOT offers under claude-code's platform arm, it
+// asserts the generated config.yaml carries the manifest-derived provider at
+// both the top level and under runtime_config. This is the Fix A invariant,
+// parametrized over the SSOT so a newly-offered platform model cannot ship
+// without the stamp (the exact divergence — offered-but-not-stamped — that
+// booted "moonshot/kimi-k2.6" into NOT_CONFIGURED).
+func TestEnsureDefaultConfig_StampsProviderForEverySSOTPlatformModel(t *testing.T) {
+	const runtime = "claude-code"
+	platformModels := platformModelsForRuntime(t, runtime)
+	if len(platformModels) == 0 {
+		t.Fatalf("providers SSOT lists no platform models for runtime %q — the regression matrix would be empty; the SSOT shape changed (this test is the canary)", runtime)
+	}
+	// Headline sentinel: the exact id that booted NOT_CONFIGURED in prod MUST be
+	// in the enumerated set. If a refactor drops it from the platform arm, this
+	// test must still cover it explicitly — fail loud rather than silently
+	// shrinking the matrix.
+	if !containsString(platformModels, "moonshot/kimi-k2.6") {
+		t.Fatalf("the headline incident model \"moonshot/kimi-k2.6\" is no longer in the claude-code platform SSOT set (%v) — regression coverage for the original bug would be lost", platformModels)
+	}
+
+	for _, model := range platformModels {
+		model := model
+		t.Run(model, func(t *testing.T) {
+			broadcaster := newTestBroadcaster()
+			handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir())
+
+			files := handler.ensureDefaultConfig("ws-platform-boot", models.CreateWorkspacePayload{
+				Name:    "Platform Boot Agent",
+				Tier:    2,
+				Runtime: runtime,
+				Model:   model,
+			})
+
+			raw, ok := files["config.yaml"]
+			if !ok {
+				t.Fatalf("expected config.yaml in generated files for model %q", model)
+			}
+
+			var parsed struct {
+				Model         string `yaml:"model"`
+				Provider      string `yaml:"provider"`
+				RuntimeConfig struct {
+					Model    string `yaml:"model"`
+					Provider string `yaml:"provider"`
+				} `yaml:"runtime_config"`
+			}
+			if err := yaml.Unmarshal(raw, &parsed); err != nil {
+				t.Fatalf("generated YAML invalid for model %q: %v\n%s", model, err, raw)
+			}
+
+			// The load-bearing invariant: BOTH the top-level and the
+			// runtime_config provider must be exactly "platform". An empty or
+			// vendor-namespace ("moonshot") value here is the prod NOT_CONFIGURED
+			// boot — the adapter would slash-split the model id and look up an
+			// unregistered provider.
+			if parsed.Provider != "platform" {
+				t.Errorf("model %q: top-level provider = %q, want \"platform\" (Fix A invariant — empty/vendor value is the NOT_CONFIGURED boot)\n%s", model, parsed.Provider, raw)
+			}
+			if parsed.RuntimeConfig.Provider != "platform" {
+				t.Errorf("model %q: runtime_config.provider = %q, want \"platform\"\n%s", model, parsed.RuntimeConfig.Provider, raw)
+			}
+			// Sanity: the config must still render a non-empty model (a config
+			// with provider but no model is equally undeployable).
+			if parsed.Model == "" {
+				t.Errorf("model %q: generated config has empty top-level model\n%s", model, raw)
+			}
+		})
+	}
+}
+
+// TestPlatformModelDeriveProvider_SSOTConsistency is the upstream half of the
+// same invariant, one layer below ensureDefaultConfig: it asserts the providers
+// manifest's DeriveProvider — the resolver deriveDefaultConfigProvider calls —
+// maps every SSOT-offered claude-code platform model to a provider whose Name is
+// "platform". If DeriveProvider itself regressed (e.g. a model_prefix_match
+// change made "moonshot/kimi-k2.6" resolve to the bare "moonshot" entry again),
+// this fails closer to the root cause than the config-shape test above, making
+// the diagnosis unambiguous: SSOT/derive regression vs config-emission
+// regression.
+func TestPlatformModelDeriveProvider_SSOTConsistency(t *testing.T) {
+	const runtime = "claude-code"
+	m, err := providers.LoadManifest()
+	if err != nil {
+		t.Fatalf("LoadManifest: %v", err)
+	}
+	platformModels := platformModelsForRuntime(t, runtime)
+	if len(platformModels) == 0 {
+		t.Fatalf("no platform models for %q in SSOT", runtime)
+	}
+	for _, model := range platformModels {
+		model := model
+		t.Run(model, func(t *testing.T) {
+			// nil availableAuthEnv mirrors deriveDefaultConfigProvider's call at
+			// config-generation time (no per-workspace auth context yet).
+			p, err := m.DeriveProvider(runtime, model, nil)
+			if err != nil {
+				t.Fatalf("DeriveProvider(%q, %q): unexpected error %v — an SSOT-offered platform model MUST derive", runtime, model, err)
+			}
+			if p.Name != "platform" {
+				t.Errorf("DeriveProvider(%q, %q).Name = %q, want \"platform\" (this is the exact slash-split-to-vendor regression that booted NOT_CONFIGURED)", runtime, model, p.Name)
+			}
+		})
+	}
+}
+
+// containsString is a tiny local membership helper. Kept here (not a shared
+// test util) so this regression file is self-contained and can be read top to
+// bottom without chasing helpers across the package.
+func containsString(xs []string, want string) bool {
+	for _, x := range xs {
+		if x == want {
+			return true
+		}
+	}
+	return false
+}
@@ -363,6 +363,74 @@ runtime_config:
 	}
 }

+// TestEnsureDefaultConfig_StampsDerivedProvider pins RFC#340 Fix A: a
+// canvas-created claude-code workspace with model "moonshot/kimi-k2.6" must
+// have the manifest-derived provider stamped into config.yaml at BOTH the top
+// level and under runtime_config, so the cp#329 config-bundle the adapter
+// reads no longer leaves the runtime to slash-split "moonshot/..." → an
+// unregistered provider="moonshot" (the original NOT_CONFIGURED boot). The
+// canonical manifest exact-id-matches "moonshot/kimi-k2.6" to provider=platform.
+func TestEnsureDefaultConfig_StampsDerivedProvider(t *testing.T) {
+	broadcaster := newTestBroadcaster()
+	handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir())
+
+	files := handler.ensureDefaultConfig("ws-moonshot", models.CreateWorkspacePayload{
+		Name:    "Kimi Agent",
+		Tier:    2,
+		Runtime: "claude-code",
+		Model:   "moonshot/kimi-k2.6",
+	})
+
+	var parsed struct {
+		Model         string `yaml:"model"`
+		Provider      string `yaml:"provider"`
+		RuntimeConfig struct {
+			Model    string `yaml:"model"`
+			Provider string `yaml:"provider"`
+		} `yaml:"runtime_config"`
+	}
+	if err := yaml.Unmarshal(files["config.yaml"], &parsed); err != nil {
+		t.Fatalf("generated YAML invalid: %v\n%s", err, files["config.yaml"])
+	}
+	if parsed.Provider != "platform" {
+		t.Errorf("top-level provider = %q, want platform\n%s", parsed.Provider, files["config.yaml"])
+	}
+	if parsed.RuntimeConfig.Provider != "platform" {
+		t.Errorf("runtime_config.provider = %q, want platform\n%s", parsed.RuntimeConfig.Provider, files["config.yaml"])
+	}
+	// The claude-code model normalization still strips the slash prefix.
+	if parsed.Model != "kimi-k2.6" {
+		t.Errorf("top-level model = %q, want kimi-k2.6\n%s", parsed.Model, files["config.yaml"])
+	}
+}
+
+// TestEnsureDefaultConfig_DeriveMissOmitsProvider pins requirement #3: a model
+// the providers manifest does NOT recognize for the runtime (a derive miss)
+// must NOT write any `provider:` key — neither top-level nor under
+// runtime_config — preserving the pre-fix behavior (no empty `provider:`,
+// provisioning never fails on a miss). "gpt-4o" is not a registered
+// claude-code model, so DeriveProvider errors and the field is omitted.
+func TestEnsureDefaultConfig_DeriveMissOmitsProvider(t *testing.T) {
+	broadcaster := newTestBroadcaster()
+	handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir())
+
+	files := handler.ensureDefaultConfig("ws-derivemiss", models.CreateWorkspacePayload{
+		Name:    "Unregistered Agent",
+		Tier:    1,
+		Runtime: "claude-code",
+		Model:   "gpt-4o",
+	})
+
+	content := string(files["config.yaml"])
+	if strings.Contains(content, "provider:") {
+		t.Errorf("derive miss must NOT write any provider: key, got:\n%s", content)
+	}
+	// Sanity: a derive miss must still produce a valid, model-bearing config.
+	if !strings.Contains(content, `model: "gpt-4o"`) {
+		t.Errorf("derive miss should still render the model, got:\n%s", content)
+	}
+}
+
 func TestEnsureDefaultConfig_CustomModel(t *testing.T) {
 	broadcaster := newTestBroadcaster()
 	handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir())
@@ -876,8 +876,9 @@ func (h *WorkspaceHandler) runRestartCycle(workspaceID string) {
 	h.provisionWorkspaceAutoSync(workspaceID, "", nil, payload)
 	// sendRestartContext is a one-way notification to the new container; safe
 	// to fire async — the next restart cycle won't depend on it completing.
-	// Tracked via goAsync so the test harness can drain it before the
-	// global db.DB swap (sendRestartContext reads db.DB).
+	// Tracked via h.goAsync so tests can wait for it via h.asyncWG before
+	// closing the sqlmock. Without this, untracked goroutines hit the restored
+	// mock and cause "was not expected" errors in parallel CI execution (mc#1264).
 	h.goAsync(func() { h.sendRestartContext(workspaceID, restartData) })
 }

@@ -0,0 +1,160 @@
+package models
+
+// Contract test: the EXACT request bodies the workspace runtime emits for
+// POST /registry/register and POST /registry/heartbeat bind cleanly against
+// the real RegisterPayload / HeartbeatPayload structs — and a body missing a
+// binding:"required" field is REJECTED.
+//
+// Why this exists — the same blind-spot class as the #2251 A2A bug
+// ----------------------------------------------------------------
+// The existing registry_test.go binds HAND-WRITTEN JSON literals
+// (`{"id":"ws-123","agent_card":{...}}`) that encode the *test author's*
+// idea of the wire shape, not the bytes the runtime actually produces. The
+// runtime's producer (molecule-ai-workspace-runtime main.py:484 /
+// heartbeat.py:233) is a separate hand-rolled dict. Nothing pinned that the
+// two agree on the required keys.
+//
+// These golden bodies are byte-for-byte the shapes the runtime emits (see the
+// companion Python contract test test_registry_payload_contract.py, which
+// asserts the runtime PRODUCES exactly these required keys). Together the two
+// halves form a producer→consumer contract: if the runtime drops a required
+// key, the Python test fails; if this struct adds/renames a required field,
+// the Go test below fails — drift can't pass silently on either side.
+//
+// gin's ShouldBindJSON runs `binding.JSON.BindBody`, which is json.Unmarshal
+// followed by the go-playground validator on the `binding` tags. We invoke
+// that exact path here without standing up a gin.Context / DB / Redis.
+
+import (
+	"testing"
+
+	"github.com/gin-gonic/gin/binding"
+)
+
+// bindJSON mirrors gin's ShouldBindJSON: decode + validate the `binding` tags.
+func bindJSON(t *testing.T, body []byte, out any) error {
+	t.Helper()
+	return binding.JSON.BindBody(body, out)
+}
+
+// ---- /registry/register --------------------------------------------------
+
+// The exact body main.py emits (workspace_id + workspace_url + the hand-rolled
+// agent_card_dict). agent_card is json.RawMessage on the struct so its inner
+// shape is opaque to the bind — only presence is required.
+const runtimeRegisterBody = `{
+  "id": "11111111-1111-1111-1111-111111111111",
+  "url": "https://ws.example/a2a",
+  "agent_card": {
+    "name": "pm",
+    "description": "team lead",
+    "version": "1.0.0",
+    "url": "https://ws.example/a2a",
+    "skills": [{"id": "coding", "name": "coding", "description": "coding", "tags": []}],
+    "capabilities": {"streaming": true, "pushNotifications": false},
+    "configuration_status": "ready"
+  }
+}`
+
+func TestRegisterPayload_RuntimeBodyBinds(t *testing.T) {
+	var p RegisterPayload
+	if err := bindJSON(t, []byte(runtimeRegisterBody), &p); err != nil {
+		t.Fatalf("runtime register body must bind against RegisterPayload, got: %v", err)
+	}
+	if p.ID != "11111111-1111-1111-1111-111111111111" {
+		t.Errorf("id not decoded: %q", p.ID)
+	}
+	if len(p.AgentCard) == 0 {
+		t.Error("agent_card must be present (binding:required)")
+	}
+	if p.URL == "" {
+		t.Error("url should round-trip from the runtime body")
+	}
+}
+
+func TestRegisterPayload_MissingID_Rejected(t *testing.T) {
+	// The #2251-style regression: runtime drops the required `id` key.
+	const noID = `{"url":"https://ws.example/a2a","agent_card":{"name":"pm"}}`
+	var p RegisterPayload
+	if err := bindJSON(t, []byte(noID), &p); err == nil {
+		t.Fatal("a register body missing the required `id` MUST be rejected (would 400); got nil error")
+	}
+}
+
+func TestRegisterPayload_MissingAgentCard_Rejected(t *testing.T) {
+	const noCard = `{"id":"ws-1","url":"https://ws.example/a2a"}`
+	var p RegisterPayload
+	if err := bindJSON(t, []byte(noCard), &p); err == nil {
+		t.Fatal("a register body missing the required `agent_card` MUST be rejected (would 400); got nil error")
+	}
+}
+
+// ---- /registry/heartbeat -------------------------------------------------
+
+// The exact body heartbeat.py:233 emits (no wedge/metadata, the healthy case).
+const runtimeHeartbeatBody = `{
+  "workspace_id": "00000000-0000-0000-0000-000000000688",
+  "error_rate": 0.0,
+  "sample_error": "",
+  "active_tasks": 0,
+  "current_task": "",
+  "uptime_seconds": 42
+}`
+
+func TestHeartbeatPayload_RuntimeBodyBinds(t *testing.T) {
+	var p HeartbeatPayload
+	if err := bindJSON(t, []byte(runtimeHeartbeatBody), &p); err != nil {
+		t.Fatalf("runtime heartbeat body must bind against HeartbeatPayload, got: %v", err)
+	}
+	if p.WorkspaceID != "00000000-0000-0000-0000-000000000688" {
+		t.Errorf("workspace_id not decoded: %q", p.WorkspaceID)
+	}
+	if p.UptimeSeconds != 42 {
+		t.Errorf("uptime_seconds not decoded: %d", p.UptimeSeconds)
+	}
+}
+
+// The wedged-runtime heartbeat (heartbeat.py _runtime_state_payload +
+// _runtime_metadata_payload layered on) must also bind — runtime_metadata is a
+// pointer so a present block decodes, and an absent one stays nil.
+const runtimeHeartbeatWedgedBody = `{
+  "workspace_id": "00000000-0000-0000-0000-000000000688",
+  "error_rate": 0.5,
+  "active_tasks": 1,
+  "current_task": "stuck",
+  "uptime_seconds": 99,
+  "runtime_state": "wedged",
+  "sample_error": "Control request timeout: initialize",
+  "runtime_metadata": {
+    "capabilities": {"heartbeat": true, "scheduler": false},
+    "idle_timeout_seconds": 600
+  }
+}`
+
+func TestHeartbeatPayload_WedgedRuntimeBodyBinds(t *testing.T) {
+	var p HeartbeatPayload
+	if err := bindJSON(t, []byte(runtimeHeartbeatWedgedBody), &p); err != nil {
+		t.Fatalf("wedged heartbeat body must bind, got: %v", err)
+	}
+	if p.RuntimeState != "wedged" {
+		t.Errorf("runtime_state not decoded: %q", p.RuntimeState)
+	}
+	if p.RuntimeMetadata == nil {
+		t.Fatal("runtime_metadata must decode to a non-nil pointer when present")
+	}
+	if got := p.RuntimeMetadata.Capabilities["heartbeat"]; !got {
+		t.Error("runtime_metadata.capabilities[heartbeat] should be true")
+	}
+	if p.RuntimeMetadata.IdleTimeoutSeconds == nil || *p.RuntimeMetadata.IdleTimeoutSeconds != 600 {
+		t.Error("runtime_metadata.idle_timeout_seconds should decode to 600")
+	}
+}
+
+func TestHeartbeatPayload_MissingWorkspaceID_Rejected(t *testing.T) {
+	// The drift the producer-side Python test guards: workspace_id renamed/dropped.
+	const renamed = `{"id":"ws-688","error_rate":0.0,"active_tasks":0}`
+	var p HeartbeatPayload
+	if err := bindJSON(t, []byte(renamed), &p); err == nil {
+		t.Fatal("a heartbeat body missing the required `workspace_id` MUST be rejected (would 400); got nil error")
+	}
+}
@@ -99,10 +99,16 @@ func TestDeriveProvider_UnregisteredErrors(t *testing.T) {
 		runtime string
 		model   string
 	}{
-		// gpt-* is OpenAI — not in claude-code's native set.
+		// gpt-* is OpenAI — not in claude-code's native set (no openai arm;
+		// the platform-shared openai vendor is never wired into a BYOK runtime).
 		{"claude-code", "gpt-5.5"},
-		// deepseek is a catalog provider but in NO runtime's native set.
-		{"claude-code", "deepseek-v4-pro"},
+		// qwen-* is alibaba — a catalog provider NOT wired into claude-code
+		// (cp#529 wires alibaba only into hermes; claude-code's name-only BYOK
+		// arms are zai/deepseek/xiaomi-mimo). So it stays unregistered here.
+		// (NB: deepseek-* IS now routable on claude-code via the deepseek
+		// name-only arm — see the routability tests — so it is no longer a valid
+		// "unregistered" example; qwen replaces it.)
+		{"claude-code", "qwen-max"},
 		// codex is OpenAI-only — a kimi id is unregistered for it.
 		{"codex", "kimi-for-coding"},
 		// a slug no provider in the manifest matches at all.
--- a/Show More
+++ b/Show More