Merge branch 'main' into fix/plugin-uninstall-exec-errors

Merge PR #2362 via Gitea merge queue
Serialized merge by gitea-merge-queue after current-main, genuine approvals, and required CI checks were green.
2026-06-06 13:30:27 +00:00 · 2026-06-06 12:50:47 +00:00 · 2026-06-06 11:05:42 +00:00 · 2026-06-06 10:50:18 +00:00 · 2026-06-06 10:13:50 +00:00 · 2026-06-06 03:07:06 -07:00
54 changed files with 5592 additions and 350 deletions
@@ -361,15 +361,17 @@ def detect_drift(branch: str) -> tuple[list[str], dict]:
    """Returns (findings, debug). Empty findings == no drift.

    Raises:
-        ApiError: propagated from the protection fetch only when the
-                  failure is likely a transient Gitea outage (5xx).
-                  403/404 from the protection endpoint is treated as
-                  "cannot determine drift for this branch" — a token-
-                  scope issue (missing repo-admin on DRIFT_BOT_TOKEN) or
-                  a repo with no protection set should not turn the
-                  hourly cron red. The workflow continues to the next
-                  branch; no [ci-drift] issue is filed for a branch
-                  whose protection cannot be read.
+        ApiError: propagated (fail-closed) on a transient Gitea outage
+                  (5xx) AND on a 401/403 auth failure from the protection
+                  endpoint. A 401/403 means DRIFT_BOT_TOKEN cannot read
+                  branch protections at all — drift is UNVERIFIABLE, so
+                  this HARD gate must fail loud rather than green
+                  undetected drift (the regression class it exists to
+                  catch). An authenticated 404 (branch genuinely has no
+                  protection, e.g. staging pre-rollout) is the one
+                  tolerated skip: it returns ([], debug) with a loud
+                  ::warning:: and the workflow continues to the next
+                  branch.
    """
    findings: list[str] = []

@@ -403,17 +405,38 @@ def detect_drift(branch: str) -> tuple[list[str], dict]:
        m = _re.search(r"HTTP (\d{3})", msg)
        if m:
            http_status = int(m.group(1))
-        if http_status in (403, 404):
-            # Token lacks scope OR branch has no protection. Cannot
-            # determine drift — skip this branch. Do NOT exit non-zero;
-            # the issue IS the alarm, not a red workflow.
+        # FAIL-CLOSED contract (was fail-open: 403 AND 404 both logged an
+        # ::error:: but returned [] and left the run green — fixed). This is
+        # a HARD gate (continue-on-error unset, i.e. false) running hourly on
+        # a PROTECTED context (schedule/dispatch on main). We split
+        # auth-failure from genuinely-absent:
+        #   401/403 → AUTH FAILURE: the token cannot read branch
+        #     protections at all, so drift CANNOT be determined for ANY
+        #     branch. Greening the hourly cron here means jobs↔protection
+        #     drift goes silently undetected — exactly the regression class
+        #     this sentinel exists to catch. Raise so the workflow fails
+        #     loud / fails closed.
+        #   404 → authenticated absent resource: this specific branch has
+        #     no protection (e.g. `staging` before its protection rollout).
+        #     Genuinely nothing to diff against — skip THIS branch with a
+        #     loud ::warning::, continue to the next.
+        if http_status in (401, 403):
            sys.stderr.write(
-                f"::error::GET {protection_path} returned HTTP {http_status} — "
-                f"DRIFT_BOT_TOKEN lacks repo-admin scope (Gitea 1.22.6 "
-                f"requires it for this endpoint) OR branch has no protection "
-                f"configured. Cannot determine drift for {branch}; "
-                f"skipping. Fix: grant repo-admin to mc-drift-bot or "
-                f"configure protection on {branch}.\n"
+                f"::error::GET {protection_path} returned HTTP "
+                f"{http_status} — DRIFT_BOT_TOKEN cannot read branch "
+                f"protections (needs repo-admin scope). AUTH FAILURE: "
+                f"drift CANNOT be determined, so this HARD gate FAILS "
+                f"CLOSED rather than greening undetected drift. Fix: grant "
+                f"repo-admin to mc-drift-bot (org team `drift-bot`, "
+                f"perm=admin) — fix the token, not the lint.\n"
+            )
+            raise
+        if http_status == 404:
+            sys.stderr.write(
+                f"::warning::GET {protection_path} returned HTTP 404 — "
+                f"branch '{branch}' has no protection configured "
+                f"(authenticated absent resource). Skipping drift check for "
+                f"{branch}; if it SHOULD be protected, configure it.\n"
            )
            debug = {
                "branch": branch,
@@ -424,7 +447,7 @@ def detect_drift(branch: str) -> tuple[list[str], dict]:
                "audit_env_checks": sorted(env_set),
            }
            return [], debug
-        # 5xx — propagate (transient outage, fail loud per design).
+        # 5xx / anything else — propagate (fail loud per design; 5xx is likely a transient outage).
        raise
    if not isinstance(protection, dict):
        sys.stderr.write(
@@ -1,16 +1,77 @@
 #!/usr/bin/env python3
 """gitea-merge-queue — conservative serialized merge bot for Gitea.

-Gitea 1.22.6 has auto-merge (`pull_auto_merge`) but no GitHub-style merge
+Gitea 1.22.6+ has auto-merge (`pull_auto_merge`) but no GitHub-style merge
 queue. This script provides the missing serialized policy in user space:

-1. Pick the oldest open PR carrying QUEUE_LABEL.
-2. Refuse to act unless main is green.
+1. Scan open same-repo PRs that are NOT opted out (auto-discovery, see below),
+   oldest-first, skipping drafts, until an ACTIONABLE one is found. A non-ready
+   candidate (REQUEST_CHANGES, mergeable!=True, insufficient genuine approvals,
+   or red required CI) is SKIPPED so it cannot head-of-line block newer ready
+   PRs; the scan continues to the next candidate.
+2. Refuse to act unless main's BP-required contexts are green.
 3. Refuse fork PRs; the queue may only mutate same-repo branches.
 4. If the PR branch does not contain current main, call Gitea's
   /pulls/{n}/update endpoint and stop. CI must rerun on the updated head.
-5. If the updated PR head has all required contexts green, merge with the
-   non-bypass merge actor token.
+5. Merge ONLY when, on the PR's CURRENT head sha:
+     - >= REQUIRED_APPROVALS distinct GENUINE official APPROVED reviews from
+       the recognised reviewer set (not stale, not dismissed, and commit_id ==
+       current head whenever the review carries a commit_id), AND
+     - no open official REQUEST_CHANGES on the current head, AND
+     - every BP-required status context is green, AND
+     - the PR is mergeable.
+
+Authoritative gates (fail-closed):
+  - The REQUIRED status contexts come from BRANCH PROTECTION
+    (`status_check_contexts`), not a hand-maintained env list. If branch
+    protection cannot be enumerated, the queue HOLDS (does not merge blindly).
+  - NON-required reds (qa-review, security-review, sop-tier, sop-checklist
+    when not branch-required, E2E Chat, Staging SaaS, ci-arm64-advisory, any
+    continue-on-error job) MUST NOT block. They are reported, never gating.
+  - `force_merge=true` is used ONLY when the merge is blocked *solely* by
+    missing-but-non-required governance contexts (required are green + genuine
+    approvals present). It is NEVER used to bypass a failing REQUIRED context
+    or missing approvals.
+
+Auto-discovery (opt-OUT, label-optional):
+  The queue is SELF-SUSTAINING — a ready PR does NOT need a human (or an agent)
+  to add the `merge-queue` label first. When AUTO_DISCOVER is on (default), the
+  queue enumerates ALL open same-repo PRs and considers any that meets the full
+  merge bar (genuine approvals on current head + BP-required green + mergeable +
+  no open REQUEST_CHANGES). The merge bar above is UNCHANGED; auto-discovery only
+  changes WHICH PRs are considered, not whether they are mergeable.
+
+  This deliberately removes the historical dependency on an agent adding the
+  `merge-queue` label — agent Gitea tokens lack `write:issue` (labels are
+  issue-scoped), so they could never self-label and the queue stalled. The label
+  is now OPTIONAL metadata, not a gate.
+
+  SAFETY is preserved as opt-OUT: any PR carrying an opt-out label
+  (OPT_OUT_LABELS — `merge-queue-hold`, `do-not-auto-merge`, `wip`, `draft` by
+  default) is skipped (never auto-considered, never merged). Draft PRs
+  (draft=true STATE) are also skipped; the literal `draft` LABEL is an
+  additional explicit opt-out a human can apply without converting to a draft.
+  A human who wants to keep a PR out of autonomous merging just adds one of
+  those labels. Setting AUTO_DISCOVER=0 restores the legacy opt-IN behaviour
+  (only PRs already carrying QUEUE_LABEL are considered).
+
+Head-of-line (HOL) safety has two complementary layers:
+  (a) The queue SCANS THROUGH the FIFO candidate list and skips any non-ready
+      PR (REQUEST_CHANGES, mergeable!=True, insufficient genuine approvals, or
+      red required CI) instead of locking on the oldest and waiting, so a PR
+      that can never become ready without human action does not block newer
+      ready PRs.
+  (b) For the candidate the scan acts on, two permanent failure modes HOLD the
+      PR (apply HOLD_LABEL) and let the scan CONTINUE to the next candidate
+      rather than re-selecting the same wedged PR every tick:
+        - a permanent permission/4xx merge error (403/404/405), and
+        - a persistent branch-update conflict (the /update endpoint returns
+          HTTP 409 because the PR branch cannot be merged with main without a
+          manual rebase). A conflict will not self-resolve, so retrying it
+          every tick would HOL-block every ready PR behind it (issue #2352).
+
+Status-fetch is fail-closed: if the combined status for a sha cannot be
+fetched, the PR is skipped this tick (never treated as green).

 The script is intentionally one-PR-per-run. Workflow/cron concurrency should
 serialize invocations so two green PRs cannot merge against the same main.
@@ -40,6 +101,33 @@ WATCH_BRANCH = _env("WATCH_BRANCH", default="main")
 QUEUE_LABEL = _env("QUEUE_LABEL", default="merge-queue")
 HOLD_LABEL = _env("HOLD_LABEL", default="merge-queue-hold")
 UPDATE_STYLE = _env("UPDATE_STYLE", default="merge")
+# Auto-discovery (opt-OUT). When truthy (default), the queue considers ALL open
+# same-repo PRs that meet the merge bar, not only PRs already carrying
+# QUEUE_LABEL — so the queue is self-sustaining without any human/agent labeling
+# (agent tokens lack write:issue and cannot self-label). Set AUTO_DISCOVER=0 to
+# restore the legacy opt-IN behaviour (QUEUE_LABEL required to be considered).
+AUTO_DISCOVER = _env("AUTO_DISCOVER", default="1").strip().lower() not in {
+    "0",
+    "false",
+    "no",
+    "off",
+    "",
+}
+# Opt-OUT labels. A PR carrying ANY of these is skipped (never auto-considered,
+# never merged) — the human escape hatch from autonomous merging. HOLD_LABEL is
+# always included so the existing hold semantics keep working. `do-not-auto-merge`
+# and `wip` let a human keep a PR out of the auto-merge path without removing it.
+# `draft` is included as a literal label too: Gitea draft STATE (draft=true) is
+# already skipped via _issue_is_draft, but a "draft" LABEL is an additional,
+# explicit opt-out signal a human can apply without converting the PR to a draft.
+OPT_OUT_LABELS = {
+    name.strip()
+    for name in _env(
+        "OPT_OUT_LABELS",
+        default="do-not-auto-merge,wip,draft",
+    ).split(",")
+    if name.strip()
+} | ({HOLD_LABEL} if HOLD_LABEL else set())
 REQUIRED_CONTEXTS_RAW = _env(
    "REQUIRED_CONTEXTS",
    default=(
@@ -57,6 +145,24 @@ PUSH_REQUIRED_CONTEXTS_RAW = _env(
    default="CI / all-required (push)",
 )

+# Recognised official-reviewer set. A merge requires this many DISTINCT genuine
+# approvals (not stale/dismissed, on the current head sha) from accounts in
+# this set. The set is the real agents-team reviewer roster; founder/CTO-agent
+# accounts are intentionally excluded so the queue cannot be satisfied by a
+# human/owner approval alone — it must be a genuine peer review.
+REVIEWER_SET = {
+    name.strip()
+    for name in _env(
+        "REVIEWER_SET",
+        default="agent-reviewer,agent-researcher,agent-reviewer-cr2",
+    ).split(",")
+    if name.strip()
+}
+# Default mirrors molecule-core branch protection (required_approvals: 2). The
+# authoritative value is read from branch protection at runtime; this is only
+# the fallback when BP does not specify one.
+REQUIRED_APPROVALS_DEFAULT = int(_env("REQUIRED_APPROVALS", default="2") or "2")
+
 OWNER, NAME = (REPO.split("/", 1) + [""])[:2] if REPO else ("", "")
 API = f"https://{GITEA_HOST}/api/v1" if GITEA_HOST else ""

@@ -67,7 +173,27 @@ class ApiError(RuntimeError):

 class MergePermissionError(ApiError):
    """Merge failed with a permanent permission error (403/404/405).
-    The queue should skip this PR and move to the next one."""
+    The queue should HOLD this PR and move to the next one."""
+
+
+class BranchUpdateConflictError(ApiError):
+    """Updating the PR branch with the base hit a merge-conflict (HTTP 409).
+
+    A true merge-conflict is NOT transient: the branch cannot be auto-updated
+    until a human/agent rebases it. The queue should HOLD this PR (apply
+    HOLD_LABEL) and advance to the next candidate, exactly like the permission
+    path — otherwise the conflicted PR sits at the queue head and is retried
+    every tick forever, head-of-line-blocking every ready PR behind it.
+
+    NOTE: distinct from mergeable=None, which is Gitea STILL COMPUTING conflict
+    state — that case is handled as a transient WAIT (no hold). This error is
+    only raised on an explicit 409 returned by the /update endpoint."""
+
+
+class BranchProtectionUnavailable(ApiError):
+    """Branch protection (the authoritative required-context source) could not
+    be enumerated. The queue must HOLD rather than merge with an unverified
+    required-context set (fail-closed, no fail-open)."""


@dataclasses.dataclass(frozen=True)
@@ -75,6 +201,20 @@ class MergeDecision:
    ready: bool
    action: str
    reason: str
+    # When ready is True, force indicates the merge is blocked SOLELY by
+    # missing-but-non-required governance contexts (required are green +
+    # genuine approvals present), so force_merge=true is justified to bypass
+    # ONLY those non-required contexts. Defaults False.
+    force: bool = False
+
+
+@dataclasses.dataclass(frozen=True)
+class BranchProtection:
+    """The subset of branch protection the queue depends on."""
+
+    required_contexts: list[str]
+    required_approvals: int
+    block_on_rejected_reviews: bool


 def _require_runtime_env() -> None:
@@ -191,6 +331,117 @@ def required_contexts_green(
    return not missing_or_bad, missing_or_bad


+def parse_branch_protection(body: Any) -> BranchProtection:
+    """Extract the queue-relevant fields from a branch_protections payload.
+
+    Fail-closed: raises BranchProtectionUnavailable when status checks are
+    expected but the required-context list cannot be enumerated. We never fall
+    back to a hand-maintained env list as the authoritative required set —
+    doing so risks merging when a real required context is red/missing.
+    """
+    if not isinstance(body, dict):
+        raise BranchProtectionUnavailable("branch protection response not an object")
+    enable = bool(body.get("enable_status_check"))
+    contexts_raw = body.get("status_check_contexts")
+    if not enable:
+        # Status checks not enforced by BP at all. With no required contexts
+        # the queue would gate on approvals only — acceptable, but make it
+        # explicit and let the caller decide.
+        contexts: list[str] = []
+    else:
+        if not isinstance(contexts_raw, list):
+            raise BranchProtectionUnavailable(
+                "enable_status_check is true but status_check_contexts is not a list"
+            )
+        contexts = [c for c in contexts_raw if isinstance(c, str) and c.strip()]
+        if not contexts:
+            raise BranchProtectionUnavailable(
+                "enable_status_check is true but status_check_contexts is empty"
+            )
+    approvals = body.get("required_approvals")
+    required_approvals = (
+        int(approvals) if isinstance(approvals, int) else REQUIRED_APPROVALS_DEFAULT
+    )
+    return BranchProtection(
+        required_contexts=contexts,
+        required_approvals=required_approvals,
+        block_on_rejected_reviews=bool(body.get("block_on_rejected_reviews")),
+    )
+
+
+def get_branch_protection(branch: str) -> BranchProtection:
+    """Fetch branch protection for `branch`; fail-closed if unavailable."""
+    try:
+        _, body = api("GET", f"/repos/{OWNER}/{NAME}/branch_protections/{branch}")
+    except ApiError as exc:
+        raise BranchProtectionUnavailable(
+            f"could not fetch branch protection for {branch}: {exc}"
+        ) from exc
+    return parse_branch_protection(body)
+
+
+def genuine_approvals(
+    reviews: list[dict],
+    *,
+    head_sha: str,
+    reviewer_set: set[str],
+) -> tuple[set[str], list[str]]:
+    """Reduce a PR's reviews to genuine official approvals on the CURRENT head.
+
+    Returns (approvers, request_changes) where:
+      - approvers is the set of distinct logins (in reviewer_set) whose LATEST
+        review on the current head is an official, non-stale, non-dismissed
+        APPROVED, and
+      - request_changes is the list of logins (in reviewer_set) whose latest
+        official review on the current head is REQUEST_CHANGES.
+
+    "Current head" is enforced two ways, because Gitea exposes both signals:
+    a review must be `official` and NOT `stale`/`dismissed`, AND when the
+    review carries a commit_id it must equal head_sha. A review with no
+    commit_id but stale=False/dismissed=False is accepted (older Gitea rows).
+    We take each reviewer's LATEST submission (reviews arrive oldest-first), so
+    a later REQUEST_CHANGES correctly supersedes an earlier APPROVED and vice
+    versa.
+    """
+    latest_by_user: dict[str, dict] = {}
+    for review in reviews:
+        if not isinstance(review, dict):
+            continue
+        user = (review.get("user") or {}).get("login")
+        if not isinstance(user, str) or user not in reviewer_set:
+            continue
+        state = str(review.get("state") or "").upper()
+        if state not in {"APPROVED", "REQUEST_CHANGES"}:
+            continue  # ignore COMMENT/PENDING/DISMISSED-state rows
+        # reviews are returned oldest-first; later entries overwrite → latest wins
+        latest_by_user[user] = review
+
+    approvers: set[str] = set()
+    request_changes: list[str] = []
+    for user, review in latest_by_user.items():
+        if not review.get("official"):
+            continue
+        if review.get("stale") or review.get("dismissed"):
+            continue
+        commit_id = review.get("commit_id")
+        if isinstance(commit_id, str) and commit_id and head_sha:
+            if commit_id != head_sha:
+                continue  # review was on a previous head
+        state = str(review.get("state") or "").upper()
+        if state == "APPROVED":
+            approvers.add(user)
+        elif state == "REQUEST_CHANGES":
+            request_changes.append(user)
+    return approvers, request_changes
+
+
+def get_pull_reviews(pr_number: int) -> list[dict]:
+    _, body = api("GET", f"/repos/{OWNER}/{NAME}/pulls/{pr_number}/reviews")
+    if not isinstance(body, list):
+        raise ApiError(f"PR #{pr_number} reviews response not list")
+    return body
+
+
 def label_names(issue: dict) -> set[str]:
    return {
        label["name"]
@@ -219,6 +470,85 @@ def choose_next_queued_issue(
    return candidates[0] if candidates else None


+def _issue_is_draft(issue: dict) -> bool:
+    """True if the issue/PR is a draft.
+
+    The /issues listing exposes draft state under the `pull_request` sub-object
+    (`{"draft": true}`); some Gitea versions also surface a top-level `draft`.
+    Either is honoured. Drafts are never auto-considered for merging.
+    """
+    pr = issue.get("pull_request")
+    if isinstance(pr, dict) and pr.get("draft") is True:
+        return True
+    return issue.get("draft") is True
+
+
+def choose_candidate_issues(
+    issues: list[dict],
+    *,
+    queue_label: str,
+    opt_out_labels: set[str],
+    auto_discover: bool,
+) -> list[dict]:
+    """All open PRs eligible for a merge attempt this tick, oldest-first.
+
+    This is the auto-discovery selector. It does NOT change the merge bar — it
+    only changes WHICH PRs are considered:
+
+      - auto_discover=True (default): every PR in `issues` is a candidate
+        (forks are not filtered here), EXCEPT those carrying an opt-out label or
+        marked draft. QUEUE_LABEL is optional metadata, not a gate, so a ready PR
+        reaches the queue with no human/agent labeling (write:issue gap removed).
+      - auto_discover=False: legacy opt-IN — only PRs carrying queue_label are
+        candidates (still skipping opt-out labels and drafts).
+
+    Opt-out is the safety escape hatch: any opt_out_labels member present skips
+    the PR entirely (never considered, never merged). Ordering is oldest-first
+    (created_at, then number) to preserve the serialized FIFO ordering.
+
+    Returns the FULL ordered list (not just the head) so process_once can SCAN
+    THROUGH non-ready candidates instead of locking on the oldest. A non-ready
+    auto-discovered PR (e.g. one with REQUEST_CHANGES or mergeable=false, which
+    can never become ready without human action) must NOT head-of-line block the
+    newer ready PRs behind it — the readiness check happens per-candidate in
+    process_once, and a `wait` candidate is skipped to the next one.
+    """
+    candidates = []
+    for issue in issues:
+        if "pull_request" not in issue:
+            continue
+        labels = label_names(issue)
+        if opt_out_labels & labels:
+            continue  # opt-out: human kept this PR out of autonomous merging
+        if _issue_is_draft(issue):
+            continue  # drafts are never auto-merged
+        if not auto_discover and queue_label not in labels:
+            continue  # legacy opt-IN: require the queue label
+        candidates.append(issue)
+    candidates.sort(key=lambda issue: (issue.get("created_at") or "", int(issue["number"])))
+    return candidates
+
+
+def choose_next_candidate_issue(
+    issues: list[dict],
+    *,
+    queue_label: str,
+    opt_out_labels: set[str],
+    auto_discover: bool,
+) -> dict | None:
+    """The oldest eligible candidate, or None. Thin head-of-list wrapper around
+    choose_candidate_issues; retained for callers/tests that only want the head.
+    process_once uses the full list (choose_candidate_issues) so it can scan past
+    non-ready PRs rather than HOL-block on the oldest."""
+    candidates = choose_candidate_issues(
+        issues,
+        queue_label=queue_label,
+        opt_out_labels=opt_out_labels,
+        auto_discover=auto_discover,
+    )
+    return candidates[0] if candidates else None
+
+
 def pr_contains_base_sha(commits: list[dict], base_sha: str) -> bool:
    for commit in commits:
        sha = commit.get("sha") or commit.get("id")
@@ -233,36 +563,87 @@ def pr_has_current_base(pr: dict, commits: list[dict], main_sha: str) -> bool:
    return pr_contains_base_sha(commits, main_sha)


+def _non_required_red_present(
+    latest: dict[str, dict],
+    required_contexts: list[str],
+) -> bool:
+    """True if any NON-required context present in `latest` is non-success.
+
+    Such reds are the governance/SOP/advisory checks Gitea may still treat as
+    "missing required context" at merge time even though branch protection does
+    not require them. Their presence is what justifies force_merge=true (we
+    have already verified every REQUIRED context is green and approvals are
+    genuine, so force only bypasses these non-required reds).
+    """
+    required = set(required_contexts)
+    for context, status in latest.items():
+        if context in required:
+            continue
+        if status_state(status) != "success":
+            return True
+    return False
+
+
 def evaluate_merge_readiness(
    *,
    main_status: dict,
    pr_status: dict,
    required_contexts: list[str],
+    required_approvals: int,
+    approvers: set[str],
+    request_changes: list[str],
    pr_has_current_base: bool,
+    mergeable: bool,
    pr_labels: set[str] | None = None,
 ) -> MergeDecision:
-    # Check push-required contexts explicitly instead of combined state.
-    # Combined state can be "failure" due to non-blocking jobs
-    # (continue-on-error: true) that don't actually gate merges.
-    # CI / all-required (push) is the authoritative gate — it respects
-    # continue-on-error and correctly aggregates all blocking failures.
+    # 1) Main's push-required contexts must be green. Combined state can be
+    #    "failure" due to non-blocking jobs (continue-on-error: true) that do
+    #    not gate merges, so check the explicit required set, not combined.
    main_latest = latest_statuses_by_context(main_status.get("statuses") or [])
    main_ok, main_bad = required_contexts_green(main_latest, push_required_contexts())
    if not main_ok:
        return MergeDecision(False, "pause", "main required contexts not green: " + ", ".join(main_bad))
+
+    # 2) PR head must contain current main.
    if not pr_has_current_base:
        return MergeDecision(False, "update", "PR head does not contain current main")

-    # Check explicit required contexts instead of combined state. Combined state
-    # can be "failure" due to non-blocking jobs with continue-on-error: true
-    # (e.g. publish-runtime-autobump/pr-validate, qa-review on stale tokens).
-    # The required_contexts list is the authoritative gate — it includes only
-    # the checks that actually block merges.
+    # 3) No open official REQUEST_CHANGES on the current head.
+    if request_changes:
+        return MergeDecision(
+            False, "wait",
+            "open REQUEST_CHANGES on current head from: " + ", ".join(sorted(request_changes)),
+        )
+
+    # 4) Enough distinct genuine official approvals on the current head.
+    if len(approvers) < required_approvals:
+        return MergeDecision(
+            False, "wait",
+            f"insufficient genuine approvals on current head: have "
+            f"{len(approvers)} ({', '.join(sorted(approvers)) or 'none'}), "
+            f"need {required_approvals}",
+        )
+
+    # 5) Every BRANCH-PROTECTION-REQUIRED status context must be green. This is
+    #    the authoritative status gate — NON-required reds (qa-review,
+    #    security-review, sop-tier/sop-checklist when not BP-required, E2E Chat,
+    #    Staging SaaS, ci-arm64-advisory, continue-on-error jobs) are NOT
+    #    consulted here and must not block.
    latest = latest_statuses_by_context(pr_status.get("statuses") or [])
    ok, missing_or_bad = required_contexts_green(latest, required_contexts, pr_labels)
    if not ok:
        return MergeDecision(False, "wait", "required contexts not green: " + ", ".join(missing_or_bad))
-    return MergeDecision(True, "merge", "ready")
+
+    # 6) Gitea must consider the PR mergeable (no conflicts).
+    if not mergeable:
+        return MergeDecision(False, "wait", "PR is not mergeable (conflicts)")
+
+    # Ready. Use force_merge ONLY if the merge would otherwise be blocked by
+    # missing-but-non-required governance contexts. Required are green and
+    # approvals are genuine, so force only bypasses non-required reds — never a
+    # failing required context or missing approval.
+    force = _non_required_red_present(latest, required_contexts)
+    return MergeDecision(True, "merge", "ready", force=force)


 def get_branch_head(branch: str) -> str:
@@ -280,6 +661,12 @@ def get_combined_status(sha: str) -> dict:
    The /status endpoint caps the `statuses` array at 30 entries (Gitea
    default page size), so we fetch the full list via /statuses with a
    higher limit. The combined `state` still comes from /status.
+
+    Fail-closed: the PRIMARY /status fetch must succeed. If it raises, the
+    error propagates so the caller skips this PR this tick (we never treat a
+    failed status fetch as green — dev-sop "no fail-open"). Only the SECONDARY
+    /statuses enrichment (which merely extends the per-context list beyond the
+    30-entry cap) is best-effort; if it fails we still have the combined set.
    """
    _, combined = api("GET", f"/repos/{OWNER}/{NAME}/commits/{sha}/status")
    if not isinstance(combined, dict):
@@ -329,6 +716,31 @@ def list_queued_issues() -> list[dict]:
    return body


+def list_candidate_issues(*, auto_discover: bool) -> list[dict]:
+    """Open PR issues eligible for consideration this tick.
+
+    With auto_discover=True (default) this lists open PRs with no label filter
+    (single page, limit=50) so the queue is self-sustaining — a ready PR needs
+    no human/agent to add QUEUE_LABEL first. With auto_discover=False it falls
+    back to the legacy label-filtered listing (opt-IN). Opt-out filtering and
+    draft-skipping happen in choose_candidate_issues, not here.
+    """
+    if not auto_discover:
+        return list_queued_issues()
+    _, body = api(
+        "GET",
+        f"/repos/{OWNER}/{NAME}/issues",
+        query={
+            "state": "open",
+            "type": "pulls",
+            "limit": "50",
+        },
+    )
+    if not isinstance(body, list):
+        raise ApiError("candidate issues response not list")
+    return body
+
+
 def get_pull(pr_number: int) -> dict:
    _, body = api("GET", f"/repos/{OWNER}/{NAME}/pulls/{pr_number}")
    if not isinstance(body, dict):
@@ -354,30 +766,97 @@ def update_pull(pr_number: int, *, dry_run: bool) -> None:
    print(f"::notice::updating PR #{pr_number} with base branch via style={UPDATE_STYLE}")
    if dry_run:
        return
+    try:
+        api(
+            "POST",
+            f"/repos/{OWNER}/{NAME}/pulls/{pr_number}/update",
+            query={"style": UPDATE_STYLE},
+            expect_json=False,
+        )
+    except ApiError as exc:
+        # Gitea returns HTTP 409 when the base cannot be merged into the PR
+        # branch because of a real conflict. The queue cannot auto-resolve a
+        # conflict, so re-raise as BranchUpdateConflictError; process_once HOLDs
+        # the PR and advances (HOL guard) instead of retrying it forever.
+        # Match the HTTP STATUS token ("-> HTTP 409") specifically, not a bare
+        # "409" substring — the PR number or path can itself contain "409"
+        # (e.g. /pulls/1409/update) and must not be misread as a conflict.
+        if "-> HTTP 409" in str(exc):
+            raise BranchUpdateConflictError(str(exc)) from exc
+        raise  # re-raise other ApiErrors unchanged
+
+
+def add_label_by_name(pr_number: int, label_name: str, *, dry_run: bool) -> None:
+    """Apply an existing repo label (by name) to a PR/issue.
+
+    Used to HOLD a wedged PR so the queue advances. Resolves the label id from
+    the repo label set; if the label does not exist, raises ApiError (the
+    caller decides whether that is fatal).
+    """
+    print(f"::notice::applying label '{label_name}' to PR #{pr_number}")
+    if dry_run:
+        return
+    _, labels = api("GET", f"/repos/{OWNER}/{NAME}/labels", query={"limit": "100"})
+    label_id = None
+    if isinstance(labels, list):
+        for label in labels:
+            if isinstance(label, dict) and label.get("name") == label_name:
+                label_id = label.get("id")
+                break
+    if label_id is None:
+        raise ApiError(f"label '{label_name}' not found in repo {OWNER}/{NAME}")
    api(
        "POST",
-        f"/repos/{OWNER}/{NAME}/pulls/{pr_number}/update",
-        query={"style": UPDATE_STYLE},
-        expect_json=False,
+        f"/repos/{OWNER}/{NAME}/issues/{pr_number}/labels",
+        body={"labels": [label_id]},
    )


-def merge_pull(pr_number: int, *, dry_run: bool) -> None:
-    payload = {
+def hold_pr(pr_number: int, hold_note: str, *, dry_run: bool) -> None:
+    """Apply HOLD_LABEL to a wedged PR so the queue advances past it.
+
+    choose_candidate_issues skips HOLD_LABEL-bearing PRs, so this is the HOL
+    guard: a PR the queue cannot make progress on (permanent permission error
+    or unresolvable branch-update conflict) is held and a human/agent fixes it,
+    rather than the queue re-selecting it every tick forever. If the label
+    cannot be applied we still post the explanatory comment so the wedge is at
+    least visible — but we never loop on the PR.
+    """
+    try:
+        add_label_by_name(pr_number, HOLD_LABEL, dry_run=dry_run)
+    except ApiError as label_exc:
+        sys.stderr.write(
+            f"::error::could not apply HOLD_LABEL to PR #{pr_number}: {label_exc}\n"
+        )
+        hold_note += (
+            f"\n\n(NOTE: could not apply the hold label automatically: "
+            f"{label_exc}. Please add `{HOLD_LABEL}` manually.)"
+        )
+    post_comment(pr_number, hold_note, dry_run=dry_run)
+
+
+def merge_pull(pr_number: int, *, dry_run: bool, force: bool = False) -> None:
+    payload: dict[str, Any] = {
        "Do": "merge",
        "MergeTitleField": f"Merge PR #{pr_number} via Gitea merge queue",
        "MergeMessageField": (
            "Serialized merge by gitea-merge-queue after current-main, "
-            "SOP, and required CI checks were green."
+            "genuine approvals, and required CI checks were green."
        ),
    }
-    print(f"::notice::merging PR #{pr_number}")
+    if force:
+        # force_merge bypasses ONLY missing-but-non-required governance
+        # contexts. The caller has already verified required contexts are green
+        # and genuine approvals are present, so this never bypasses a failing
+        # required context or an approval shortfall.
+        payload["force_merge"] = True
+    print(f"::notice::merging PR #{pr_number}{' (force_merge: non-required reds)' if force else ''}")
    if dry_run:
        return
    try:
        api("POST", f"/repos/{OWNER}/{NAME}/pulls/{pr_number}/merge", body=payload, expect_json=False)
    except ApiError as exc:
-        # Re-raise permission-like errors so process_once can skip this PR.
+        # Re-raise permission-like errors so process_once can HOLD this PR.
        # 403 = no push access, 404 = repo/pr not found, 405 = not allowed.
        msg = str(exc)
        for code in ("403", "404", "405"):
@@ -387,7 +866,25 @@ def merge_pull(pr_number: int, *, dry_run: bool) -> None:


 def process_once(*, dry_run: bool = False) -> int:
-    contexts = required_contexts(REQUIRED_CONTEXTS_RAW)
+    # Required status contexts come from BRANCH PROTECTION, not a hand-kept env
+    # list. Fail-closed: if BP cannot be enumerated, HOLD the whole tick rather
+    # than merge against an unverified required set.
+    try:
+        bp = get_branch_protection(WATCH_BRANCH)
+    except BranchProtectionUnavailable as exc:
+        sys.stderr.write(
+            f"::error::queue held: branch protection for {WATCH_BRANCH} "
+            f"unavailable (fail-closed): {exc}\n"
+        )
+        return 0
+    contexts = bp.required_contexts
+    required_approvals = bp.required_approvals
+    print(
+        f"::notice::queue policy from branch protection: "
+        f"required_approvals={required_approvals} "
+        f"required_contexts={contexts or '[none]'}"
+    )
+
    main_sha = get_branch_head(WATCH_BRANCH)
    main_status = get_combined_status(main_sha)
    # Check push-required contexts explicitly instead of combined state.
@@ -398,83 +895,199 @@ def process_once(*, dry_run: bool = False) -> int:
        print(f"::notice::queue paused: {WATCH_BRANCH}@{main_sha[:8]} required contexts not green: {', '.join(main_bad)}")
        return 0

-    issue = choose_next_queued_issue(
-        list_queued_issues(),
+    candidates = choose_candidate_issues(
+        list_candidate_issues(auto_discover=AUTO_DISCOVER),
        queue_label=QUEUE_LABEL,
-        hold_label=HOLD_LABEL,
+        opt_out_labels=OPT_OUT_LABELS,
+        auto_discover=AUTO_DISCOVER,
    )
-    if not issue:
-        print("::notice::merge queue empty")
+    if not candidates:
+        print(
+            "::notice::no merge candidates "
+            f"(auto_discover={'on' if AUTO_DISCOVER else 'off'})"
+        )
        return 0

+    # HOL fix: SCAN THROUGH the FIFO candidate list until a PR we can ACT on is
+    # found, instead of locking on the oldest and waiting. A non-ready candidate
+    # (decision.action == "wait": REQUEST_CHANGES, mergeable!=True, insufficient
+    # genuine approvals, or red required CI) is SKIPPED — it must NOT head-of-line
+    # block the newer ready PRs behind it. The merge bar is unchanged: a skipped
+    # PR is never merged, and the first ACTIONABLE candidate (an "update" that
+    # advances a stale branch, or a fully-ready "merge") terminates the scan.
+    #
+    # `update` is treated as actionable, not skippable: a PR whose head merely
+    # lacks current main is in a legitimate in-progress state (updating it +
+    # rerunning CI moves it toward ready), unlike a PR that can never become
+    # ready without a human (RC / conflict), which is a `wait` and gets skipped.
+    for issue in candidates:
+        decision, ctx = _evaluate_candidate(
+            issue,
+            main_sha=main_sha,
+            main_status=main_status,
+            required_contexts=contexts,
+            required_approvals=required_approvals,
+            dry_run=dry_run,
+        )
+        if decision is None:
+            continue  # not merge-eligible (not-open / opted-out / draft / fork / wrong base)
+        pr_number = ctx["pr_number"]
+        print(f"::notice::PR #{pr_number} decision={decision.action}: {decision.reason}")
+        if decision.action == "wait":
+            # Non-ready: skip to the next candidate (no HOL block, no merge).
+            continue
+        if decision.action == "update":
+            try:
+                update_pull(pr_number, dry_run=dry_run)
+            except BranchUpdateConflictError as exc:
+                # The branch cannot be updated with main because of a real
+                # conflict (HTTP 409 from /update). This is the #2352 HOL guard:
+                # a conflict will not self-resolve without a human/agent rebase,
+                # so re-attempting the update every tick would head-of-line block
+                # every ready PR behind it. HOLD this PR (apply HOLD_LABEL, which
+                # is an opt-out label so later ticks skip it) and CONTINUE the
+                # scan so a newer ready PR can still merge this tick. Fail-closed:
+                # a held PR is skipped, never merged.
+                sys.stderr.write(
+                    f"::error::branch-update conflict for PR #{pr_number}: {exc}\n"
+                )
+                hold_note = (
+                    "merge-queue: could not update this branch with "
+                    f"`{WATCH_BRANCH}` — the update returned a merge conflict "
+                    f"(HTTP 409) that the queue cannot auto-resolve ({exc}). "
+                    f"Applied `{HOLD_LABEL}` to unblock the queue (HOL guard). "
+                    f"Fix: rebase/merge `{WATCH_BRANCH}` into this branch and "
+                    f"resolve the conflicts, then remove `{HOLD_LABEL}` to requeue."
+                )
+                hold_pr(pr_number, hold_note, dry_run=dry_run)
+                continue  # held — keep scanning for a mergeable candidate
+            post_comment(
+                pr_number,
+                (
+                    f"merge-queue: updated this branch with `{WATCH_BRANCH}` at "
+                    f"`{main_sha[:12]}`. Waiting for CI on the refreshed head."
+                ),
+                dry_run=dry_run,
+            )
+            return 0
+        if decision.ready:
+            latest_main_sha = get_branch_head(WATCH_BRANCH)
+            if latest_main_sha != main_sha:
+                print(
+                    f"::notice::main moved {main_sha[:8]} -> {latest_main_sha[:8]}; "
+                    "deferring to next tick"
+                )
+                return 0
+            try:
+                merge_pull(pr_number, dry_run=dry_run, force=decision.force)
+            except MergePermissionError as exc:
+                # Permanent merge failure (HTTP 403/404/405). HOLD this PR by
+                # applying HOLD_LABEL (it becomes an opt-out label, so subsequent
+                # ticks skip it) and CONTINUE scanning so the queue still advances
+                # to the next ready PR this tick rather than stalling.
+                sys.stderr.write(f"::error::merge permission error for PR #{pr_number}: {exc}\n")
+                hold_note = (
+                    "merge-queue: merge failed with a permanent permission error "
+                    f"({exc}). No available token has Can-merge permission for this "
+                    f"PR. Applied `{HOLD_LABEL}` to unblock the queue (HOL guard). "
+                    f"Fix: grant Can-merge to the queue token, then remove "
+                    f"`{HOLD_LABEL}` to requeue."
+                )
+                try:
+                    add_label_by_name(pr_number, HOLD_LABEL, dry_run=dry_run)
+                except ApiError as label_exc:
+                    # If we cannot even apply the hold label, fall back to a comment
+                    # so the wedge is at least visible; do NOT loop on this PR.
+                    sys.stderr.write(
+                        f"::error::could not apply HOLD_LABEL to PR #{pr_number}: {label_exc}\n"
+                    )
+                    hold_note += (
+                        f"\n\n(NOTE: could not apply the hold label automatically: "
+                        f"{label_exc}. Please add `{HOLD_LABEL}` manually.)"
+                    )
+                post_comment(pr_number, hold_note, dry_run=dry_run)
+                continue  # held — keep scanning for a mergeable candidate
+            return 0
+    return 0
+
+
+def _evaluate_candidate(
+    issue: dict,
+    *,
+    main_sha: str,
+    main_status: dict,
+    required_contexts: list[str],
+    required_approvals: int,
+    dry_run: bool,
+) -> tuple[MergeDecision | None, dict]:
+    """Evaluate a single auto-discovered candidate against the full merge bar.
+
+    Returns (decision, ctx) where ctx carries {"pr_number"}. A None decision
+    means the PR is not merge-eligible at all (not open / opted-out / draft /
+    fork / wrong base) and the caller should skip to the next candidate; for
+    fork / wrong-base the explanatory comment is posted here before returning.
+
+    The merge bar is UNCHANGED from the single-PR path — this only factors the
+    per-PR evaluation out so process_once can scan multiple candidates. A failed
+    status fetch still raises (fail-closed): it propagates to the caller so the
+    PR is never treated as green.
+    """
    pr_number = int(issue["number"])
+    ctx = {"pr_number": pr_number}
    pr = get_pull(pr_number)
    if pr.get("state") != "open":
        print(f"::notice::PR #{pr_number} is not open; skipping")
-        return 0
+        return None, ctx
+    # Defensive opt-out/draft re-check on the authoritative pull payload: the
+    # /issues listing's label/draft view can lag, but the merge bar must respect
+    # the live pull state. (choose_candidate_issues already filtered on the
+    # listing; this guards against a stale listing racing a just-added opt-out.)
+    if OPT_OUT_LABELS & label_names(pr):
+        print(f"::notice::PR #{pr_number} carries an opt-out label; skipping")
+        return None, ctx
+    if pr.get("draft") is True:
+        print(f"::notice::PR #{pr_number} is a draft; skipping")
+        return None, ctx
    if pr.get("base", {}).get("ref") != WATCH_BRANCH:
        post_comment(pr_number, f"merge-queue: skipped; base branch is not `{WATCH_BRANCH}`.", dry_run=dry_run)
-        return 0
+        return None, ctx
    if pr.get("head", {}).get("repo_id") != pr.get("base", {}).get("repo_id"):
        post_comment(pr_number, "merge-queue: skipped; fork PRs are not supported by the serialized queue.", dry_run=dry_run)
-        return 0
+        return None, ctx

    head_sha = pr.get("head", {}).get("sha")
    if not isinstance(head_sha, str) or len(head_sha) < 7:
        raise ApiError(f"PR #{pr_number} missing head sha")
    commits = get_pull_commits(pr_number)
    current_base = pr_has_current_base(pr, commits, main_sha)
+    # Fail-closed: a failed status fetch raises here and propagates (the PR is
+    # never treated as green).
    pr_status = get_combined_status(head_sha)
    pr_labels = label_names(pr)
+    # FAIL-CLOSED: Gitea returns mergeable=None (or omits the field) while it is
+    # still COMPUTING conflict state. Only the literal True is decisive proof the
+    # PR is conflict-free; None and False both mean "not (yet) mergeable". We must
+    # NOT autonomously merge on an unknown — treat anything but True as not-yet-
+    # mergeable so evaluate_merge_readiness returns a "wait" decision.
+    mergeable = pr.get("mergeable") is True
+
+    reviews = get_pull_reviews(pr_number)
+    approvers, request_changes = genuine_approvals(
+        reviews, head_sha=head_sha, reviewer_set=REVIEWER_SET
+    )
+
    decision = evaluate_merge_readiness(
        main_status=main_status,
        pr_status=pr_status,
-        required_contexts=contexts,
+        required_contexts=required_contexts,
+        required_approvals=required_approvals,
+        approvers=approvers,
+        request_changes=request_changes,
        pr_has_current_base=current_base,
+        mergeable=mergeable,
        pr_labels=pr_labels,
    )
-
-    print(f"::notice::PR #{pr_number} decision={decision.action}: {decision.reason}")
-    if decision.action == "update":
-        update_pull(pr_number, dry_run=dry_run)
-        post_comment(
-            pr_number,
-            (
-                f"merge-queue: updated this branch with `{WATCH_BRANCH}` at "
-                f"`{main_sha[:12]}`. Waiting for CI on the refreshed head."
-            ),
-            dry_run=dry_run,
-        )
-        return 0
-    if decision.ready:
-        latest_main_sha = get_branch_head(WATCH_BRANCH)
-        if latest_main_sha != main_sha:
-            print(
-                f"::notice::main moved {main_sha[:8]} -> {latest_main_sha[:8]}; "
-                "deferring to next tick"
-            )
-            return 0
-        try:
-            merge_pull(pr_number, dry_run=dry_run)
-        except MergePermissionError as exc:
-            # Permanent merge failure (HTTP 403/404/405). Post a comment so
-            # maintainers know why, then return 0 so this tick is done.
-            # The PR stays in the queue; future ticks can retry after the
-            # permission issue is resolved.
-            sys.stderr.write(f"::error::merge permission error for PR #{pr_number}: {exc}\n")
-            post_comment(
-                pr_number,
-                (
-                    "merge-queue: merge failed with HTTP 405 'User not allowed to merge PR'. "
-                    "No available token has Can-merge permission on this repo. "
-                    "Fix: grant Can-merge to a token, or add a maintain/admin collaborator. "
-                    "Skipping to next queued PR on next tick."
-                ),
-                dry_run=dry_run,
-            )
-            return 0
-        return 0
-    return 0
+    return decision, ctx


 def main() -> int:
@@ -40,20 +40,24 @@ Context-format note (Gitea 1.22.6):

 Exit codes:
  0 — no required workflow has a paths/paths-ignore filter (clean) OR
-      branch_protections endpoint returned 403/404 (token-scope issue;
-      surfaced via ::error:: but non-fatal so a missing scope doesn't
-      red-X every PR — fix the token, not the lint).
+      branch_protections returned an authenticated 404 (branch
+      genuinely has no protection; ::warning:: surfaced).
  1 — at least one required workflow has a paths/paths-ignore filter
      (the gate-degrading defect class).
  2 — env contract violation (missing GITEA_TOKEN/HOST/REPO/BRANCH).
  3 — workflows directory missing or workflow YAML unparseable.
-  4 — protection response shape unexpected (non-dict body on 2xx).
+  4 — FAIL-CLOSED verification failure: branch_protections 401/403
+      auth failure (token can't read BP), 5xx transient (propagated
+      ApiError), or unexpected response shape. This is a HARD gate on
+      a protected context — it MUST NOT green when it cannot verify.

 Auth note: `GET /repos/.../branch_protections/{branch}` requires
 repo-admin role in Gitea 1.22.6. The workflow-default `GITHUB_TOKEN`
 is non-admin; we re-use `DRIFT_BOT_TOKEN` (same persona that powers
-ci-required-drift.yml). If `DRIFT_BOT_TOKEN` is unavailable in a future
-context, the script falls through gracefully (exit 0 + ::error::).
+ci-required-drift.yml). A 401/403 from a missing-scope token is an
+AUTH FAILURE that FAILS CLOSED (exit 4) — fix the token, not the
+lint. Only an authenticated 404 (genuinely-absent protection) is a
+tolerated graceful skip.
 """
 from __future__ import annotations

@@ -309,14 +313,36 @@ def run() -> int:
        msg = str(e)
        m = re.search(r"HTTP (\d{3})", msg)
        http_status = int(m.group(1)) if m else None
-        if http_status in (403, 404):
+        # FAIL-CLOSED contract (was fail-open: 403 AND 404 both exit 0 —
+        # fixed). This is a HARD gate (continue-on-error: false) on a
+        # PROTECTED context: pull_request (same-repo; fork PRs can't carry
+        # DRIFT_BOT_TOKEN) + workflow_dispatch. We split auth-failure from
+        # genuinely-absent:
+        #   401/403 → AUTH FAILURE: the token cannot read branch
+        #     protections, so we CANNOT enumerate the required-check set
+        #     and CANNOT verify the no-paths-filter invariant. Fail loud /
+        #     fail closed (exit 4) — do NOT green an unverifiable gate.
+        #   404 → authenticated absent resource: branch genuinely has no
+        #     protection. Nothing to enumerate; tolerated degradation,
+        #     surfaced loudly (exit 0 with ::warning::).
+        if http_status in (401, 403):
            sys.stderr.write(
-                f"::error::GET {protection_path} returned HTTP {http_status} — "
-                f"DRIFT_BOT_TOKEN lacks repo-admin scope (Gitea 1.22.6 "
-                f"requires it for this endpoint) OR branch '{BRANCH}' has "
-                f"no protection configured. Cannot enumerate required "
-                f"checks; skipping lint with exit 0 to avoid red-X on "
-                f"every PR. Fix: grant repo-admin to mc-drift-bot.\n"
+                f"::error::GET {protection_path} returned HTTP "
+                f"{http_status} — DRIFT_BOT_TOKEN cannot read branch "
+                f"protections (needs repo-admin scope). AUTH FAILURE: "
+                f"cannot enumerate required checks, so this lint FAILS "
+                f"CLOSED rather than greening a gate it could not verify. "
+                f"Fix: grant repo-admin to mc-drift-bot (org team "
+                f"`drift-bot`, perm=admin) — fix the token, not the lint.\n"
+            )
+            return 4
+        if http_status == 404:
+            sys.stderr.write(
+                f"::warning::GET {protection_path} returned HTTP 404 — "
+                f"branch '{BRANCH}' has no protection configured "
+                f"(authenticated absent resource). No required contexts to "
+                f"check. If '{BRANCH}' SHOULD be protected, this is a real "
+                f"finding.\n"
            )
            return 0
        raise
@@ -36,7 +36,8 @@ Daily scheduled run + workflow_dispatch:

  1. GET `branch_protections/{BRANCH}` (needs DRIFT_BOT_TOKEN with
     repo-admin scope; same persona as ci-required-drift.yml).
-     Graceful-degrade on 403/404 per Tier 2a contract.
+     FAIL CLOSED on 401/403 (auth failure → exit 2); a genuine
+     authenticated 404 (no protection) is a loud ::warning:: skip.

  2. Walk `.gitea/workflows/*.yml` via PyYAML AST. For each workflow,
     enumerate its emitted contexts: `{workflow.name} / {job.name or
@@ -59,10 +60,14 @@ Daily scheduled run + workflow_dispatch:

 Exit codes
 ----------
-  0 — clean OR API 403/404 (graceful-degrade, surfaces ::error::).
+  0 — clean, OR an authenticated 404 (branch genuinely has no
+      protection — surfaces ::warning::, not a fail-open).
  1 — at least one BP context has no emitter.
-  2 — env contract violation, workflows-dir missing, or YAML parse
-      error.
+  2 — env contract violation, workflows-dir missing, YAML parse
+      error, OR a fail-closed verification failure: 401/403 auth
+      failure (token can't read BP) or transient/unexpected API
+      error. This is a HARD gate on a protected context (schedule/
+      dispatch on main) — it MUST NOT green when it cannot verify.

 Env
 ---
@@ -394,28 +399,49 @@ def run() -> int:
        return 2

    # 1. Pull BP.
+    #
+    # FAIL-CLOSED contract (was fail-open with exit 0 — fixed). This lint
+    # is a HARD gate (continue-on-error: false) and only ever runs on a
+    # PROTECTED context: schedule + workflow_dispatch on `main`. There is
+    # NO fork/advisory split here — the DRIFT_BOT_TOKEN secret is always
+    # present and trusted, so an auth failure or transient error is a real
+    # inability-to-verify, not a legitimate degradation. We MUST fail loud
+    # (`::error::` + nonzero) rather than green a gate we could not check.
    status, bp = api("GET", f"/repos/{repo}/branch_protections/{branch}")
    if status == "forbidden":
        sys.stderr.write(
-            f"::error::GET branch_protections/{branch} returned HTTP 403 — "
-            f"DRIFT_BOT_TOKEN lacks repo-admin scope (Gitea 1.22.6 requires "
-            f"it for this endpoint). Skipping lint with exit 0 to avoid "
-            f"red-X on every run. Fix: grant repo-admin to mc-drift-bot. "
-            f"Per Tier 2a contract.\n"
+            f"::error::GET branch_protections/{branch} returned HTTP "
+            f"401/403 — DRIFT_BOT_TOKEN cannot read branch protections "
+            f"(needs repo-admin scope; Gitea requires it for this "
+            f"endpoint). This is an AUTH FAILURE, not an absent resource: "
+            f"the lint CANNOT verify the BP↔emitter invariant, so it FAILS "
+            f"CLOSED instead of greening a gate it could not check. Fix: "
+            f"grant repo-admin to mc-drift-bot (org team `drift-bot`, "
+            f"perm=admin) — fix the token, not the lint.\n"
        )
-        return 0
+        return 2
    if status == "not_found":
+        # Genuine 404 WITH a valid token = branch has no protection
+        # configured. On `main` this is itself suspicious (main should
+        # always be protected) but it is a real, authenticated read of an
+        # absent resource — not an auth failure — so we surface it loudly
+        # but do not hard-fail on the genuinely-absent case.
        print(
-            f"::notice::branch '{branch}' has no protection configured; "
-            f"nothing to lint."
+            f"::warning::branch '{branch}' has no protection configured "
+            f"(authenticated 404); nothing to lint. If '{branch}' SHOULD be "
+            f"protected, this is a real finding — configure branch "
+            f"protection."
        )
        return 0
    if status != "ok" or not isinstance(bp, dict):
        sys.stderr.write(
-            f"::error::branch_protections/{branch} response unexpected; "
-            f"status={status}. Treating as transient; exit 0.\n"
+            f"::error::branch_protections/{branch} read failed with "
+            f"status={status} (transient/unexpected). The lint CANNOT "
+            f"verify the BP↔emitter invariant on this run; FAILING CLOSED "
+            f"rather than greening unverified. Re-run; if it persists, "
+            f"investigate Gitea API health / token validity.\n"
        )
-        return 0
+        return 2

    bp_contexts: list[str] = list(bp.get("status_check_contexts") or [])
    if not bp_contexts:
@@ -57,10 +57,14 @@ comment unrelated to the new job.
 Exit codes
 ----------
  0 — no new emissions, all new emissions have valid directives,
-      or BP read errored (graceful-degrade per Tier 2a contract).
+      OR an authenticated 404 (branch genuinely has no protection
+      to verify against — surfaces ::warning::, not a fail-open).
  1 — at least one new emission lacks a directive, or has
      `bp-required: yes` but the context is missing from BP.
-  2 — env contract violation or YAML parse error.
+  2 — env contract violation, YAML parse error, OR a fail-closed
+      verification failure: 401/403 auth failure (token can't read
+      BP) or transient/unexpected API error. HARD gate on a
+      same-repo PR context — MUST NOT green when it cannot verify.

 Env
 ---
@@ -420,33 +424,51 @@ def run() -> int:
        return 0

    # Step 3 — fetch BP context list.
+    #
+    # FAIL-CLOSED contract (was fail-open with exit 0 — fixed). This is a
+    # HARD gate (continue-on-error: false) that runs on `pull_request`
+    # against `main`. On molecule-core, `pull_request` runs are same-repo
+    # (fork PRs cannot carry the DRIFT_BOT_TOKEN secret), so this is a
+    # PROTECTED/trusted context with no legitimate fork-degradation. An
+    # auth failure or transient error means we CANNOT verify a NEW
+    # bp-required emission is actually in BP — so we MUST fail loud rather
+    # than green the gate. (A genuinely-absent 404 read with a valid token
+    # is the one tolerated degradation: there is no BP to check against.)
    status, bp = api("GET", f"/repos/{repo}/branch_protections/{branch}")
    bp_contexts: set[str] = set()
    if status == "forbidden":
        sys.stderr.write(
-            f"::error::GET branch_protections/{branch} returned HTTP 403 — "
-            f"DRIFT_BOT_TOKEN lacks repo-admin scope. Cannot verify "
-            f"bp-required directives; skipping lint with exit 0 per "
-            f"Tier 2a contract. Fix the token, not the lint.\n"
+            f"::error::GET branch_protections/{branch} returned HTTP "
+            f"401/403 — DRIFT_BOT_TOKEN cannot read branch protections "
+            f"(needs repo-admin scope). This is an AUTH FAILURE: the lint "
+            f"CANNOT verify the bp-required directives on this PR, so it "
+            f"FAILS CLOSED instead of greening unverified. Fix: grant "
+            f"repo-admin to mc-drift-bot (org team `drift-bot`) — fix the "
+            f"token, not the lint.\n"
        )
-        return 0
+        return 2
    elif status == "not_found":
-        # Branch has no protection — nothing to verify against; the
-        # bp-required: yes directive can't be satisfied. Treat as
-        # graceful-skip rather than red-X.
+        # Authenticated 404 — branch genuinely has no protection. There is
+        # nothing to verify a `bp-required: yes` directive against, so this
+        # is the one tolerated degradation. Surface loudly (on `main` a
+        # missing protection is itself a real finding) but do not hard-fail.
        print(
-            f"::notice::branch '{branch}' has no protection; cannot verify "
-            f"bp-required directives. Skipping (exit 0)."
+            f"::warning::branch '{branch}' has no protection (authenticated "
+            f"404); cannot verify bp-required directives. If '{branch}' "
+            f"SHOULD be protected this is a real finding."
        )
        return 0
    elif status == "ok" and isinstance(bp, dict):
        bp_contexts = set(bp.get("status_check_contexts") or [])
    else:
        sys.stderr.write(
-            f"::error::branch_protections/{branch} response unexpected; "
-            f"status={status}. Treating as transient; exit 0.\n"
+            f"::error::branch_protections/{branch} read failed with "
+            f"status={status} (transient/unexpected). CANNOT verify "
+            f"bp-required directives on this PR; FAILING CLOSED rather than "
+            f"greening unverified. Re-run; if persistent, check Gitea API "
+            f"health / token validity.\n"
        )
-        return 0
+        return 2

    # Step 4 — validate each new emission's directive.
    violations: list[str] = []
@@ -174,6 +174,16 @@ def parse_directives(
        if not parts:
            continue
        first = parts[0]
+        # Em-dash (U+2014) is a common visual separator in user-written
+        # notes, e.g.  /sop-ack Five-Axis — five-axis-review
+        # If raw_slug contains an em-dash, split on the first one so
+        # the part before becomes the slug and the rest becomes the note.
+        note_from_slug = ""
+        slug_source = raw_slug
+        emdash_idx = raw_slug.find("—")
+        if emdash_idx != -1:
+            slug_source = raw_slug[:emdash_idx].strip()
+            note_from_slug = raw_slug[emdash_idx + 1 :].strip()
        # If the slug-capture greedily matched multiple words (e.g.
        # "comprehensive testing"), preserve normalize behavior: join
        # the WHOLE first-word-token only; trailing words get appended to
@@ -186,13 +196,19 @@ def parse_directives(
            # as slug and "testing extra-note" as note. We defer the
            # disambiguation to the caller via the returned canonical
            # slug. For simplicity: try the WHOLE captured string first.
-            canonical = normalize_slug(raw_slug, numeric_aliases)
+            canonical = normalize_slug(slug_source, numeric_aliases)
        else:
-            canonical = normalize_slug(first, numeric_aliases)
+            canonical = normalize_slug(slug_source, numeric_aliases)
        note_from_group = (m.group(3) or "").strip()
-        # If we collapsed multi-word slug into kebab and there's a
-        # trailing-text group too, append it.
-        entry = (kind, canonical, note_from_group)
+        # The em-dash (U+2014) is a visual separator; the regex puts it
+        # in group(3) because it is outside the slug character class.
+        # Strip it so "/sop-ack slug — note" yields just "note".
+        if note_from_group.startswith("—"):
+            note_from_group = note_from_group[1:].strip()
+        # Combine note_from_slug (em-dash split) with note_from_group
+        # (trailing text after the slug captured by the regex group).
+        combined_note = (note_from_slug + " " + note_from_group).strip()
+        entry = (kind, canonical, combined_note)
        if kind == "sop-n/a":
            na_directives.append(entry)
        else:
@@ -48,7 +48,6 @@ set -euo pipefail
 # workflow-level jq install can fail on runners with network restrictions
 # (GitHub releases not reachable from some runner networks — infra#241
 # follow-up). This fallback is idempotent — no-op when jq is already on PATH.
-# SOP_FAIL_OPEN=1 makes this always exit 0 so CI never blocks on jq absence.
 if ! command -v jq >/dev/null 2>&1; then
  echo "::notice::jq not found on PATH — attempting install..."
  _jq_installed="no"
@@ -67,12 +66,6 @@ if ! command -v jq >/dev/null 2>&1; then
  if ! command -v jq >/dev/null 2>&1; then
    echo "::error::jq installation failed — apt-get and GitHub binary both failed."
    echo "::error::sop-tier-check requires jq for all JSON API parsing."
-    # SOP_FAIL_OPEN=1 is set in the workflow step's env — makes script always
-    # exit 0 so CI never blocks. The SOP-6 tier review gate remains enforced.
-    if [ "${SOP_FAIL_OPEN:-}" = "1" ]; then
-      echo "::warning::SOP_FAIL_OPEN=1 — exiting 0 so CI does not block."
-      exit 0
-    fi
    exit 1
  fi
 fi
@@ -101,15 +94,10 @@ echo "::notice::tier-check start: repo=$OWNER/$NAME pr=$PR_NUMBER author=$PR_AUT
 # cause the script to exit prematurely when the token is empty/invalid — the
 # if check below handles that case gracefully. Without || true, a 401 from an
 # empty/invalid token causes jq to exit 1, triggering set -e and exiting the
-# entire script before SOP_FAIL_OPEN can be evaluated (the check is in the jq-
-# install block; if jq is already on PATH, that block is skipped entirely).
+# entire script before the error can be logged.
 WHOAMI=$(curl -sS -H "$AUTH" "${API}/user" | jq -r '.login // ""') || true
 if [ -z "$WHOAMI" ]; then
  echo "::error::GITEA_TOKEN cannot resolve a user via /api/v1/user — check the token scope and that the secret is wired correctly."
-  if [ "${SOP_FAIL_OPEN:-}" = "1" ]; then
-    echo "::warning::SOP_FAIL_OPEN=1 — exiting 0 so CI does not block."
-    exit 0
-  fi
  exit 1
 fi
 echo "::notice::token resolves to user: $WHOAMI"
@@ -119,10 +107,6 @@ echo "::notice::token resolves to user: $WHOAMI"
 HEAD_SHA=$(curl -sS -H "$AUTH" "${API}/repos/${OWNER}/${NAME}/pulls/${PR_NUMBER}" | jq -r '.head.sha // ""') || true
 if [ -z "$HEAD_SHA" ]; then
  echo "::error::Failed to fetch PR head SHA — token may be invalid."
-  if [ "${SOP_FAIL_OPEN:-}" = "1" ]; then
-    echo "::warning::SOP_FAIL_OPEN=1 — exiting 0 so CI does not block."
-    exit 0
-  fi
  exit 1
 fi
 debug "pr-head-sha=$HEAD_SHA"
@@ -215,10 +199,6 @@ if [ "${SOP_DEBUG:-}" = "1" ]; then
 fi
 if [ "$_HTTP_EXIT" -ne 0 ] || [ "$HTTP_CODE" != "200" ]; then
  echo "::error::GET /orgs/${OWNER}/teams failed (curl exit=$_HTTP_EXIT HTTP=$HTTP_CODE) — token may lack read:org scope or be invalid."
-  if [ "${SOP_FAIL_OPEN:-}" = "1" ]; then
-    echo "::warning::SOP_FAIL_OPEN=1 — exiting 0 so CI does not block."
-    exit 0
-  fi
  exit 1
 fi

@@ -265,17 +245,13 @@ done

 # 5. Read approving reviewers. set +e disables set -e temporarily so that curl
 # failures (e.g. empty/invalid token → HTTP 401) do not abort the script before
-# SOP_FAIL_OPEN is evaluated. set -e is restored immediately after.
+# the failure can be reported. set -e is restored immediately after.
 set +e
 REVIEWS=$(curl -sS -H "$AUTH" "${API}/repos/${OWNER}/${NAME}/pulls/${PR_NUMBER}/reviews")
 _REVIEWS_EXIT=$?
 set -e
 if [ $_REVIEWS_EXIT -ne 0 ] || [ -z "$REVIEWS" ]; then
  echo "::error::Failed to fetch reviews (curl exit=$_REVIEWS_EXIT) — token may be invalid or unreachable."
-  if [ "${SOP_FAIL_OPEN:-}" = "1" ]; then
-    echo "::warning::SOP_FAIL_OPEN=1 — exiting 0 so CI does not block."
-    exit 0
-  fi
  exit 1
 fi
 APPROVERS=$(echo "$REVIEWS" | jq -r --arg head_sha "$HEAD_SHA" '[.[] | select(.state=="APPROVED" and .commit_id == $head_sha) | .user.login] | unique | .[]') || true
@@ -290,48 +266,75 @@ debug "approvers: $(echo "$APPROVERS" | tr '\n' ' ')"
 # Pre/post spaces ensure case patterns *${_t}* match even when the name
 # is the first or last entry (bash case *word* needs delimiters on both sides).
 #
-# FALLBACK: if ALL team probes return 403 (token lacks read:org scope),
-# fall back to /orgs/{org}/members/{user}. This returns 204 for any org
-# member — a superset of team membership. Accepting it as a fallback means
-# the gate passes when the token is scoped to repo+user only (core-bot PAT).
-# This is safe because: (a) org membership is a prerequisite for every
-# eligible team; (b) the AND-composition of internal#189 still requires
-# multiple independent approvers; (c) any token with read:repository can
-# see the approving reviews, so bypass requires a colluding approver.
+# FAIL-CLOSED AUTHORIZATION (security: SOP tier gate is an AUTHORIZATION gate).
+#
+# This used to fall back to /orgs/{org}/members/{user} whenever every team
+# probe failed and credit any org member as a member of EVERY queried team.
+# That was a privilege-escalation: org membership is NOT team membership, so
+# a 403/visibility/token-scope gap on the team probes silently promoted a
+# plain org member to satisfy tier:high (ceo). An inability-to-verify became
+# an authorization GRANT. The fallback is REMOVED — org membership must never
+# satisfy a team-gated tier.
+#
+# A team-membership probe has exactly three meaningful outcomes:
+#   200 / 204  → the user IS a member of that team       (credit it)
+#   404        → the user is definitively NOT a member    (no credit, verified)
+#   anything else (403 / 401 / 5xx / curl failure / non-numeric)
+#              → membership CANNOT be read                 (cannot-verify)
+#
+# Per the dev-sop fail-closed rule (inability-to-verify = failure, never a
+# pass — and here, never an authorization grant), a cannot-verify outcome on
+# ANY probe is a HARD infra failure: we publish a loud cannot-verify error and
+# exit non-zero. We do NOT proceed to evaluate the tier expression on a partial
+# / unverifiable membership picture, because doing so could let an unverifiable
+# approver's clause silently fail-or-pass on incomplete data. Fix the token
+# scope (read:organization) or the runner network — not the gate.
 declare -A APPROVER_TEAMS
+_verify_failed=""   # accumulates "<user>:<team>(HTTP <code>)" for probes we could not read
 for U in $APPROVERS; do
  [ "$U" = "$PR_AUTHOR" ] && debug "skip self-review by $U" && continue
-  _any_team_success="no"
  for T in "${!TEAM_ID[@]}"; do
    ID="${TEAM_ID[$T]}"
+    set +e
    CODE=$(curl -sS -o /dev/null -w '%{http_code}' -H "$AUTH" \
      "${API}/teams/${ID}/members/${U}")
-    debug "probe: $U in team $T (id=$ID) → HTTP $CODE"
-    if [ "$CODE" = "200" ] || [ "$CODE" = "204" ]; then
-      APPROVER_TEAMS[$U]="${APPROVER_TEAMS[$U]:- } ${APPROVER_TEAMS[$U]:+ }$T "
-      debug "$U qualifies for team $T"
-      _any_team_success="yes"
+    _curl_exit=$?
+    set -e
+    debug "probe: $U in team $T (id=$ID) → HTTP $CODE (curl exit=$_curl_exit)"
+    if [ "$_curl_exit" -ne 0 ]; then
+      # curl itself failed (DNS, connection refused, timeout) — unreachable.
+      _verify_failed="${_verify_failed}${_verify_failed:+, }${U}:${T}(curl exit ${_curl_exit})"
+      continue
    fi
-  done
-  # Fallback: if every team probe returned 403, try org membership.
-  # "??" teams were never resolved to IDs so they never entered the loop.
-  # If the user is an org member, credit them as being in each queried team
-  # (engineers, managers, ceo are all org-level). This is safe because org
-  # membership is a prerequisite for all three, and bypass requires a colluding
-  # approver (same risk as before the AND-composition).
-  if [ "$_any_team_success" = "no" ]; then
-    ORG_CODE=$(curl -sS -o /dev/null -w '%{http_code}' -H "$AUTH" \
-      "${API}/orgs/${OWNER}/members/${U}")
-    debug "probe: $U in org $OWNER (fallback) → HTTP $ORG_CODE"
-    if [ "$ORG_CODE" = "204" ]; then
-      for T in "${!TEAM_ID[@]}"; do
+    case "$CODE" in
+      200|204)
        APPROVER_TEAMS[$U]="${APPROVER_TEAMS[$U]:- } ${APPROVER_TEAMS[$U]:+ }$T "
-      done
-      debug "$U credited as org member for all queried teams (fallback — token may lack read:org)"
-    fi
-  fi
+        debug "$U qualifies for team $T"
+        ;;
+      404)
+        # Definitively not a member of this team — a verified negative.
+        debug "$U is NOT a member of team $T (verified 404)"
+        ;;
+      *)
+        # 403/401/5xx/etc — membership is unreadable. Do NOT treat as "not a
+        # member" and do NOT fall back to org membership. This is cannot-verify.
+        _verify_failed="${_verify_failed}${_verify_failed:+, }${U}:${T}(HTTP ${CODE})"
+        ;;
+    esac
+  done
 done

+# Fail-closed: if ANY membership probe could not be read, we cannot make an
+# authorization decision. Publish a loud cannot-verify / infra-failed status
+# and exit non-zero. Never grant the tier on unverifiable membership.
+if [ -n "$_verify_failed" ]; then
+  echo "::error::sop-tier-check CANNOT VERIFY team membership — gate FAILS CLOSED."
+  echo "::error::Unreadable membership probe(s): ${_verify_failed}"
+  echo "::error::A team-membership probe returned 403/401/5xx (or curl failed). The SOP tier gate is an authorization gate; an inability to verify team membership is treated as a FAILURE, never a pass. Org membership is NOT team membership and is never credited as a fallback."
+  echo "::error::Fix: ensure GITEA_TOKEN (SOP_TIER_CHECK_TOKEN) has read:organization scope and the Gitea API is reachable from the runner, then re-run. Do NOT relax this gate."
+  exit 1
+fi
+
 # 7. Evaluate the tier expression.
 #
 # legacy OR-gate: use the simplified loop from before internal#189.
@@ -105,12 +105,26 @@ if [ "${SOP_REFIRE_DISABLE_RATE_LIMIT:-}" != "1" ]; then
 fi

 # 3. Invoke sop-tier-check.sh with the env it expects.
-# The canonical workflow intentionally fail-opens the job conclusion
-# (`bash .gitea/scripts/sop-tier-check.sh || true`) while Gitea branch
-# protection enforces reviewer approvals separately. Keep the refire path
-# aligned with that workflow status behavior; otherwise /refire-tier-check can
-# post a hard failure that the canonical pull_request_target workflow would
-# not publish.
+#
+# FAIL-CLOSED contract (was fail-open — fixed 2026-06-05,
+# fix/core-ci-fail-closed). The previous shape was:
+#     bash "$SCRIPT" || true
+#     TIER_EXIT=0          # <-- hardcoded success
+# which discarded the real verdict and ALWAYS POSTed
+# `state=success` for the REQUIRED context
+# `sop-tier-check / tier-check (pull_request)`. That meant ANY
+# collaborator could comment `/refire-tier-check` to forcibly green
+# the SOP-6 approval gate on the PR head SHA — a fail-open AND a
+# privilege bypass of branch protection. The canonical
+# pull_request_target workflow's conclusion publishes the same
+# context honestly (red on a real violation); the refire MUST mirror
+# THAT honesty, not a discarded exit code.
+#
+# We now capture the script's real exit code under `set +e` and POST
+# success ONLY when it actually exited 0. sop-tier-check.sh itself
+# fails closed on infra faults (no SOP_FAIL_OPEN in this refire env),
+# so a bad token / unreachable API / missing jq → non-zero → we POST
+# `state=failure`, never a false green.
 #
 # SOP_REFIRE_TIER_CHECK_SCRIPT env var lets tests substitute a mock —
 # sop-tier-check.sh uses bash 4+ associative arrays which trigger a known
@@ -125,7 +139,10 @@ if [ ! -f "$SCRIPT" ]; then
 fi

 # Re-invoke. Pipe stdout/stderr through so the runner log shows the
-# tier-check decision inline.
+# tier-check decision inline. Capture the REAL exit code (set +e so a
+# non-zero verdict doesn't abort this script under set -e) — the POST
+# below keys off it, so a failed tier-check posts state=failure.
+set +e
 GITEA_TOKEN="$GITEA_TOKEN" \
  GITEA_HOST="$GITEA_HOST" \
  REPO="$REPO" \
@@ -133,8 +150,9 @@ GITEA_TOKEN="$GITEA_TOKEN" \
  PR_AUTHOR="$PR_AUTHOR" \
  SOP_DEBUG="${SOP_DEBUG:-0}" \
  SOP_LEGACY_CHECK="${SOP_LEGACY_CHECK:-0}" \
-  bash "$SCRIPT" || true
-TIER_EXIT=0
+  bash "$SCRIPT"
+TIER_EXIT=$?
+set -e
 debug "sop-tier-check.sh exit=$TIER_EXIT"

 # 4. POST the resulting status.
@@ -170,4 +188,12 @@ if [ "$POST_HTTP" != "200" ] && [ "$POST_HTTP" != "201" ]; then
 fi

 echo "::notice::sop-tier-refire posted state=$STATE for context=\"$CONTEXT\" on sha=$HEAD_SHA"
-exit "$TIER_EXIT"
+# Exit 0: the refire JOB succeeded — it re-evaluated the gate and posted
+# an HONEST status. The gate VERDICT is carried by the POSTed status
+# ($STATE), which is what branch protection reads; a failing tier-check
+# posts state=failure (red on the PR), so there is no fail-open. We do
+# NOT also exit non-zero on a failing verdict — that would double-signal
+# the same failure as both a red status AND a red refire job. The
+# fail-open that mattered (TIER_EXIT hardcoded to 0 → always state=success)
+# is fixed above by capturing the real exit code.
+exit 0
@@ -208,6 +208,22 @@ class TestParseDirectives(unittest.TestCase):
        d = self.parse_ack_revoke("/sop-ack Comprehensive_Testing")
        self.assertEqual(d[0][1], "comprehensive-testing")

+    def test_emdash_separator_parsed_correctly(self):
+        # Em-dash (U+2014) between slug and note is common in practice.
+        # /sop-ack Five-Axis — five-axis-review
+        # → slug = five-axis, note = five-axis-review (em-dash stripped)
+        d = self.parse_ack_revoke("/sop-ack Five-Axis — five-axis-review")
+        self.assertEqual(len(d), 1)
+        self.assertEqual(d[0][1], "five-axis")
+        self.assertIn("five-axis-review", d[0][2])
+
+    def test_emdash_no_note(self):
+        # Em-dash at end of slug: only slug, no note content
+        d = self.parse_ack_revoke("/sop-ack Five-Axis —")
+        self.assertEqual(len(d), 1)
+        self.assertEqual(d[0][1], "five-axis")
+        self.assertEqual(d[0][2], "")  # em-dash is separator-only → empty note
+

 # ---------------------------------------------------------------------------
 # section_marker_present
@@ -0,0 +1,272 @@
+#!/usr/bin/env bash
+# Security regression test for the SOP tier-gate AUTHORIZATION bypass.
+#
+# Bug (fixed in fix/sop-tier-authz-no-org-fallback):
+#   sop-tier-check.sh probed team membership at /teams/{id}/members/{user}.
+#   If EVERY team probe failed (e.g. 403 — token lacks read:organization, or
+#   any visibility/flakiness gap), it FELL BACK to /orgs/{org}/members/{user}
+#   and credited that org member as a member of EVERY queried team. The
+#   evaluator then treated those synthetic memberships as real, so a plain
+#   NON-CEO org member satisfied tier:high (ceo). A visibility/auth gap became
+#   a real highest-tier authorization PASS — privilege escalation.
+#
+# Fix (fail-closed authorization):
+#   - The org-member ⇒ "member of all teams" fallback is REMOVED. Org
+#     membership is never credited as team membership.
+#   - A team probe that returns anything other than 200/204 (member) or 404
+#     (verified non-member) is a CANNOT-VERIFY condition: the gate fails loud
+#     (exit 1) with a cannot-verify status and never grants the tier.
+#
+# Method: this is a true end-to-end test. It prepends a fake `curl` to PATH
+# that serves canned Gitea API responses keyed by URL, then runs the REAL
+# sop-tier-check.sh. The fake exercises the genuine probe→credit→evaluate
+# path — no logic is re-implemented in the test.
+
+set -euo pipefail
+
+THIS_DIR="$(cd "$(dirname "$0")" && pwd)"
+SCRIPT_DIR="$(cd "$THIS_DIR/.." && pwd)"
+SCRIPT="$SCRIPT_DIR/sop-tier-check.sh"
+
+command -v jq >/dev/null 2>&1 || { echo "::error::jq required but not found"; exit 1; }
+[ -f "$SCRIPT" ] || { echo "::error::sop-tier-check.sh not found at $SCRIPT — test must fail loudly if the script is absent"; exit 1; }
+
+# sop-tier-check.sh uses `declare -A` (associative arrays), which require
+# bash >= 4. CI runners (Ubuntu) ship bash 5; macOS ships 3.2. Resolve a
+# bash >= 4 to run the script under.
+pick_bash() {
+  local c
+  for c in bash /opt/homebrew/bin/bash /usr/local/bin/bash /bin/bash; do
+    local p; p="$(command -v "$c" 2>/dev/null || true)"
+    [ -n "$p" ] || continue
+    local maj; maj="$("$p" -c 'echo "${BASH_VERSINFO[0]}"' 2>/dev/null || echo 0)"
+    if [ "${maj:-0}" -ge 4 ]; then echo "$p"; return 0; fi
+  done
+  return 1
+}
+BASH4="$(pick_bash)" || { echo "::error::need bash >= 4 to run sop-tier-check.sh (associative arrays); none found"; exit 1; }
+echo "using bash: $BASH4 ($("$BASH4" -c 'echo $BASH_VERSION'))"
+
+PASS=0
+FAIL=0
+
+assert_eq() {
+  local label="$1" expected="$2" got="$3"
+  if [ "$expected" = "$got" ]; then
+    echo "  PASS  $label"
+    PASS=$((PASS + 1))
+  else
+    echo "  FAIL  $label"
+    echo "        expected: <$expected>"
+    echo "        got:      <$got>"
+    FAIL=$((FAIL + 1))
+  fi
+}
+
+assert_contains() {
+  local label="$1" haystack="$2" needle="$3"
+  if printf '%s' "$haystack" | grep -qF -- "$needle"; then
+    echo "  PASS  $label"
+    PASS=$((PASS + 1))
+  else
+    echo "  FAIL  $label (missing substring: <$needle>)"
+    FAIL=$((FAIL + 1))
+  fi
+}
+
+assert_not_contains() {
+  local label="$1" haystack="$2" needle="$3"
+  if printf '%s' "$haystack" | grep -qF -- "$needle"; then
+    echo "  FAIL  $label (unexpected substring present: <$needle>)"
+    FAIL=$((FAIL + 1))
+  else
+    echo "  PASS  $label"
+    PASS=$((PASS + 1))
+  fi
+}
+
+# ---------------------------------------------------------------------------
+# Fake-curl harness.
+#
+# The real script calls curl in three shapes:
+#   (a) body capture:   curl -sS -H AUTH URL                 -> prints JSON body
+#   (b) http-code:      curl -sS -o FILE -w '%{http_code}' -H AUTH URL
+#   (c) http-code only: curl -sS -o /dev/null -w '%{http_code}' -H AUTH URL
+#
+# Our fake reads the URL (last http(s):// arg), looks up a response in fixture
+# files under $FIXDIR, and emits body and/or http-code accordingly.
+# ---------------------------------------------------------------------------
+
+make_harness() {
+  # $1 = scenario dir to populate with fixtures
+  local FIXDIR="$1"
+  local BIN="$FIXDIR/bin"
+  mkdir -p "$BIN"
+  cat > "$BIN/curl" <<'FAKE'
+#!/usr/bin/env bash
+# Fake curl for sop-tier-check authz tests. Looks up canned responses by URL.
+set -u
+FIXDIR="${SOP_TEST_FIXDIR:?SOP_TEST_FIXDIR unset}"
+
+url=""
+out=""
+want_code="no"
+prev=""
+for a in "$@"; do
+  case "$prev" in
+    -o) out="$a" ;;
+  esac
+  case "$a" in
+    http*://*) url="$a" ;;
+    '%{http_code}') want_code="yes" ;;
+  esac
+  # -w '%{http_code}' arrives as the value of the -w flag
+  if [ "$prev" = "-w" ] && [ "$a" = '%{http_code}' ]; then want_code="yes"; fi
+  prev="$a"
+done
+
+# Map URL -> fixture key (a filename-safe slug).
+# We only need the path after /api/v1.
+path="${url#*/api/v1}"
+slug="$(printf '%s' "$path" | tr '/?=&' '____')"
+
+body_file="$FIXDIR/body${slug}"
+code_file="$FIXDIR/code${slug}"
+
+# Emit the body fixture (empty when none exists) to the -o target, else to stdout.
+body=""
+if [ -f "$body_file" ]; then body="$(cat "$body_file")"; fi
+if [ -n "$out" ]; then
+  printf '%s' "$body" > "$out"
+else
+  printf '%s' "$body"
+fi
+
+# Emit http code when requested.
+if [ "$want_code" = "yes" ]; then
+  if [ -f "$code_file" ]; then
+    printf '%s' "$(cat "$code_file")"
+  else
+    printf '200'
+  fi
+fi
+exit 0
+FAKE
+  chmod +x "$BIN/curl"
+  echo "$BIN"
+}
+
+# Common fixtures shared by scenarios. $1 = FIXDIR, $2 = approver login,
+# $3 = tier label name (e.g. tier:high), $4 = teams JSON.
+seed_common() {
+  local FIXDIR="$1" approver="$2" tier="$3" teams_json="$4"
+  mkdir -p "$FIXDIR"
+  # /user -> whoami
+  printf '%s' '{"login":"sop-bot"}' > "$FIXDIR/body_user"
+  # PR head sha
+  printf '%s' '{"head":{"sha":"headsha1"}}' \
+    > "$FIXDIR/body_repos_molecule-ai_molecule-core_pulls_42"
+  # labels
+  printf '%s' "[{\"name\":\"$tier\"}]" \
+    > "$FIXDIR/body_repos_molecule-ai_molecule-core_issues_42_labels"
+  # org teams list
+  printf '%s' "$teams_json" > "$FIXDIR/body_orgs_molecule-ai_teams"
+  printf '%s' '200' > "$FIXDIR/code_orgs_molecule-ai_teams"
+  # reviews: one APPROVED on current head by $approver
+  printf '%s' "[{\"state\":\"APPROVED\",\"commit_id\":\"headsha1\",\"user\":{\"login\":\"$approver\"}}]" \
+    > "$FIXDIR/body_repos_molecule-ai_molecule-core_pulls_42_reviews"
+}
+
+run_script() {
+  # $1 = FIXDIR (must contain bin/curl). Prints combined stdout+stderr and returns the script's exit code (callers using $(...) read $?, not RC).
+  local FIXDIR="$1"
+  local BIN="$FIXDIR/bin"
+  set +e
+  OUT=$(
+    SOP_TEST_FIXDIR="$FIXDIR" \
+    PATH="$BIN:$PATH" \
+    GITEA_TOKEN="faketoken" \
+    GITEA_HOST="git.moleculesai.app" \
+    REPO="molecule-ai/molecule-core" \
+    PR_NUMBER="42" \
+    PR_AUTHOR="pr-author" \
+    SOP_DEBUG="0" \
+    SOP_LEGACY_CHECK="0" \
+    "$BASH4" "$SCRIPT" 2>&1
+  )
+  RC=$?
+  set -e
+  printf '%s' "$OUT"
+  return $RC
+}
+
+TEAMS_JSON='[{"name":"ceo","id":10},{"name":"engineers","id":11},{"name":"managers","id":12}]'
+
+echo "=============================================================="
+echo "Scenario 1: tier:high, team probe 403 (cannot read), approver"
+echo "            is a plain org member but NOT in ceo team."
+echo "            EXPECT: tier NOT granted (fail-closed cannot-verify)."
+echo "=============================================================="
+S1="$(mktemp -d)"
+make_harness "$S1" >/dev/null
+seed_common "$S1" "org-only-bob" "tier:high" "$TEAMS_JSON"
+# Team membership probe for ceo (id=10) returns 403 — cannot read.
+printf '%s' '403' > "$S1/code_teams_10_members_org-only-bob"
+# The OLD bug path: org membership probe would 204 and synthetic-credit.
+printf '%s' '204' > "$S1/code_orgs_molecule-ai_members_org-only-bob"
+set +e
+OUT1="$(run_script "$S1")"; RC1=$?
+set -e
+echo "$OUT1" | sed 's/^/    /'
+echo "    (exit=$RC1)"
+assert_eq "S1 exit non-zero (tier NOT granted)" "1" "$([ "$RC1" -ne 0 ] && echo 1 || echo 0)"
+assert_not_contains "S1 did NOT print PASSED" "$OUT1" "sop-tier-check PASSED"
+assert_contains "S1 cannot-verify error surfaced" "$OUT1" "CANNOT VERIFY"
+assert_contains "S1 names the unreadable probe (403)" "$OUT1" "HTTP 403"
+rm -rf "$S1"
+
+echo
+echo "=============================================================="
+echo "Scenario 2: tier:high, genuine ceo team member (probe 204)."
+echo "            EXPECT: tier GRANTED."
+echo "=============================================================="
+S2="$(mktemp -d)"
+make_harness "$S2" >/dev/null
+seed_common "$S2" "real-ceo" "tier:high" "$TEAMS_JSON"
+printf '%s' '204' > "$S2/code_teams_10_members_real-ceo"   # ceo team: member
+set +e
+OUT2="$(run_script "$S2")"; RC2=$?
+set -e
+echo "$OUT2" | sed 's/^/    /'
+echo "    (exit=$RC2)"
+assert_eq "S2 exit zero (granted)" "0" "$RC2"
+assert_contains "S2 printed PASSED" "$OUT2" "sop-tier-check PASSED"
+rm -rf "$S2"
+
+echo
+echo "=============================================================="
+echo "Scenario 3: tier:high, approver is an org member but a VERIFIED"
+echo "            non-member of ceo (team probe 404). Org probe would"
+echo "            204 — must NEVER be synthetic-credited."
+echo "            EXPECT: tier NOT granted (clause FAIL), no fallback."
+echo "=============================================================="
+S3="$(mktemp -d)"
+make_harness "$S3" >/dev/null
+seed_common "$S3" "org-member-carol" "tier:high" "$TEAMS_JSON"
+printf '%s' '404' > "$S3/code_teams_10_members_org-member-carol"  # verified NOT in ceo
+printf '%s' '204' > "$S3/code_orgs_molecule-ai_members_org-member-carol" # org member (must be ignored)
+set +e
+OUT3="$(run_script "$S3")"; RC3=$?
+set -e
+echo "$OUT3" | sed 's/^/    /'
+echo "    (exit=$RC3)"
+assert_eq "S3 exit non-zero (tier NOT granted)" "1" "$([ "$RC3" -ne 0 ] && echo 1 || echo 0)"
+assert_not_contains "S3 did NOT print PASSED" "$OUT3" "sop-tier-check PASSED"
+assert_contains "S3 reported a real clause FAIL (not cannot-verify)" "$OUT3" "FAILED for tier:high"
+assert_not_contains "S3 did NOT cannot-verify (404 is a verified negative)" "$OUT3" "CANNOT VERIFY"
+rm -rf "$S3"
+
+echo
+echo "------"
+echo "PASS=$PASS FAIL=$FAIL"
+[ "$FAIL" -eq 0 ]
@@ -246,21 +246,24 @@ assert_contains "T1 POST context is sop-tier-check / tier-check" \
  '"context": "sop-tier-check / tier-check (pull_request)"' "$POSTED"
 assert_contains "T1 description names commenter" "test-runner" "$POSTED"

-# T2: missing tier label → tier-check fails internally, but refire status
-# matches the canonical workflow's fail-open job conclusion.
+# T2: missing tier label → tier-check fails internally (mock exits 1).
+# FAIL-CLOSED contract (fix/core-ci-fail-closed): refire now captures the
+# REAL exit code and POSTs state=failure — it does NOT forge a green on
+# the required context. The refire job itself still exits 0 (it succeeded
+# at posting an honest failure status).
 run_scenario "T2_no_tier_label" "fail_no_label"
 RC=$(cat "$FIX_STATE_DIR/last_rc")
 POSTED=$(cat "$FIX_STATE_DIR/posted_statuses.jsonl" 2>/dev/null || true)
-assert_eq "T2 exit code 0 (canonical fail-open)" "0" "$RC"
-assert_contains "T2 POSTed state=success" '"state": "success"' "$POSTED"
+assert_eq "T2 exit code 0 (posted an honest status)" "0" "$RC"
+assert_contains "T2 POSTed state=failure (no forged green)" '"state": "failure"' "$POSTED"

-# T3: tier:low present but ZERO approving reviews → internal tier check fails,
-# refire status remains aligned with the canonical workflow.
+# T3: tier:low present but ZERO approving reviews → internal tier check
+# fails (mock exits 1). Refire POSTs state=failure, never a false green.
 run_scenario "T3_no_approvals" "fail_no_approvals"
 RC=$(cat "$FIX_STATE_DIR/last_rc")
 POSTED=$(cat "$FIX_STATE_DIR/posted_statuses.jsonl" 2>/dev/null || true)
-assert_eq "T3 exit code 0 (canonical fail-open)" "0" "$RC"
-assert_contains "T3 POSTed state=success" '"state": "success"' "$POSTED"
+assert_eq "T3 exit code 0 (posted an honest status)" "0" "$RC"
+assert_contains "T3 POSTed state=failure (no forged green)" '"state": "failure"' "$POSTED"

 # T4: closed PR — refire is a no-op (no POST, exit 0)
 run_scenario "T4_closed" "pass"
@@ -205,5 +205,5 @@ n/a_gates:
    required_teams: [security, managers, ceo]
    description: >-
      Security review N/A when this change has no security surface
-      (docs-only, pure-frontend, dependency-only). A security/owners
+      (docs-only, pure-frontend, dependency-only). A security/managers/ceo
      member must post /sop-n/a security-review to activate.
@@ -34,11 +34,6 @@ jobs:
  check:
    name: Block forbidden paths
    runs-on: ubuntu-latest
-    # Phase 3 (RFC #219 §1): surface broken workflows without blocking
-    # the PR. Follow-up PR flips this off after surfaced defects are
-    # triaged.
-    # mc#1982: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
-    continue-on-error: true
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
@@ -290,6 +290,15 @@ jobs:
          echo "ADMIN_TOKEN=${E2E_ADMIN_TOKEN}" >> "$GITHUB_ENV"
          echo "MOLECULE_ADMIN_TOKEN=${E2E_ADMIN_TOKEN}" >> "$GITHUB_ENV"
          echo "Admin token configured for the e2e platform (ADMIN_TOKEN + MOLECULE_ADMIN_TOKEN)."
+          # Channels e2e test seam (core#2332 P1.10). These env-gated overrides
+          # let the LIVE Slack-webhook send path + Telegram discover path target
+          # the local mock upstreams that tests/e2e/test_channels_e2e.sh binds,
+          # so the outbound serialize+POST is provable in CI (was unit-mock-only).
+          # Inert in prod/staging — those deploys never set these. The fixed
+          # loopback ports MUST match the script's E2E_CHANNELS_*_PORT defaults.
+          echo "MOLECULE_CHANNELS_TEST_WEBHOOK_BASE=http://127.0.0.1:18099/" >> "$GITHUB_ENV"
+          echo "MOLECULE_CHANNELS_TEST_TELEGRAM_API_BASE=http://127.0.0.1:18098" >> "$GITHUB_ENV"
+          echo "Channels test seam configured (webhook+telegram mock bases on fixed loopback ports)."
      - name: Build platform
        if: needs.detect-changes.outputs.api == 'true'
        working-directory: workspace-server
@@ -430,6 +439,20 @@ jobs:
      - name: Run notify-with-attachments E2E
        if: needs.detect-changes.outputs.api == 'true'
        run: bash tests/e2e/test_notify_attachments_e2e.sh
+      - name: "Run channels + data-prune E2E (REQUIRE-LIVE: mock upstream proves send+discover, purge proves prune)"
+        # core#2332 P1.10. Stands up a local mock upstream, points the LIVE
+        # Slack-webhook send + Telegram discover paths at it via the
+        # production-inert test seam configured above, and asserts the mock
+        # RECEIVED the serialized payload (send) + round-tripped the bot/chat
+        # (discover). Then exercises the RFC #734 data-prune: DELETE
+        # ?purge=true removes the target's durable child data while a sibling
+        # survives. E2E_REQUIRE_LIVE=1 ⇒ a missing/regressed seam is RED, not a
+        # silent skip. The platform inherits the MOLECULE_CHANNELS_TEST_* bases
+        # from $GITHUB_ENV; the script's mock ports match them (18099/18098).
+        if: needs.detect-changes.outputs.api == 'true'
+        env:
+          E2E_REQUIRE_LIVE: '1'
+        run: bash tests/e2e/test_channels_e2e.sh
      - name: "Run priority-runtimes E2E (REQUIRE-LIVE: mock validates the runtime plumbing end-to-end)"
        # E2E_REQUIRE_LIVE=1 is ON: the run MUST validate >=1 runtime end-to-end
        # or it exits NON-zero (RED). This is now SAFE because the `mock` arm can
@@ -0,0 +1,129 @@
+name: E2E Workspace Lifecycle (staginge2e)
+
+# core#2332 P1.10 — close the workspace-lifecycle coverage gap.
+#
+# soft-restart / pause / resume / hibernate were only unit-tested (httptest in
+# workspace-server/internal/handlers/*_test.go) and never proven against a real
+# container. This drives the Go staginge2e suite
+# (workspace-server/internal/staginge2e/workspace_lifecycle_test.go) which
+# provisions a REAL throwaway staging tenant, exercises each lifecycle endpoint,
+# and asserts OBSERVABLE container state (status transitions + serve reachability
+# + url-cleared-on-stop) — not just HTTP 200.
+#
+# ADVISORY-BY-INFRA. It needs a live staging tenant (~30+ min cold EC2 path), so
+# the real run is workflow_dispatch / schedule only — NOT per-PR and NOT a
+# required check. Promotion to a required branch-protection context is a separate
+# CTO decision (mirrors the cp internal/staginge2e suite, cp#386, and the
+# peer-visibility flip-to-required pattern, molecule-core#1296).
+#
+# HONEST GATE — NO continue-on-error mask (feedback_fix_root_not_symptom). The
+# PR job validates that the suite COMPILES under -tags=staging_e2e and SKIPs LOUD
+# without creds (the suite's contract) — a broken test file fails at PR time. The
+# real assertion runs on dispatch/cron with staging creds.
+#
+# Gitea 1.22.6 / act_runner notes honored: no cross-repo uses (mirrored
+# actions/checkout SHA), per-SHA concurrency, pinned GITHUB_SERVER_URL.
+
+on:
+  push:
+    branches: [main]
+    paths:
+      - 'workspace-server/internal/handlers/workspace_restart.go'
+      - 'workspace-server/internal/handlers/workspace_crud.go'
+      - 'workspace-server/internal/staginge2e/**'
+      - '.gitea/workflows/e2e-workspace-lifecycle.yml'
+  pull_request:
+    branches: [main]
+    paths:
+      - 'workspace-server/internal/handlers/workspace_restart.go'
+      - 'workspace-server/internal/handlers/workspace_crud.go'
+      - 'workspace-server/internal/staginge2e/**'
+      - '.gitea/workflows/e2e-workspace-lifecycle.yml'
+  workflow_dispatch:
+  schedule:
+    # 08:00 UTC daily — offset from e2e-staging-saas (07:00) and
+    # e2e-peer-visibility (07:30) so the three don't collide on the staging
+    # org-creation quota.
+    - cron: '0 8 * * *'
+
+concurrency:
+  # Per-SHA (feedback_concurrency_group_per_sha).
+  group: e2e-workspace-lifecycle-${{ github.event.pull_request.head.sha || github.sha }}
+  cancel-in-progress: false
+
+env:
+  GITHUB_SERVER_URL: https://git.moleculesai.app
+
+jobs:
+  # PR / compile gate: prove the staginge2e suite compiles under the build tag
+  # and skips LOUD without creds. Cheap, honest, non-required. This is NOT a
+  # fake-green mask of the real assertion — it fails if the test file stops
+  # compiling. bp-required: pending CTO decision (see header).
+  lifecycle-compile-skip:
+    name: E2E Workspace Lifecycle (compile+skip)
+    runs-on: ubuntu-latest
+    timeout-minutes: 10
+    steps:
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+      - uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff # v5
+        with:
+          go-version: 'stable'
+          cache: true
+          cache-dependency-path: workspace-server/go.sum
+      - name: go vet (staging_e2e tag)
+        working-directory: workspace-server
+        run: go vet -tags staging_e2e ./internal/staginge2e/...
+      - name: Compile + skip-run (must SKIP LOUD without STAGING_E2E)
+        working-directory: workspace-server
+        run: |
+          # No STAGING_E2E / creds → the suite MUST skip (not pass-with-zero-
+          # assertions, not fail-open). `go test` exit 0 with a SKIP line is the
+          # contract. -run pins to the one test so this stays fast.
+          #
+          # Capture the exit code instead of letting the default `bash -e` step
+          # shell abort on the assignment: otherwise a compile/test failure
+          # kills the step BEFORE the output is echoed and the red run carries
+          # no diagnostics.
+          rc=0
+          out=$(go test -tags staging_e2e ./internal/staginge2e/ -run TestWorkspaceLifecycle -count=1 -v 2>&1) || rc=$?
+          echo "$out"
+          if [ "$rc" -ne 0 ]; then
+            echo "::error::go test exited $rc — the staginge2e suite must compile and exit 0 with a SKIP when creds are absent"
+            exit 1
+          fi
+          # Here-string instead of `echo | grep -q`: grep -q exits on the first
+          # match, which can SIGPIPE the writer and fail the pipeline when the
+          # step shell has pipefail enabled.
+          if ! grep -q "SKIP: TestWorkspaceLifecycle_Staging" <<<"$out"; then
+            echo "::error::expected a LOUD skip of TestWorkspaceLifecycle_Staging without creds"
+            exit 1
+          fi
+
+  # Real STAGING gate: provisions a throwaway tenant, drives the lifecycle
+  # endpoints, asserts observable transitions, scoped teardown.
+  # dispatch / schedule only (30+ min cold EC2).
+  lifecycle-staging:
+    name: E2E Workspace Lifecycle (staging)
+    runs-on: ubuntu-latest
+    if: github.event_name == 'workflow_dispatch' || github.event_name == 'schedule'
+    timeout-minutes: 60
+    env:
+      CP_BASE_URL: https://staging-api.moleculesai.app
+      CP_ADMIN_API_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
+      STAGING_E2E: '1'
+    steps:
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+      - uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff # v5
+        with:
+          go-version: 'stable'
+          cache: true
+          cache-dependency-path: workspace-server/go.sum
+      - name: Verify admin token present
+        run: |
+          if [ -z "$CP_ADMIN_API_TOKEN" ]; then
+            echo "::error::CP_STAGING_ADMIN_API_TOKEN secret not set (Railway staging CP_ADMIN_API_TOKEN)"
+            exit 2
+          fi
+          echo "Admin token present"
+      - name: CP staging health preflight
+        run: |
+          # `|| true`: under the default `bash -e` step shell a curl transport
+          # failure (DNS, connection refused, --max-time hit) would abort on the
+          # assignment and skip the ::error:: annotation below. curl still
+          # writes the -w output (http_code 000) in that case, so the check
+          # that follows reports it as unhealthy.
+          code=$(curl -sS -o /dev/null -w "%{http_code}" --max-time 10 "$CP_BASE_URL/health") || true
+          if [ "$code" != "200" ]; then
+            echo "::error::Staging CP unhealthy (HTTP $code) — infra, not a lifecycle bug. Failing loud per feedback_fix_root_not_symptom."
+            exit 1
+          fi
+          echo "Staging CP healthy"
+      - name: Run workspace-lifecycle staginge2e
+        working-directory: workspace-server
+        run: go test -tags staging_e2e ./internal/staginge2e/ -run TestWorkspaceLifecycle_Staging -count=1 -v -timeout 50m
+      # Teardown: the test installs a t.Cleanup admin-DELETE of its own tenant
+      # (runs even on a Fatal). We deliberately do NOT add a broad in-workflow
+      # "sweep all e2e-life-* slugs" net here — that could delete a concurrently
+      # running dispatch's fresh tenant (the slug is not run-id scoped). The
+      # age-guarded `sweep-stale-e2e-orgs` workflow (30-min floor, e2e- prefix)
+      # is the final safety net for a tenant orphaned by a hard runner cancel.
@@ -7,10 +7,13 @@ name: gitea-merge-queue
 # the user-space queue bot, one PR per tick, using the non-bypass merge actor.
 #
 # Queue contract:
-#   - add label `merge-queue` to an open same-repo PR
+#   - auto-discovery (default): any open same-repo PR is considered — no
+#     `merge-queue` label required (the label is optional metadata now)
 #   - bot updates stale PR heads with current main, then waits for CI
-#   - bot merges only when current main is green and required PR contexts pass
-#   - add `merge-queue-hold` to pause a queued PR without removing it
+#   - bot merges only when current main is green, genuine approvals are present
+#     on the current head, required PR contexts pass, and the PR is mergeable
+#   - add `merge-queue-hold`, `do-not-auto-merge`, or `wip` to keep a PR OUT of
+#     autonomous merging; draft PRs are also skipped

 on:
  # Schedule moved to operator-config:
@@ -48,10 +51,34 @@ jobs:
          WATCH_BRANCH: ${{ github.event.repository.default_branch }}
          QUEUE_LABEL: merge-queue
          HOLD_LABEL: merge-queue-hold
+          # Auto-discovery (opt-OUT). When on (default), the queue considers ALL
+          # open same-repo PRs that meet the merge bar — it does NOT wait for a
+          # human/agent to add `merge-queue`. Agent Gitea tokens lack
+          # write:issue (labels are issue-scoped) and could never self-label,
+          # which stalled the queue; the label is now OPTIONAL metadata. The
+          # merge bar is UNCHANGED — only candidate selection widens. Set
+          # AUTO_DISCOVER=0 to restore legacy opt-IN (require the merge-queue
+          # label to be considered).
+          AUTO_DISCOVER: "1"
+          # Opt-OUT labels: any of these on a PR keeps it OUT of autonomous
+          # merging (the human escape hatch). HOLD_LABEL is always also honoured.
+          # A human who wants a PR held just adds one of these labels.
+          OPT_OUT_LABELS: do-not-auto-merge,wip
          UPDATE_STYLE: merge
-          REQUIRED_CONTEXTS: >-
-            CI / all-required (pull_request),
-            sop-checklist / all-items-acked (pull_request)
+          # Recognised official-reviewer set. A merge needs >= required_approvals
+          # DISTINCT genuine official approvals from these accounts on the
+          # CURRENT head sha (not stale/dismissed). The required_approvals count
+          # itself is read from branch protection at runtime.
+          REVIEWER_SET: agent-reviewer,agent-researcher,agent-reviewer-cr2
+          # NOTE: REQUIRED_CONTEXTS is no longer the authoritative PR gate. The
+          # queue now reads the required status contexts from BRANCH PROTECTION
+          # (status_check_contexts) so non-required governance reds (qa-review,
+          # security-review, sop-tier, sop-checklist when not branch-required,
+          # E2E Chat, Staging SaaS, ci-arm64-advisory) cannot block a merge.
+          # If branch protection cannot be enumerated the queue HOLDS
+          # (fail-closed). REQUIRED_APPROVALS below is only a fallback used when
+          # branch protection does not specify required_approvals.
+          REQUIRED_APPROVALS: "2"
          # Push-side required contexts. Checking CI / all-required (push)
          # explicitly instead of the combined state avoids false-pause when
          # non-blocking jobs (continue-on-error: true) have failed — those
@@ -99,7 +99,7 @@ jobs:
    # all violate this lint at first — intentional. Flip to false
    # follow-up after main is clean for 3 days. mc#1982.
    # mc#1982: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
-    continue-on-error: true  # mc#1982 Phase 3 mask — 14d forced-renewal cadence
+    continue-on-error: true  # internal#837 Phase 3 mask — 14d forced-renewal cadence
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
      - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065  # v5.6.0
@@ -123,7 +123,14 @@ jobs:
      # with a per-entry ::error:: annotation naming the missing repo
      # (issue #2192). This is the push-time complement to PR #2186's
      # PR-time manifest-entry-existence gate.
+      #
+      # Token: workspace-template-* repos are PRIVATE, so the existence check
+      # must authenticate (same AUTO_SYNC_TOKEN as the clone step). Without it
+      # an unauthenticated GET 404s on private repos and false-prunes them
+      # (regression that dropped seo-agent/google-adk from the palette).
      - name: Validate manifest entries exist
+        env:
+          MOLECULE_GITEA_TOKEN: ${{ secrets.AUTO_SYNC_TOKEN }}
        run: |
          set -euo pipefail
          bash scripts/check-manifest-repos-exist.sh manifest.json
@@ -41,12 +41,16 @@
 #   - the `|| true` after the sop-tier-check.sh invocation, which masked
 #     real tier-gate verdicts.
 # AND-composition is now fully enforced and the tier-check step can
-# honestly red CI on a real SOP-6 violation. SOP_FAIL_OPEN=1 is RETAINED
-# as sanctioned infra-resilience: it fails-open only on token/network/jq
-# faults, never on a real gate verdict. If you need to temporarily
-# re-introduce a mask, file a tracker and follow the mc#1982 protocol
-# (Tier 2e lint requires a current tracker within 2 lines of any
-# continue-on-error: true).
+# honestly red CI on a real SOP-6 violation.
+#
+# SOP_FAIL_OPEN REMOVED 2026-06-05 (fix/core-ci-fail-closed): this is a
+# REQUIRED branch-protected gate on `pull_request_target` (always
+# same-repo, secrets always present — no fork/advisory split). Failing
+# open on a token/network/jq fault greened the SOP-6 approval gate
+# WITHOUT verifying approvals — a fail-open on a required context. The
+# gate now FAILS CLOSED on infra faults too: fix the token/runner, not
+# the gate. If you ever need to temporarily re-introduce a mask, file a
+# tracker and follow the mc#1982 protocol.

 name: sop-tier-check

@@ -122,9 +126,9 @@ jobs:
      - name: Verify tier label + reviewer team membership
        # continue-on-error REMOVED 2026-06-04 (expired internal#189 Phase 1
        # burn-in, window closed 2026-05-17; mc#1982 directive: root-fix and
-        # remove, do not renew). SOP_FAIL_OPEN=1 below still fails-open on
-        # token/network/infra errors only (never on a real tier-gate verdict),
-        # so this step can now honestly fail CI on a genuine SOP-6 violation.
+        # remove, do not renew). SOP_FAIL_OPEN REMOVED 2026-06-05
+        # (fix/core-ci-fail-closed): the gate now fails CLOSED on infra
+        # faults too (see the env block below), not just on a real verdict.
        env:
          GITEA_TOKEN: ${{ secrets.SOP_TIER_CHECK_TOKEN || secrets.GITHUB_TOKEN }}
          GITEA_HOST: git.moleculesai.app
@@ -133,13 +137,26 @@ jobs:
          PR_AUTHOR: ${{ github.event.pull_request.user.login }}
          SOP_DEBUG: '0'
          SOP_LEGACY_CHECK: '0'
-          # SOP_FAIL_OPEN=1 fails-open ONLY on infra faults (empty/invalid
-          # token, unreachable Gitea API, missing jq) — see the guarded
-          # `exit 0` branches in sop-tier-check.sh. It does NOT mask a real
-          # tier-gate verdict: a missing tier label, no approving review, or
-          # an unsatisfied AND-clause still `exit 1`. Kept as sanctioned
-          # infra-resilience; the `|| true` mask was REMOVED with the burn-in
-          # COE (2026-06-04) so a genuine SOP-6 violation now reds CI.
-          SOP_FAIL_OPEN: '1'
+          # SOP_FAIL_OPEN REMOVED 2026-06-05 (fix/core-ci-fail-closed).
+          #
+          # This is the REQUIRED branch-protected gate
+          # `sop-tier-check / tier-check (pull_request)`. It runs on
+          # `pull_request_target`, which ALWAYS executes from the base
+          # branch WITH secrets present — there is NO fork/advisory split
+          # and no legitimate "secrets genuinely absent" degradation here.
+          #
+          # SOP_FAIL_OPEN=1 made the script `exit 0` on an empty/invalid
+          # token, an unreachable Gitea API, or missing jq — i.e. an AUTH
+          # FAILURE or unreachable-dependency would green the SOP-6
+          # approval gate WITHOUT verifying that the required teams
+          # actually approved. That is a fail-open on a required gate: a
+          # mis-wired or under-scoped SOP_TIER_CHECK_TOKEN would let any PR
+          # merge past the approval requirement.
+          #
+          # Removing the env unsets it → `${SOP_FAIL_OPEN:-}` is empty in
+          # sop-tier-check.sh → every guarded `exit 0` branch instead falls
+          # through to `exit 1`. Infra faults (bad token / API down / no
+          # jq) now FAIL CLOSED with a loud `::error::`, exactly like a real
+          # SOP-6 violation. Fix the token/runner, not the gate.
        run: |
          bash .gitea/scripts/sop-tier-check.sh
@@ -90,7 +90,13 @@ jobs:
          # checked-in artifact; exit 1 (RED) on any drift. This is the
          # single source of the gate's verdict — the same code path
          # `go test ./cmd/gen-providers` exercises.
-          go run ./cmd/gen-providers -check
+          if ! go run ./cmd/gen-providers -check; then
+            echo "::error::workspace-server/internal/providers/gen/registry_gen.go is stale (drifted from providers.yaml)."
+            echo "Regenerate and commit it (run from repo root):"
+            echo "  make gen          # native (needs a local Go toolchain)"
+            echo "  make gen-docker   # Docker only — no local Go needed"
+            exit 1
+          fi

      - name: Belt-and-braces — regenerate in place and assert clean tree
        run: |
@@ -101,7 +107,9 @@ jobs:
          go generate ./...
          if ! git diff --quiet -- internal/providers/gen/registry_gen.go; then
            echo "::error::workspace-server/internal/providers/gen/registry_gen.go drifted from providers.yaml."
-            echo "Run 'go generate ./...' (or 'go run ./cmd/gen-providers') in workspace-server/ and commit the result."
+            echo "Regenerate and commit it. No local Go? Use Docker (run from repo root):"
+            echo "  make gen          # native (needs a local Go toolchain)"
+            echo "  make gen-docker   # Docker only — no local Go needed"
            git --no-pager diff -- internal/providers/gen/registry_gen.go | head -80
            exit 1
          fi
@@ -4,7 +4,27 @@
 # use this Makefile; CI calls docker compose / go test directly so the
 # Makefile can evolve without breaking the build.

-.PHONY: help dev up down logs build test e2e-peer-visibility openapi-spec openapi-spec-check
+.PHONY: help dev up down logs build test e2e-peer-visibility openapi-spec openapi-spec-check gen gen-docker gen-check gen-check-docker
+
+# ─── Provider-registry SSOT codegen (internal#718) ─────────────────────
+# The Go module lives in workspace-server/. The checked-in artifact
+# workspace-server/internal/providers/gen/registry_gen.go is a gofmt'd
+# projection of providers.yaml, drift-gated by
+# .gitea/workflows/verify-providers-gen.yml. `make gen-docker` runs the SAME
+# generator inside the pinned golang image so a toolchain-less env (an agent
+# without Go) can regenerate without a local Go install (core#2332 follow-up).
+#
+# BYTE-EQUIVALENCE: gen-docker is byte-identical to native only while
+# GO_VERSION below matches the `go` directive in workspace-server/go.mod.
+# NOTE: the CI verify workflow sets setup-go go-version: 'stable' (a floating
+# alias, not a '1.25' pin). That is a latent hazard — a future Go minor could
+# format the artifact differently in CI than a 1.25 local/Docker run does. Pin
+# CI to '1.25' to close it (tracked alongside this change).
+GO_VERSION ?= 1.25
+GO_IMAGE   ?= golang:$(GO_VERSION)
+DOCKER     ?= docker
+# Mount the Go module (workspace-server) read-write; Go's default -mod=readonly
+# keeps go.mod/go.sum untouched — only the artifact is written in-place.
+DOCKER_RUN_WS = $(DOCKER) run --rm -v "$(CURDIR)/workspace-server":/src -w /src $(GO_IMAGE)

 help: ## Show this help.
 	@grep -E '^[a-zA-Z0-9_-]+:.*?## ' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-22s\033[0m %s\n", $$1, $$2}'
@@ -56,3 +76,16 @@ openapi-spec: ## Regenerate OpenAPI spec from workspace-server handler annotatio
 openapi-spec-check: openapi-spec ## CI gate — fail if openapi-spec produces a diff vs the committed file.
 	@git diff --exit-code -- workspace-server/docs/openapi/ \
 	  || (echo "openapi-spec is stale — run 'make openapi-spec' and commit the result" && exit 1)
+
+# ─── Provider-registry codegen targets ────────────────────────────────
+gen: ## Regenerate the providers registry artifact natively (needs local Go).
+	cd workspace-server && go generate ./...
+
+gen-docker: ## Same, inside the pinned $(GO_IMAGE) — Docker only, no local Go.
+	$(DOCKER_RUN_WS) go generate ./...
+
+gen-check: ## Drift gate (native): exit 1 if the artifact is stale.
+	cd workspace-server && go run ./cmd/gen-providers -check
+
+gen-check-docker: ## Drift gate inside the pinned $(GO_IMAGE) — Docker only.
+	$(DOCKER_RUN_WS) go run ./cmd/gen-providers -check
@@ -0,0 +1,461 @@
+/**
+ * Staging canvas E2E — desktop take-control RECONNECT + LEASE-RENEWAL path
+ * (core#2332 "P0.7", the e2e gap left by core#2216).
+ *
+ * Sibling to staging-display.spec.ts. That spec proves the happy path
+ * (acquire → noVNC WS upgrade → first framebuffer frame). It does NOT cover
+ * the two behaviours core#2216 added on top of that happy path:
+ *
+ *   (A) RECONNECT re-acquires a FRESH token. When the live WS drops uncleanly
+ *       (idle/network blip), DisplayTab.tsx:391-446 calls connect(reacquire=true),
+ *       which first awaits reacquireSession() (DisplayTab.tsx:83-99 →
+ *       POST /display/control/acquire) to mint a NON-stale lease+token before
+ *       reopening the socket. Without this, the cached ~300s token can be past
+ *       its expiry and the reconnect would 401 — a dead session that LOOKS like
+ *       a reconnect. We assert the reconnect path yields a token bound to a NEW
+ *       expires_at AND that a NEW WS opened with that fresh token resumes the
+ *       framebuffer (a real frame, not a 1006/403).
+ *
+ *   (B) The lease SURVIVES past the 300s window via the renewal cadence.
+ *       The lock is a 300s lease with NO server-side auto-renewal
+ *       (workspace_display_control.go:27 displayControlDefaultTTLSeconds=300;
+ *       loadActiveDisplayControl filters `expires_at > now()`). DisplayTab.tsx:105-111
+ *       runs a 120_000ms setInterval that re-acquires as the same holder, which
+ *       the server's ON-CONFLICT upsert (workspace_display_control.go:116-123,
+ *       `controlled_by = EXCLUDED.controlled_by`) treats as a lease EXTENSION:
+ *       expires_at moves forward by a fresh 300s each renewal. We do NOT sleep
+ *       300s of wall-clock to prove this — we drive the renewal CALL the timer
+ *       fires (reacquireSession === the same POST) and assert it pushes
+ *       expires_at strictly past the ORIGINAL lease window, then confirm the
+ *       lock is still live (GET /display/control returns the holder) after a
+ *       point in time at which the original, un-renewed lease would already be
+ *       expired. That is the observable, deterministic proxy for "the 120s
+ *       timer keeps the user from being kicked every ~5 min."
+ *
+ * Auth model, gating, and fail-closed philosophy are IDENTICAL to
+ * staging-display.spec.ts — see that file's header for the full rationale
+ * (same-origin-canvas Origin for the WS upgrade; per-tenant admin bearer for
+ * the acquire/GET POSTs; STAGING_DISPLAY_WORKSPACE_ID is the single activation
+ * knob and a standing desktop EC2 is a CTO cost item; any failure once the gate
+ * env is present is a HARD error, never a silent green, no "flaky" disposition).
+ *
+ * Promote-to-required is a CTO call: like its sibling this only runs when a
+ * standing desktop-capable staging workspace exists, so it cannot be a blanket
+ * required context until that workspace is funded and STAGING_DISPLAY_* is wired
+ * into the e2e-staging-canvas workflow.
+ */
+
+import { test, expect } from "@playwright/test";
+
+const STAGING = process.env.CANVAS_E2E_STAGING === "1";
+
+// The standing desktop-capable workspace id. Absent => skip loud. Same single
+// activation knob as staging-display.spec.ts; see that file's header.
+const DISPLAY_WS_ID = process.env.STAGING_DISPLAY_WORKSPACE_ID;
+
+test.skip(!STAGING, "CANVAS_E2E_STAGING not set — skipping staging-only tests");
+test.skip(
+  !DISPLAY_WS_ID,
+  "STAGING_DISPLAY_WORKSPACE_ID not set — no standing desktop-capable staging " +
+    "workspace to exercise the reconnect/renewal path. Set it to a workspace whose " +
+    "compute.display.mode == 'desktop-control' to activate this real-e2e gate. " +
+    "(Standing that workspace up is a CTO cost item — one always-on desktop EC2.)",
+);
+
+// WS upgrade + first-frame budgets mirror staging-display.spec.ts:75-76 — the
+// EIC tunnel + websockify handshake adds real latency; bounded so a dead path
+// fails LOUD instead of hanging to the suite timeout.
+const WS_UPGRADE_TIMEOUT_MS = 30_000;
+const FIRST_FRAME_TIMEOUT_MS = 30_000;
+
+// The production lease/renewal contract we are asserting against:
+//   - DEFAULT_TTL_SECONDS: the 300s lease the canvas requests
+//     (DisplayTab.tsx:88 ttl_seconds:300; server default
+//     workspace_display_control.go:27).
+//   - RENEWAL_INTERVAL_MS: the cadence the canvas renews on
+//     (DisplayTab.tsx:109 setInterval(..., 120_000)). We don't sleep it; we
+//     assert the renewal CALL pushes the lease forward.
+const DEFAULT_TTL_SECONDS = 300;
+const RENEWAL_INTERVAL_MS = 120_000;
+
+// Open a real noVNC WebSocket from inside the page (so the browser sends
+// Origin: <tenant> and the same-origin-canvas AdminAuth path accepts the
+// upgrade — a browser WS can't set Authorization). Returns the outcome of the
+// upgrade + first-frame, exactly like staging-display.spec.ts's evaluate
+// block. Reused here for BOTH the initial connect and the post-drop reconnect
+// so the two are compared on identical wire mechanics.
+type WsResult = {
+  ok: boolean;
+  stage: string;
+  detail: string;
+  frameBytes?: number;
+  frameKind?: string;
+  closeCode?: number;
+};
+
+/**
+ * Open the display WebSocket from inside the page and report how far it got.
+ *
+ * Runs in the browser via page.evaluate: resolves rawSessionUrl against the
+ * page location, reads the token from the #token fragment, strips the
+ * fragment, switches the scheme to ws(s), and connects with the `binary` and
+ * `molecule-display-token.<token>` subprotocols. The socket is closed before
+ * the returned promise settles.
+ *
+ * @param page          Page the WebSocket is opened from.
+ * @param rawSessionUrl session_url from acquire; must carry a #token fragment.
+ * @returns A WsResult. `ok` is true only when a non-empty first message
+ *          arrived (stage "frame"). Failure stages: "token-parse",
+ *          "construct", "upgrade-timeout", "upgrade-close", "upgrade-error",
+ *          "frame-timeout", and "closed-before-frame" (socket closed after the
+ *          upgrade but before any message — reported immediately with its
+ *          close code instead of waiting out the frame timer).
+ */
+async function openDisplayWs(
+  page: import("@playwright/test").Page,
+  rawSessionUrl: string,
+): Promise<WsResult> {
+  return page.evaluate(
+    async ({ rawSessionUrl, upgradeTimeoutMs, frameTimeoutMs }): Promise<WsResult> => {
+      // Reproduce DisplayTab.tsx:545-552 (displayWebSocketConnection): resolve
+      // against the tenant origin, pull token from the #token fragment, strip
+      // the fragment, switch http(s)->ws(s). Then connect with the exact
+      // subprotocols the canvas uses (DisplayTab.tsx:402).
+      const u = new URL(rawSessionUrl, window.location.href);
+      const token =
+        new URLSearchParams(u.hash.replace(/^#/, "")).get("token") ?? "";
+      if (!token) {
+        return { ok: false, stage: "token-parse", detail: "no #token in session_url" };
+      }
+      u.hash = "";
+      u.protocol = window.location.protocol === "https:" ? "wss:" : "ws:";
+      const wsUrl = u.toString();
+
+      // WsResult is referenced only as a type (erased at compile time), so it
+      // is safe inside this serialized page function.
+      return await new Promise<WsResult>((resolve) => {
+        // Construct the socket first so every later closure captures an
+        // already-initialised binding.
+        let ws: WebSocket;
+        try {
+          ws = new WebSocket(wsUrl, [`binary`, `molecule-display-token.${token}`]);
+        } catch (e) {
+          resolve({ ok: false, stage: "construct", detail: String(e) });
+          return;
+        }
+        ws.binaryType = "arraybuffer";
+
+        let upgraded = false;
+        let settled = false;
+        let upgradeTimer: ReturnType<typeof setTimeout> | undefined;
+        let frameTimer: ReturnType<typeof setTimeout> | undefined;
+
+        // Settle exactly once: stop both timers, close the socket, resolve.
+        const finish = (r: WsResult) => {
+          if (settled) return;
+          settled = true;
+          clearTimeout(upgradeTimer);
+          clearTimeout(frameTimer);
+          try {
+            ws.close();
+          } catch {
+            /* ignore */
+          }
+          resolve(r);
+        };
+
+        upgradeTimer = setTimeout(() => {
+          finish({
+            ok: false,
+            stage: "upgrade-timeout",
+            detail: `WS did not open within ${upgradeTimeoutMs}ms (readyState=${ws.readyState})`,
+          });
+        }, upgradeTimeoutMs);
+
+        ws.onopen = () => {
+          upgraded = true;
+          clearTimeout(upgradeTimer);
+          frameTimer = setTimeout(() => {
+            finish({
+              ok: false,
+              stage: "frame-timeout",
+              detail: `WS upgraded but no framebuffer message within ${frameTimeoutMs}ms`,
+            });
+          }, frameTimeoutMs);
+        };
+
+        ws.onmessage = (ev) => {
+          let bytes = 0;
+          let kind: string = typeof ev.data;
+          if (ev.data instanceof ArrayBuffer) {
+            bytes = ev.data.byteLength;
+            kind = "ArrayBuffer";
+          } else if (typeof Blob !== "undefined" && ev.data instanceof Blob) {
+            bytes = ev.data.size;
+            kind = "Blob";
+          } else if (typeof ev.data === "string") {
+            bytes = ev.data.length;
+            kind = "string";
+          }
+          finish({
+            ok: bytes > 0,
+            stage: "frame",
+            detail:
+              bytes > 0 ? "received framebuffer message" : "first message was empty",
+            frameBytes: bytes,
+            frameKind: kind,
+          });
+        };
+
+        ws.onclose = (ev) => {
+          if (!upgraded) {
+            finish({
+              ok: false,
+              stage: "upgrade-close",
+              detail: `WS closed before upgrade (code=${ev.code}, reason="${ev.reason}") — handshake rejected somewhere in edge → ws-proxy → EIC → websockify → x11vnc`,
+              closeCode: ev.code,
+            });
+            return;
+          }
+          // Upgraded but dropped before the first message. Report it now with
+          // the close code rather than letting the frame timer expire and
+          // mislabel it as a timeout. After a frame, finish() has already
+          // settled and this call is a no-op.
+          finish({
+            ok: false,
+            stage: "closed-before-frame",
+            detail: `WS upgraded but closed before any framebuffer message (code=${ev.code}, reason="${ev.reason}")`,
+            closeCode: ev.code,
+          });
+        };
+
+        ws.onerror = () => {
+          if (!upgraded) {
+            finish({
+              ok: false,
+              stage: "upgrade-error",
+              detail: "WS error before upgrade — proxy chain rejected the handshake",
+            });
+          }
+        };
+      });
+    },
+    {
+      rawSessionUrl,
+      upgradeTimeoutMs: WS_UPGRADE_TIMEOUT_MS,
+      frameTimeoutMs: FIRST_FRAME_TIMEOUT_MS,
+    },
+  );
+}
+
+// Pull the opaque signed token out of a session_url's #token= fragment so we
+// can compare reconnect tokens for freshness (a reconnect MUST mint a new one
+// — same token would mean the cached, possibly-expired URL was reused).
+// Returns everything after the first "#token=" in sessionUrl, or "" when the
+// marker is absent.
+function tokenOf(sessionUrl: string): string {
+  const marker = "#token=";
+  const markerAt = sessionUrl.indexOf(marker);
+  if (markerAt < 0) {
+    return "";
+  }
+  return sessionUrl.slice(markerAt + marker.length);
+}
+
+test.describe("staging desktop take-control — reconnect + lease renewal (core#2216)", () => {
+  // Shared staging context resolution — identical to staging-display.spec.ts:90-120.
+  // Reads the tenant URL, bearer token and optional org id from the
+  // environment, preferring the STAGING_DISPLAY_* variables and falling back
+  // to the STAGING_* ones. Throws when either the URL or the token is missing.
+  function resolveTenant() {
+    const env = process.env;
+    const tenantURL = env.STAGING_DISPLAY_TENANT_URL || env.STAGING_TENANT_URL;
+    const tenantToken = env.STAGING_DISPLAY_TENANT_TOKEN || env.STAGING_TENANT_TOKEN;
+    const orgID = env.STAGING_DISPLAY_ORG_ID || env.STAGING_ORG_ID;
+    if (tenantURL && tenantToken) {
+      return { tenantURL, tenantToken, orgID };
+    }
+    throw new Error(
+      "STAGING_DISPLAY_WORKSPACE_ID is set but no tenant URL/token is available " +
+        "for the reconnect/renewal gate. Set STAGING_DISPLAY_SLUG so staging-setup.ts " +
+        "resolves STAGING_DISPLAY_TENANT_URL / STAGING_DISPLAY_TENANT_TOKEN for the " +
+        "standing desktop org (or ensure the ephemeral STAGING_TENANT_* exports exist).",
+    );
+  }
+
+  // Before each test, set the tenant bearer token (and the org id header when
+  // one is configured) as extra HTTP headers on the browser context.
+  test.beforeEach(async ({ context }) => {
+    const { tenantToken, orgID } = resolveTenant();
+    await context.setExtraHTTPHeaders({
+      Authorization: `Bearer ${tenantToken}`,
+      ...(orgID ? { "X-Molecule-Org-Id": orgID } : {}),
+    });
+  });
+
+  test("reconnect re-acquires a FRESH token and the framebuffer resumes", async ({
+    page,
+  }) => { // Flow under test: acquire → connect → re-acquire (fresh token) → reconnect → frame.
+    const { tenantURL } = resolveTenant();
+    const workspaceId = DISPLAY_WS_ID as string;
+
+    // Sanity: workspace must be display-available, else the gate is meaningless.
+    const availResp = await page.request.get(
+      `${tenantURL}/workspaces/${workspaceId}/display`,
+    );
+    expect(availResp.status(), `GET /display for ${workspaceId} should be 200`).toBe(200);
+    const avail = await availResp.json();
+    expect(
+      avail.available,
+      `workspace ${workspaceId} is not display-available (reason=${avail.reason}).`,
+    ).toBe(true);
+
+    // 1. Initial acquire — the happy-path lease the user starts with.
+    const firstResp = await page.request.post(
+      `${tenantURL}/workspaces/${workspaceId}/display/control/acquire`,
+      { data: { controller: "user", ttl_seconds: DEFAULT_TTL_SECONDS } },
+    );
+    expect(
+      firstResp.status(),
+      `initial acquire should be 200; body: ${await firstResp.text()}`,
+    ).toBe(200);
+    const first = await firstResp.json();
+    expect(first.controller, "controller should be 'user'").toBe("user");
+    expect(typeof first.session_url, "acquire missing session_url").toBe("string");
+    const firstUrl: string = first.session_url;
+    expect(firstUrl, "session_url should carry #token=").toContain("#token=");
+    const firstToken = tokenOf(firstUrl); // tokenOf is defined outside this view; presumably extracts the #token= fragment
+    expect(firstToken.length, "first token should be non-empty").toBeGreaterThan(0);
+
+    // Anchor Origin to the tenant so the same-origin-canvas WS upgrade is accepted.
+    await page.goto(tenantURL, { waitUntil: "domcontentloaded" });
+
+    // 2. Establish the live WS on the FIRST token — proves the session is real.
+    const initial = await openDisplayWs(page, firstUrl);
+    expect(
+      initial.ok,
+      `initial connect failed at stage="${initial.stage}": ${initial.detail}` +
+        (initial.closeCode ? ` (close code ${initial.closeCode})` : ""),
+    ).toBe(true);
+    expect(initial.stage, `initial connect should reach 'frame'; got '${initial.stage}'`).toBe(
+      "frame",
+    );
+
+    // 3. The drop itself needs no extra step: openDisplayWs() already closed
+    //    its socket on finish(), so the live stream is gone here — exactly the
+    //    state DisplayTab's "disconnect" handler (DisplayTab.tsx:426-442) enters
+    //    before it calls connect(reacquire=true).
+
+    // 4. Reconnect path: mint a FRESH lease+token FIRST, the way
+    //    connect(reacquire=true) → reacquireSession() does (DisplayTab.tsx:397
+    //    / :83-99). This is a re-acquire by the SAME holder, so the server's
+    //    ON-CONFLICT upsert extends the lease and returns a new signed URL.
+    const reResp = await page.request.post(
+      `${tenantURL}/workspaces/${workspaceId}/display/control/acquire`,
+      { data: { controller: "user", ttl_seconds: DEFAULT_TTL_SECONDS } },
+    );
+    expect(
+      reResp.status(),
+      `reconnect re-acquire should be 200 (same holder extends, not 409); body: ${await reResp.text()}`,
+    ).toBe(200);
+    const re = await reResp.json();
+    expect(re.controller, "reconnect controller should still be 'user'").toBe("user");
+    expect(typeof re.session_url, "reconnect acquire missing session_url").toBe("string");
+    const reUrl: string = re.session_url;
+    const reToken = tokenOf(reUrl);
+    expect(reToken.length, "reconnect token should be non-empty").toBeGreaterThan(0);
+
+    // The reconnect token MUST be fresh — bound to the new expires_at. A
+    // reused token would mean the canvas fell back to a cached, soon-expiring
+    // URL, which is precisely the 401-on-reconnect bug core#2216 fixed. The
+    // signed token embeds expires_at.Unix() (workspace_display_control.go:390),
+    // so a later expiry => a different signature => a different token.
+    expect(
+      reToken,
+      "reconnect should mint a FRESH token (bound to the renewed expires_at), " +
+        "not reuse the original ~300s token — a reused token is the core#2216 401 bug.",
+    ).not.toBe(firstToken);
+    expect(
+      new Date(re.expires_at).getTime(),
+      "renewed expires_at should be >= the original (lease extended, not shrunk)",
+    ).toBeGreaterThanOrEqual(new Date(first.expires_at).getTime());
+
+    // 5. Reopen the WS on the FRESH token and assert the framebuffer RESUMES —
+    //    a real frame, not a dead 1006/403 session. This is the crux: the
+    //    reconnect produces a LIVE stream, not a stale-token rejection.
+    const reconnected = await openDisplayWs(page, reUrl);
+    expect(
+      reconnected.ok,
+      `RECONNECT failed at stage="${reconnected.stage}": ${reconnected.detail}` +
+        (reconnected.closeCode ? ` (close code ${reconnected.closeCode})` : "") +
+        " — a 1006/403 here means the fresh-token reconnect did NOT re-establish " +
+        "the proxy chain (edge → ws-proxy → EIC → websockify → x11vnc).",
+    ).toBe(true);
+    expect(
+      reconnected.stage,
+      `reconnect should reach 'frame' (framebuffer resumed); got '${reconnected.stage}' (${reconnected.detail})`,
+    ).toBe("frame");
+    expect(
+      reconnected.frameBytes ?? 0, // a missing frameBytes counts as 0 and fails the > 0 check
+      `resumed framebuffer message should be non-empty (kind=${reconnected.frameKind})`,
+    ).toBeGreaterThan(0);
+  });
+
+  test("renewal pushes the lease past the original 300s window (no kick at ~5min)", async ({
+    page,
+  }) => {
+    // Deterministic proxy for the renewal timer: issue the same re-acquire POST
+    // the timer would fire and assert its EFFECT on the lease, without waiting
+    // out real wall-clock time.
+    const { tenantURL } = resolveTenant();
+    const workspaceId = DISPLAY_WS_ID as string;
+
+    // 1. Acquire the initial 300s lease.
+    const firstResp = await page.request.post(
+      `${tenantURL}/workspaces/${workspaceId}/display/control/acquire`,
+      { data: { controller: "user", ttl_seconds: DEFAULT_TTL_SECONDS } },
+    );
+    expect(
+      firstResp.status(),
+      `initial acquire should be 200; body: ${await firstResp.text()}`,
+    ).toBe(200);
+    const first = await firstResp.json();
+    const firstExpiry = new Date(first.expires_at).getTime();
+    expect(Number.isFinite(firstExpiry), "first expires_at should parse").toBe(true);
+
+    // The original lease's hard ceiling: when the un-renewed token/lock dies.
+    const originalLeaseDeadlineMs = firstExpiry;
+
+    // 2. Fire the renewal CALL the 120s timer fires (DisplayTab.tsx:107-109 →
+    //    reacquireSession → this same POST). We don't sleep RENEWAL_INTERVAL_MS
+    //    of wall-clock; we drive the observable call the timer would make and
+    //    assert its EFFECT on the lease. RENEWAL_INTERVAL_MS is asserted to sit
+    //    safely inside the TTL so the renew always lands before expiry — if a
+    //    future change widened the interval past the TTL, this guard fails.
+    expect(
+      RENEWAL_INTERVAL_MS,
+      "renewal interval must be strictly inside the lease TTL, else the lease " +
+        "expires before the timer renews it (user gets kicked).",
+    ).toBeLessThan(DEFAULT_TTL_SECONDS * 1000);
+
+    const renewResp = await page.request.post(
+      `${tenantURL}/workspaces/${workspaceId}/display/control/acquire`,
+      { data: { controller: "user", ttl_seconds: DEFAULT_TTL_SECONDS } },
+    );
+    expect(
+      renewResp.status(),
+      `renewal re-acquire should be 200 (same holder extends); body: ${await renewResp.text()}`,
+    ).toBe(200);
+    const renew = await renewResp.json();
+    const renewedExpiry = new Date(renew.expires_at).getTime();
+    // Mirror the guard on the first expiry: an unparseable value yields NaN,
+    // which would otherwise fail the comparison below with a misleading
+    // "did NOT extend" message.
+    expect(Number.isFinite(renewedExpiry), "renewed expires_at should parse").toBe(true);
+
+    // 3. The renewal MUST push expires_at strictly PAST the original lease
+    //    window — that is the whole point of core#2216's renewal timer: a
+    //    fresh 300s starting now, so the lease outlives the original ~300s
+    //    deadline and the user is not kicked every ~5 minutes. (now()+300s,
+    //    fired before the original 300s elapsed, is strictly later than the
+    //    original now()+300s.)
+    expect(
+      renewedExpiry,
+      "renewal should extend the lease strictly past the original 300s deadline " +
+        `(original=${first.expires_at}, renewed=${renew.expires_at}). Equal-or-earlier ` +
+        "means the renewal did NOT extend — the 120s timer would not save the session.",
+    ).toBeGreaterThan(originalLeaseDeadlineMs);
+
+    // 4. Confirm the lock is still LIVE after renewal — GET /display/control
+    //    only returns a holder when expires_at > now() (loadActiveDisplayControl,
+    //    workspace_display_control.go:280). A held controller here proves the
+    //    renewed lease is active, not expired.
+    const ctrlResp = await page.request.get(
+      `${tenantURL}/workspaces/${workspaceId}/display/control`,
+    );
+    expect(ctrlResp.status(), "GET /display/control should be 200").toBe(200);
+    const ctrl = await ctrlResp.json();
+    expect(
+      ctrl.controller,
+      "after renewal the lock should still report a live holder (not 'none')",
+    ).toBe("user");
+    // The matcher checks "later than the original deadline", not equality with
+    // the renewed expiry, so the message states exactly that.
+    expect(
+      new Date(ctrl.expires_at).getTime(),
+      "the live lock's expires_at should be past the original deadline (the " +
+        "lock reflects the renewed lease, not the original).",
+    ).toBeGreaterThan(originalLeaseDeadlineMs);
+
+    // TODO(core#2332, CTO cost item): the assertions above prove the renewal
+    // CALL extends the lease past the original window — the deterministic proxy
+    // for "the 120s interval keeps the lease alive past 300s." To additionally
+    // prove the lease survives a FULL real-time 300s+ idle WS (the literal
+    // wall-clock claim), a long-lived test would hold one WS open >300s while
+    // the 120s timer renews underneath and assert the SAME socket never 1006s.
+    // That needs >5 min of standing-desktop wall-clock per run and is gated on
+    // the standing desktop EC2 being funded; it is NOT exercised here. Promote
+    // either form to a REQUIRED context only on CTO sign-off (cost + cadence).
+  });
+});
@@ -93,12 +93,12 @@ For "do we have any backend?", use `HasProvisioner()`, never bare `h.provisioner
 3. **Restart divergence on runtime changes.** Docker re-reads `/configs/config.yaml` from the container before stop, so a changed `runtime:` survives a restart even if the DB isn't synced. EC2 trusts the DB only. If you change the runtime via the Config tab and the handler races the restart, Docker will land on the new runtime, EC2 will land on the old one. **Fix path:** make the Config-tab save explicitly flush to DB before kicking off a restart, not deferred.
 4. **Console-output asymmetry.** Users debugging a stuck workspace on Docker see `docker logs`; on EC2 they see `GetConsoleOutput`. The two outputs look nothing alike. **Fix path:** expose a unified `GET /workspaces/:id/boot-log` that proxies to whichever backend serves the data. Already partly there via `cp_provisioner.Console`.
 5. **Template script drift.** `install.sh` and `start.sh` in each template repo do the same high-level work (install hermes-agent, write .env, write config.yaml, start gateway) but must be kept byte-level consistent on the provider-key forwarding block. Easy to forget. Enforced now by `tools/check-template-parity.sh` (see below) — run it in each template repo's CI.
-6. **Both backends panic when underlying client is nil.** Discovered by the contract-test scaffold landing in this PR: `Provisioner.{Stop,IsRunning}` nil-dereferences the Docker client, and `CPProvisioner.{Stop,IsRunning}` nil-dereferences `httpClient`. The real code always sets these, so this is theoretical in prod — but it means the contract runner can't execute scenarios against zero-value backends. **Fix path:** guard each method with `if p.docker == nil { return false, errNoBackend }` (and equivalent for CP), then flip the `t.Skip` in the contract tests to `t.Run`.
+6. **Both backends panic when underlying client is nil.** ✅ **Resolved** (`fix/provisioner-nil-guards-1813`). `Provisioner.{Stop,IsRunning}` and `CPProvisioner.{Stop,IsRunning}` now guard against nil clients with `ErrNoBackend`, so the contract-test runner executes scenarios against zero-valued backends without panic.

 ## Enforcement

 - **`tools/check-template-parity.sh`** (this repo) — ensures `install.sh` and `start.sh` in a template repo forward identical sets of provider keys. Wire into each template repo's CI as `bash $MONOREPO/tools/check-template-parity.sh install.sh start.sh`.
- **Contract tests** (stub) — `workspace-server/internal/provisioner/backend_contract_test.go` defines the behaviors every `provisioner.Provisioner` implementation must satisfy. Fails compile when a method drifts between `Docker` and `CPProvisioner`. Scenario-level runs are `t.Skip`'d today pending drift risk #6 (see above) — compile-time assertions still catch method drift.
+- **Contract tests** — `workspace-server/internal/provisioner/backend_contract_test.go` defines the behaviors every `provisioner.Provisioner` implementation must satisfy. Fails compile when a method drifts between `Docker` and `CPProvisioner`. Scenario-level runs execute against zero-valued backends since drift risk #6 was resolved (`fix/provisioner-nil-guards-1813`).
 - **Source-level dispatcher pins** — `workspace_provision_auto_test.go` enforces the SoT pattern documented above:
  - `TestNoCallSiteCallsDirectProvisionerExceptAuto` — no handler calls `.provisionWorkspace(` or `.provisionWorkspaceCP(` directly outside the dispatcher's allowlist.
  - `TestNoCallSiteCallsBareStop` — no handler calls `.provisioner.Stop(` or `.cpProv.Stop(` directly outside the dispatcher's allowlist (strips Go comments before substring match so archaeology in code comments doesn't trip the gate).
@@ -0,0 +1,225 @@
+# Fail-closed BYOK billing
+
+**Status:** Proposal — CTO (王泓铭)-refined 2026-06-05.
+Owners: hongming (CTO)
+Base: molecule-core main @ `1955fdd0` (2026-06-04)
+
+This RFC formalizes the **fail-closed BYOK billing** model: the contract that a
+workspace which intends to run an LLM on the tenant's own credential
+(bring-your-own-key) must be **rejected at the create API** if that credential is
+missing or dead — loudly, comprehensively, and synchronously — never created and
+then wedged at provision time, and never silently fallen through to a
+platform-billed default.
+
+It writes down the four hard requirements, audits the current implementation
+against them (two are met today, one partial, one missing), and specifies the
+two gaps to close. The derive-from-model SSOT and the platform proxy boundary are
+**non-goals** here — this RFC is only about closing the credential-validation
+holes around an already-correct billing-mode resolver.
+
+## TL;DR
+
+```
+create API request (runtime, model[, billing override])
+        │
+        ▼
+  derive provider/mode from providers.yaml registry SSOT   ── Req1 MET today
+  (explicit operator-override column = escape hatch)
+        │
+        ├─ mode == platform_managed ──────────────► create OK (proxy bills)
+        │
+        └─ mode == BYOK
+              │
+              ├─ GAP A: credential PRESENT for the derived provider?
+              │         (no → 422 MISSING_BYOK_CREDENTIAL, synchronous, loud)
+              │
+              ├─ GAP B: credential VALID? (cheap authed provider call;
+              │         401/403 → 422 INVALID_BYOK_CREDENTIAL, loud)
+              │
+              ▼
+        create OK → provision (re-checks presence as defense-in-depth)
+```
+
+## The model — four hard requirements
+
+1. **Explicit selection drives the adapter.** Provider/mode is *selected*, never
+   guessed. Today the selection is **derived deterministically** from the chosen
+   model via the `providers.yaml` registry SSOT (`DeriveProvider(runtime, model,
+   availableAuthEnv)`); the per-workspace operator-override column is the explicit
+   escape hatch with top precedence. There is no heuristic fallback to a vendor.
+
+2. **BYOK requires the credential, validated AT CREATION, fail-closed.** A
+   BYOK workspace with no usable credential for the derived provider must be
+   **REJECTED at the create API** with a clear, comprehensive error (which
+   credential / env var, which provider, what to do). It must NOT be created
+   (201) and then wedged late at provision.
+
+3. **Preflight-validate the credential is VALID, not just present.** Presence is
+   necessary but not sufficient: a present-but-dead token (revoked, expired,
+   wrong-scope) must be caught by a *cheap authenticated provider call* (a
+   models-list or a 1-token completion) and the workspace rejected on 401/403
+   before it goes live.
+
+4. **Fail LOUD, never silent.** Any missing / invalid / rejected credential
+   errors loudly: comprehensive server logs (provider, env var, code, workspace)
+   plus a user-visible structured reason. It must NEVER silently fall through to
+   `platform_managed` or to any default that bills the platform for what the
+   tenant declared as BYOK.
+
+## Current-state audit
+
+References are `path:line` at base `1955fdd0`. Workspace-server paths are relative
+to `workspace-server/`; the proxy/charge layer lives in the controlplane repo.
+
+### Req1 — Explicit selection drives the adapter — **MET**
+
+- `internal/handlers/llm_billing_mode.go:197-264` — `ResolveLLMBillingModeDerived`:
+  precedence 1 = explicit workspace override column; precedence 2 = derive the
+  provider from `(runtime, model)` via the embedded `providers.yaml` registry
+  (`manifest.DeriveProvider`). A specific non-platform vendor → `byok`; a platform
+  provider → `platform_managed`. No guessing.
+- `internal/handlers/workspace.go:420-503` — create-time validation already
+  hard-rejects (422) an unregistered `(runtime, model)` pair
+  (`UNREGISTERED_MODEL_FOR_RUNTIME`) and a model whose derived provider is absent
+  from the catalog (`DERIVED_PROVIDER_NOT_IN_REGISTRY`), and requires an explicit
+  model (`MODEL_REQUIRED`). The selection input is validated against the SSOT at
+  the boundary.
+
+### Req4 — Fail loud, never silent — **MET**
+
+- Default-closed on ambiguity: `internal/handlers/llm_billing_mode.go:26-39` and
+  `:217-252` — every ambiguous / error / no-id path resolves to
+  `platform_managed` *with the error surfaced* (logged + returned on the
+  resolution struct), never a silent BYOK→platform flip that bills the tenant
+  by surprise.
+- Proxy is platform-managed-only: controlplane `internal/handlers/llm_proxy.go:94,
+  158,223,664-748` — the platform LLM proxy only serves platform-managed traffic;
+  BYOK never routes through it.
+- Charge layer never bills the platform for BYOK: controlplane
+  `internal/credits/llm_billing.go:156-233` — BYOK usage is not charged to the
+  platform ledger.
+
+### Req2 — Credential validated at creation, fail-closed — **PARTIAL**
+
+- The fail-closed BYOK check EXISTS but only at **provision** time:
+  `internal/handlers/workspace_provision_shared.go:225-232` — if
+  `ResolvedMode == BYOK && !HasUsableLLMCred`, the provisioner aborts with
+  `MISSING_BYOK_CREDENTIAL` (molecule-core#1994).
+- Gap: a credential-less BYOK **create** returns **201** and only fails later at
+  provision. That violates Req2's "rejected at the create API, not
+  created-then-wedged" — the user gets a workspace row and a delayed, async
+  failure instead of a synchronous 4xx.
+
+### Req3 — Credential is VALID, not just present — **MISSING**
+
+- `HasUsableLLMCred` is **presence-only**:
+  `internal/handlers/workspace_provision.go:1138-1145` —
+  `hasAnyPlatformManagedLLMKey` returns true if any auth-env key is a non-empty
+  string. There is **no liveness probe anywhere** — a present-but-revoked token
+  passes every gate and the workspace goes live, then wedges at first real LLM
+  call (the failure Req3 exists to pull forward).
+
+## Scope of work — the two gaps
+
+### Gap A (Req2): BYOK credential-presence check at the CREATE boundary
+
+Add a synchronous presence check inside the create handler
+(`(h *WorkspaceHandler) Create`, `internal/handlers/workspace.go:242`), after
+billing-mode resolution and the existing registry validation, **in addition to**
+the provision-time check (keep that as defense-in-depth — do not remove it).
+
+- When the resolved mode is `byok`, resolve the derived provider's accepted auth
+  env-var names from the `providers.yaml` registry (`auth_env` list, e.g.
+  `[ANTHROPIC_API_KEY, ANTHROPIC_AUTH_TOKEN]` for `anthropic-api`) and confirm at
+  least one is present (non-empty) for the workspace at any in-scope secret level.
+- On absence: **422** with a structured body:
+  `code: MISSING_BYOK_CREDENTIAL`, plus `provider`, `missing_env` (the candidate
+  env-var names), `billing_mode: byok`, and a human `error` that names the
+  provider, the missing credential, and the remediation ("set
+  `ANTHROPIC_API_KEY` as a workspace or org secret, then retry create"). Reuse the
+  existing `formatMissingBYOKCredentialError` wording where possible so create and
+  provision speak with one voice.
+- Log loudly with the same `MISSING_BYOK_CREDENTIAL` code the provisioner uses, so
+  the two checkpoints are greppable as one class.
+
+### Gap B (Req3): credential LIVENESS preflight
+
+Add a minimal authenticated probe per provider, driven entirely by the
+`providers.yaml` SSOT — no hardcoded endpoints.
+
+- Derive the probe target from the registry entry: `protocol`/`auth_mode`,
+  `base_url_template` or `base_url_anthropic`, and the `auth_env` /
+  `auth_token_env` that carries the secret. Make the cheapest authenticated call
+  the surface offers (models-list where available, else a 1-token completion).
+- Fail-closed on **401/403**: reject the create with **422**
+  `code: INVALID_BYOK_CREDENTIAL` (provider, env var, upstream status, remediation
+  "the credential was found but the provider rejected it — rotate the key").
+- **Recommendation: probe at create** for fast feedback, with a **provision-time
+  re-check** (the credential can be revoked between create and provision; the
+  provisioner is the last gate before the workspace is live). The provision
+  re-check upgrades `workspace_provision_shared.go:225-232` from presence-only to
+  presence-and-liveness for BYOK.
+- The probe **must be cheap and time-bounded** (see Risks).
+- **OAuth-provider nuance:** registry entries with `auth_mode: oauth` and
+  `base_url: null` (e.g. `anthropic-oauth`, codex chatgpt-subscription) have no
+  HTTP surface the platform dials — the CLI talks to the vendor directly. For
+  these, the liveness probe has no cheap server-side equivalent; scope Gap B's
+  *active* probe to keyed providers with a non-null base URL and fall back to the
+  presence check (Gap A) for OAuth modes. Do not block on inventing an OAuth
+  liveness call in this RFC.
+
+## Non-goals
+
+- **Not** changing the derive-from-model SSOT. Selection stays
+  `providers.yaml` → `DeriveProvider`; the operator-override column stays the only
+  escape hatch. No new heuristics.
+- **Not** routing BYOK through the platform proxy. The proxy stays
+  platform-managed-only; this RFC adds validation around BYOK, it does not move
+  BYOK onto a platform code path.
+- **Not** re-billing or changing the charge layer. BYOK stays off the platform
+  ledger.
+- **Not** adding an OAuth-subscription liveness call (deferred — see Gap B
+  nuance).
+
+## Risks
+
+- **Preflight latency on create.** An authenticated provider round-trip adds
+  hundreds of ms to a few seconds to create. Mitigate with a hard, short timeout
+  (target ≤ ~3s) and a clear, distinct error on timeout — a probe timeout must
+  NOT be treated as "valid" (fail-closed) but must also be distinguishable from a
+  real 401/403 so transient upstream blips are diagnosable. Consider whether a
+  probe timeout should 422 (strict fail-closed) or surface a soft warning and
+  defer to the provision-time re-check; default to fail-closed at create for the
+  loud-feedback goal, with the provision re-check as the safety net.
+- **Provider rate-limits.** A models-list / 1-token probe consumes the tenant's
+  quota and can be rate-limited (429). A 429 is NOT an auth failure — treat it as
+  inconclusive (do not reject as `INVALID_BYOK_CREDENTIAL`), log it, and defer to
+  the presence check + provision-time re-check rather than blocking create on a
+  429.
+- **Provider-side flakiness.** 5xx from the provider is inconclusive, same
+  handling as 429 — never silently pass, never hard-reject on a 5xx; log and
+  defer.
+
+## Test plan
+
+1. **Gap A — create-time presence (unit + handler):**
+   - BYOK-deriving `(runtime, model)` with NO credential in any scope → **422
+     `MISSING_BYOK_CREDENTIAL`**, body names provider + missing env; no workspace
+     row created.
+   - Same with the credential present → create proceeds (mode `byok`).
+   - `platform_managed`-deriving model with no tenant key → create proceeds
+     (unchanged; proxy path).
+2. **Gap B — liveness (unit with a stubbed provider HTTP surface):**
+   - Present-but-401/403 key → **422 `INVALID_BYOK_CREDENTIAL`**.
+   - Valid key → create proceeds.
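+   - 429 / 5xx → inconclusive: create NOT rejected as invalid; logged;
+     provision re-check still runs.
+   - Timeout → fail-closed by default (see Risks): create rejected with a
+     distinct timeout error, never `INVALID_BYOK_CREDENTIAL`; logged.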
+   - `auth_mode: oauth` + `base_url: null` provider → active probe skipped,
+     presence check governs.
+3. **Provision defense-in-depth (existing + extended):**
+   - Credential revoked between create and provision → provisioner aborts
+     (presence today; liveness re-check after Gap B).
+   - Existing `MISSING_BYOK_CREDENTIAL` provision-abort test stays green.
+4. **Req4 regression guard:** assert no path flips a BYOK selection to
+   `platform_managed` silently — an absent/dead BYOK credential always produces a
+   loud 4xx with a code, never a 201 that bills the platform.
@@ -28,7 +28,9 @@
    {"name": "claude-code-default", "repo": "molecule-ai/molecule-ai-workspace-template-claude-code", "ref": "main"},
    {"name": "hermes", "repo": "molecule-ai/molecule-ai-workspace-template-hermes", "ref": "main"},
    {"name": "openclaw", "repo": "molecule-ai/molecule-ai-workspace-template-openclaw", "ref": "main"},
-    {"name": "codex", "repo": "molecule-ai/molecule-ai-workspace-template-codex", "ref": "main"}
+    {"name": "codex", "repo": "molecule-ai/molecule-ai-workspace-template-codex", "ref": "main"},
+    {"name": "google-adk", "repo": "molecule-ai/molecule-ai-workspace-template-google-adk", "ref": "main"},
+    {"name": "seo-agent", "repo": "molecule-ai/molecule-ai-workspace-template-seo-agent", "ref": "main"}
  ],
  "org_templates": [
    {"name": "molecule-dev", "repo": "molecule-ai/molecule-ai-org-template-molecule-dev", "ref": "main"},
@@ -8,26 +8,39 @@ against the latest `main`.

 ## Queue Contract

-Add the `merge-queue` label to an open PR when it is ready to merge.
+**Auto-discovery (opt-OUT, default).** You do NOT need to label a PR. The bot
+auto-discovers every open same-repo PR and merges any that meets the bar. The
+`merge-queue` label is now optional metadata, not a gate. This removed the
+historical autonomy gap: agent Gitea tokens lack `write:issue` (labels are
+issue-scoped), so agents could never self-label and ready PRs stalled.
+
+To keep a PR OUT of autonomous merging, add an opt-OUT label:
+`merge-queue-hold`, `do-not-auto-merge`, or `wip`. Draft PRs are also skipped.

 The bot processes one PR per tick:

-1. Confirms `main` is green.
-2. Selects the oldest open PR carrying `merge-queue`.
-3. Skips PRs with `merge-queue-hold`.
-4. Rejects fork PRs because the queue may only update same-repo branches.
-5. If the PR head does not contain current `main`, calls Gitea's
+1. Confirms `main`'s branch-protection-required push contexts are green.
+2. Selects the oldest open same-repo PR that is NOT opt-out-labeled and NOT a
+   draft (auto-discovery). With `AUTO_DISCOVER=0` it falls back to legacy
+   opt-IN: only PRs carrying `merge-queue` are considered.
+3. Rejects fork PRs because the queue may only update same-repo branches.
+4. If the PR head does not contain current `main`, calls Gitea's
   `/pulls/{n}/update?style=merge` endpoint and waits for CI on the new head.
-6. Merges only after the current PR head has required contexts green:
-   - `CI / all-required (pull_request)`
-   - `sop-checklist / all-items-acked (pull_request)`
+5. Merges only when, on the PR's CURRENT head sha:
+   - `>= required_approvals` distinct genuine official `APPROVED` reviews from
+     the recognised reviewer set (read from branch protection; default 2),
+   - no open official `REQUEST_CHANGES`,
+   - every branch-protection-required status context is green, and
+   - the PR is `mergeable` (Gitea returns `True`; `None`/`False` = wait).

-The workflow is serialized with `concurrency`, so two queued PRs cannot be
+The merge bar is unchanged by auto-discovery — only WHICH PRs are considered
+changes. The workflow is serialized with `concurrency`, so two PRs cannot be
 merged against the same observed `main`.

 ## Operator Commands

-Queue a PR:
+Queue a PR (optional — auto-discovery already considers every ready PR; the
+label is just visible metadata):

 ```bash
 curl -fsS -X POST \
@@ -37,7 +50,8 @@ curl -fsS -X POST \
  -d '{"labels":["merge-queue"]}'
 ```

-Temporarily hold a queued PR:
+Keep a PR OUT of autonomous merging (opt-OUT — use `merge-queue-hold`,
+`do-not-auto-merge`, or `wip`):

 ```bash
 curl -fsS -X POST \
@@ -56,9 +70,11 @@ REPO=molecule-ai/molecule-core \
 WATCH_BRANCH=main \
 QUEUE_LABEL=merge-queue \
 HOLD_LABEL=merge-queue-hold \
+AUTO_DISCOVER=1 \
+OPT_OUT_LABELS=do-not-auto-merge,wip \
+REVIEWER_SET=agent-reviewer,agent-researcher,agent-reviewer-cr2 \
 UPDATE_STYLE=merge \
-REQUIRED_CONTEXTS='CI / all-required (pull_request),sop-checklist / all-items-acked (pull_request)' \
-python3 .gitea/scripts/gitea-merge-queue.py
+python3 .gitea/scripts/gitea-merge-queue.py --dry-run
 ```

 Dry run:
@@ -50,8 +50,22 @@ check_category() {
        repo=$(echo "$MANIFEST_JSON" | jq -r ".${category}[$i].repo")
        TOTAL=$((TOTAL + 1))

-        # Check repo existence via Gitea API (public endpoint, no auth needed)
-        http_code=$(curl -sS -o /dev/null -w '%{http_code}' --max-time 10 "${GITEA_API}/${repo}" 2>/dev/null || true)
+        # Check repo existence via Gitea API. Many manifest repos are PRIVATE
+        # (e.g. the workspace templates), so an *unauthenticated* GET returns
+        # 404 even when the repo exists — indistinguishable from a genuinely
+        # missing repo. We therefore authenticate with the same token
+        # clone-manifest.sh uses (MOLECULE_GITEA_TOKEN). A 404 *with* a valid
+        # token still means the repo is truly missing, which is what we want
+        # to catch. If the token is unset (local dev), fall back to an
+        # unauthenticated request — private repos will then 404, so run the
+        # check in CI where the token is present.
+        if [ -n "${MOLECULE_GITEA_TOKEN:-}" ]; then
+            http_code=$(curl -sS -o /dev/null -w '%{http_code}' --max-time 10 \
+                -H "Authorization: token ${MOLECULE_GITEA_TOKEN}" \
+                "${GITEA_API}/${repo}" 2>/dev/null || true)
+        else
+            http_code=$(curl -sS -o /dev/null -w '%{http_code}' --max-time 10 "${GITEA_API}/${repo}" 2>/dev/null || true)
+        fi

        if [ "$http_code" != "200" ]; then
            echo "::error::manifest.json ${category} entry '${name}' → repo '${repo}' returned HTTP ${http_code} (expected 200). Delete the manifest entry BEFORE deleting the repo." >&2
@@ -0,0 +1,468 @@
+#!/usr/bin/env bash
+# GATING E2E for the social-channels outbound + discover + data-prune paths
+# (core#2332 P1.10). Closes three coverage gaps that were previously
+# unit-mocked at best, so a regression in any of them goes RED in the
+# required `E2E API Smoke Test` lane instead of slipping through:
+#
+#  (1) Channel SEND end-to-end. Every adapter's SendMessage was only ever
+#      asserted by unit tests that reconstruct the payload by hand and POST
+#      it themselves (see internal/channels/lark_test.go's "we can't change
+#      the prefix const" comment) — nothing proved that a message submitted
+#      through the LIVE platform API actually serializes and POSTs to a
+#      provider endpoint. Here we stand up a local mock-upstream, point a
+#      Slack Incoming-Webhook channel at it, send via
+#      POST /channels/:id/send, and assert the MOCK RECEIVED the correctly
+#      serialized {"text":"..."} body. Real serialize+POST, real HTTP stack,
+#      no real Slack account.
+#
+#  (2) Channel DISCOVER (POST /channels/discover). Had no test at all. We
+#      point the Telegram discover path at a mock Bot API that serves
+#      getMe + getUpdates and assert the discovered bot username + chat
+#      round-trip back through the handler.
+#
+#  (3) Workspace data-prune (RFC #734). The user-requested permanent delete
+#      with ?purge=true prunes a workspace's durable child data (channels,
+#      secrets, config, …). We create prunable data on a target workspace
+#      AND a sibling, purge the target, then assert the target's child rows
+#      are GONE while the sibling's SURVIVE.
+#
+# ── Test seam (production-inert) ────────────────────────────────────────
+# Adapters pin their outbound host to the real vendor (hooks.slack.com /
+# api.telegram.org). Two env-gated overrides — set ONLY by this lane, never
+# in any prod/staging deploy — let the live send/discover path target a
+# local mock so the round-trip is provable in CI:
+#
+#   MOLECULE_CHANNELS_TEST_WEBHOOK_BASE       (Slack webhook accept-prefix)
+#   MOLECULE_CHANNELS_TEST_TELEGRAM_API_BASE  (Telegram Bot API base)
+#
+# These must be present in the PLATFORM process env (the workflow exports
+# them via $GITHUB_ENV before "Start platform"), pointing at the fixed
+# loopback ports this script binds its mocks on. If they are absent the
+# platform rejects the mock URLs; under E2E_REQUIRE_LIVE=1 that is a hard
+# RED (the seam regressed / the workflow wiring broke), otherwise a LOUD
+# SKIP for ad-hoc local runs that didn't export them.
+#
+# NEVER fail-open: a missing assertion target fails the script.
+#
+# Required env (defaults shown):
+#   BASE                       http://127.0.0.1:8080
+#   MOLECULE_ADMIN_TOKEN       (admin bearer; matches the platform's ADMIN_TOKEN)
+#   E2E_CHANNELS_WEBHOOK_PORT  18099   (mock Slack webhook upstream)
+#   E2E_CHANNELS_TELEGRAM_PORT 18098   (mock Telegram Bot API upstream)
+#   E2E_REQUIRE_LIVE           0        (1 = seam-absent is RED, not skip)
+
+set -uo pipefail
+
+# shellcheck disable=SC1091
+source "$(dirname "$0")/_lib.sh"   # sets BASE default + admin/token helpers
+
+WEBHOOK_PORT="${E2E_CHANNELS_WEBHOOK_PORT:-18099}"
+TELEGRAM_PORT="${E2E_CHANNELS_TELEGRAM_PORT:-18098}"
+REQUIRE_LIVE="${E2E_REQUIRE_LIVE:-0}"
+
+# The base prefixes the PLATFORM must have been started with. We assert the
+# adapter accepted a URL under these — proving the platform's env matches.
+WEBHOOK_BASE="http://127.0.0.1:${WEBHOOK_PORT}/"
+TELEGRAM_BASE="http://127.0.0.1:${TELEGRAM_PORT}"
+
+PASS=0
+FAIL=0
+WORK_DIR="$(mktemp -d)"
+WS_TARGET=""
+WS_SIBLING=""
+WS_TARGET_TOK=""
+WS_SIBLING_TOK=""
+MOCK_PID=""
+
+ADMIN_BEARER="${MOLECULE_ADMIN_TOKEN:-${ADMIN_TOKEN:-}}"
+ADMIN_AUTH=()
+[ -n "$ADMIN_BEARER" ] && ADMIN_AUTH=(-H "Authorization: Bearer $ADMIN_BEARER")
+
+pass() {
+  # Report one passing assertion ($1 = label) and bump the PASS tally.
+  echo "PASS: $1"
+  PASS=$((PASS + 1))
+}
+fail() {
+  # Report one failing assertion ($1 = label, optional $2 = detail line)
+  # and bump the FAIL tally.
+  echo "FAIL: $1"
+  if [ -n "${2:-}" ]; then
+    echo "  $2"
+  fi
+  FAIL=$((FAIL + 1))
+}
+
+# loud_skip records a SKIP and exits according to E2E_REQUIRE_LIVE. NEVER
+# silently passes — it either hard-fails (require-live) or exits with a
+# loud banner (ad-hoc local). Mirrors the require-live gate pattern used by
+# test_priority_runtimes_e2e.sh.
+#
+# Args:   $1  human-readable reason the test seam is unavailable.
+# Exits:  1 when E2E_REQUIRE_LIVE=1, or when assertions recorded before the
+#         skip already failed (a skip must never mask a recorded FAIL);
+#         0 otherwise. cleanup always runs before exiting.
+loud_skip() {
+  local reason="$1"
+  echo
+  echo "============================================================"
+  if [ "$REQUIRE_LIVE" = "1" ]; then
+    echo "E2E_REQUIRE_LIVE=1 but channels e2e seam is unavailable:"
+    echo "  $reason"
+    echo "This is a HARD FAILURE — the platform was not started with the"
+    echo "channels test seam env (MOLECULE_CHANNELS_TEST_WEBHOOK_BASE /"
+    echo "MOLECULE_CHANNELS_TEST_TELEGRAM_API_BASE) on the fixed loopback"
+    echo "ports, or the seam regressed. Fix the workflow wiring or the seam."
+    echo "============================================================"
+    cleanup
+    exit 1
+  fi
+  echo "SKIP (loud): $reason"
+  echo "Set MOLECULE_CHANNELS_TEST_WEBHOOK_BASE=$WEBHOOK_BASE and"
+  echo "MOLECULE_CHANNELS_TEST_TELEGRAM_API_BASE=$TELEGRAM_BASE in the"
+  echo "PLATFORM env before starting it, then re-run. (CI sets these.)"
+  echo "============================================================"
+  cleanup
+  # A skip reached after earlier sections already failed must stay red:
+  # exiting 0 here would hide those failures from the caller.
+  if [ "$FAIL" -ne 0 ]; then
+    echo "SKIP downgraded to FAILURE: $FAIL assertion(s) failed before the skip." >&2
+    exit 1
+  fi
+  exit 0
+}
+
+# cleanup tears down everything this script created: the mock upstream
+# process, any workspaces still recorded in WS_TARGET / WS_SIBLING, and the
+# scratch WORK_DIR. It is best-effort and tolerates being called more than
+# once (it is invoked explicitly before early exits AND registered as the
+# EXIT/INT/TERM trap).
+cleanup() {
+  set +e
+  if [ -n "$MOCK_PID" ]; then
+    kill "$MOCK_PID" 2>/dev/null
+    wait "$MOCK_PID" 2>/dev/null
+    MOCK_PID=""
+  fi
+  # Hard-purge any workspaces we created so repeat runs are deterministic.
+  # X-Confirm-Name must equal the workspace name, and the workspaces are
+  # created with a "-$$" PID suffix, so the confirm name carries it too.
+  local pair wid tok name
+  local -a auth
+  for pair in "$WS_TARGET|$WS_TARGET_TOK|e2e-chan-target-$$" \
+              "$WS_SIBLING|$WS_SIBLING_TOK|e2e-chan-sibling-$$"; do
+    wid="${pair%%|*}"; pair="${pair#*|}"
+    tok="${pair%%|*}"; name="${pair#*|}"
+    [ -z "$wid" ] && continue
+    # DELETE /workspaces/:id is admin-gated, so prefer the admin bearer and
+    # fall back to the workspace token only when no admin bearer exists.
+    auth=()
+    if [ -n "$ADMIN_BEARER" ]; then
+      auth=(-H "Authorization: Bearer $ADMIN_BEARER")
+    elif [ -n "$tok" ]; then
+      auth=(-H "Authorization: Bearer $tok")
+    fi
+    # ${auth[@]+...} keeps an empty array from tripping `set -u` on older bash.
+    curl -s -X DELETE "$BASE/workspaces/$wid?confirm=true&purge=true" \
+      -H "X-Confirm-Name: $name" ${auth[@]+"${auth[@]}"} >/dev/null 2>&1
+  done
+  rm -rf "$WORK_DIR" 2>/dev/null
+}
+trap cleanup EXIT INT TERM
+
+# ── mock upstream ───────────────────────────────────────────────────────
+# One Python process serves BOTH mocks (different ports). It records the
+# Slack webhook request body to $WORK_DIR/slack_body.json and answers the
+# Telegram getMe/getUpdates calls with a deterministic bot+chat fixture.
+start_mock() {
+  # Writes the mock server program into WORK_DIR, launches it in the
+  # background (recording its PID in MOCK_PID so cleanup can kill it), then
+  # polls both mock ports until they answer. Exits 2 if they never come up.
+  cat > "$WORK_DIR/mock.py" <<'PY'
+import json
+import os
+import sys
+import threading
+from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
+
+WORK_DIR = os.environ["MOCK_WORK_DIR"]
+WEBHOOK_PORT = int(os.environ["MOCK_WEBHOOK_PORT"])
+TELEGRAM_PORT = int(os.environ["MOCK_TELEGRAM_PORT"])
+
+BOT_USERNAME = "e2e_mock_bot"
+CHAT_ID = -1009876543210
+CHAT_NAME = "E2E Mock Group"
+
+
+class SlackHandler(BaseHTTPRequestHandler):
+    def log_message(self, *a):  # silence
+        pass
+
+    def do_POST(self):
+        n = int(self.headers.get("Content-Length", "0") or "0")
+        body = self.rfile.read(n)
+        # Persist EXACTLY what the live Slack send path POSTed so the bash
+        # side can assert the serialized payload.
+        with open(os.path.join(WORK_DIR, "slack_body.json"), "wb") as f:
+            f.write(body)
+        with open(os.path.join(WORK_DIR, "slack_meta.json"), "w") as f:
+            json.dump({"path": self.path,
+                       "content_type": self.headers.get("Content-Type", "")}, f)
+        # Real Slack Incoming Webhooks reply 200 "ok".
+        self.send_response(200)
+        self.end_headers()
+        self.wfile.write(b"ok")
+
+
+class TelegramHandler(BaseHTTPRequestHandler):
+    def log_message(self, *a):
+        pass
+
+    def _send(self, obj):
+        payload = json.dumps(obj).encode()
+        self.send_response(200)
+        self.send_header("Content-Type", "application/json")
+        self.send_header("Content-Length", str(len(payload)))
+        self.end_headers()
+        self.wfile.write(payload)
+
+    def _route(self):
+        # tgbotapi calls <base>/bot<token>/<method>
+        method = self.path.rsplit("/", 1)[-1]
+        if method == "getMe":
+            return self._send({"ok": True, "result": {
+                "id": 4242, "is_bot": True, "first_name": "E2E Mock",
+                "username": BOT_USERNAME, "can_read_all_group_messages": True}})
+        if method == "setMyCommands":
+            return self._send({"ok": True, "result": True})
+        if method == "deleteWebhook":
+            return self._send({"ok": True, "result": True})
+        if method == "getUpdates":
+            # One my_chat_member update so the bot "discovers" a group.
+            return self._send({"ok": True, "result": [{
+                "update_id": 1,
+                "my_chat_member": {
+                    "chat": {"id": CHAT_ID, "title": CHAT_NAME, "type": "supergroup"},
+                    "from": {"id": 1, "is_bot": False, "first_name": "Op"},
+                    "date": 0,
+                    "old_chat_member": {"user": {"id": 4242, "is_bot": True,
+                                                 "first_name": "E2E Mock"},
+                                        "status": "left"},
+                    "new_chat_member": {"user": {"id": 4242, "is_bot": True,
+                                                 "first_name": "E2E Mock"},
+                                        "status": "member"},
+                }}]})
+        # Default OK for any other bot method tgbotapi may probe.
+        return self._send({"ok": True, "result": True})
+
+    def do_POST(self):
+        n = int(self.headers.get("Content-Length", "0") or "0")
+        if n:
+            self.rfile.read(n)
+        self._route()
+
+    def do_GET(self):
+        self._route()
+
+
+def serve(port, handler):
+    ThreadingHTTPServer(("127.0.0.1", port), handler).serve_forever()
+
+
+t = threading.Thread(target=serve, args=(TELEGRAM_PORT, TelegramHandler), daemon=True)
+t.start()
+serve(WEBHOOK_PORT, SlackHandler)
+PY
+  # The scratch dir and both ports reach the mock through its environment.
+  MOCK_WORK_DIR="$WORK_DIR" MOCK_WEBHOOK_PORT="$WEBHOOK_PORT" \
+    MOCK_TELEGRAM_PORT="$TELEGRAM_PORT" \
+    python3 "$WORK_DIR/mock.py" &
+  MOCK_PID=$!
+  # Wait for both ports to accept connections (fail loudly if they never do).
+  # Up to 50 probes, 0.1 s apart. curl runs without -f, so any HTTP response
+  # counts as "up"; only a refused/failed connection keeps the loop going.
+  local up=0
+  for _ in $(seq 1 50); do
+    if curl -s -o /dev/null "http://127.0.0.1:${WEBHOOK_PORT}/" \
+       && curl -s -o /dev/null "http://127.0.0.1:${TELEGRAM_PORT}/botX/getMe"; then
+      up=1; break
+    fi
+    sleep 0.1
+  done
+  if [ "$up" != "1" ]; then
+    echo "FATAL: mock upstream did not come up on ports $WEBHOOK_PORT/$TELEGRAM_PORT" >&2
+    cleanup
+    exit 2
+  fi
+}
+
+# json_field KEY — read a JSON object on stdin and print the value stored
+# under KEY (empty string when the key is absent). The key is handed to
+# Python through argv instead of being spliced into the program text, so a
+# key containing quotes cannot break or alter the snippet.
+json_field() { python3 -c 'import sys,json; print(json.load(sys.stdin).get(sys.argv[1],""))' "$1"; }
+
+create_external_ws() {
+  # Create an external tier-1 workspace named $1 and print "<id>\t<token>"
+  # on stdout. The token comes from the create response when present,
+  # otherwise from e2e_mint_workspace_token (may be empty if both fail).
+  # On a create failure: FATAL message on stderr, cleanup, exit 1.
+  local ws_name="$1"
+  local body created_id ws_token
+  body=$(curl -s -X POST "$BASE/workspaces" "${ADMIN_AUTH[@]}" \
+    -H "Content-Type: application/json" \
+    -d "{\"name\":\"$ws_name\",\"runtime\":\"external\",\"external\":true,\"tier\":1}")
+  created_id=$(printf '%s' "$body" | json_field id)
+  if [ -z "$created_id" ]; then
+    echo "FATAL: could not create workspace $ws_name: $body" >&2
+    cleanup
+    exit 1
+  fi
+  ws_token=$(printf '%s' "$body" | e2e_extract_token)
+  if [ -z "$ws_token" ]; then
+    ws_token=$(e2e_mint_workspace_token "$created_id" 2>/dev/null || true)
+  fi
+  printf '%s\t%s\n' "$created_id" "$ws_token"
+}
+
+# ════════════════════════════════════════════════════════════════════════
+echo "=== Channels + data-prune E2E (core#2332 P1.10) ==="
+echo "BASE=$BASE  webhook_mock=$WEBHOOK_BASE  telegram_mock=$TELEGRAM_BASE"
+
+if ! curl -sf "$BASE/health" >/dev/null 2>&1; then
+  echo "FATAL: platform not reachable at $BASE/health" >&2
+  exit 2
+fi
+
+start_mock
+
+# ── workspaces ──────────────────────────────────────────────────────────
+# create_external_ws runs inside a process substitution, so an `exit` in it
+# only terminates that subshell. Check each result here and abort the whole
+# run rather than carrying on with an empty workspace id.
+IFS=$'\t' read -r WS_TARGET WS_TARGET_TOK < <(create_external_ws "e2e-chan-target-$$")
+if [ -z "$WS_TARGET" ]; then
+  echo "FATAL: target workspace was not created; aborting" >&2
+  exit 1
+fi
+IFS=$'\t' read -r WS_SIBLING WS_SIBLING_TOK < <(create_external_ws "e2e-chan-sibling-$$")
+if [ -z "$WS_SIBLING" ]; then
+  echo "FATAL: sibling workspace was not created; aborting" >&2
+  exit 1
+fi
+echo "target=$WS_TARGET sibling=$WS_SIBLING"
+
+WS_AUTH=("${ADMIN_AUTH[@]}")
+[ -n "$WS_TARGET_TOK" ] && WS_AUTH=(-H "Authorization: Bearer $WS_TARGET_TOK")
+SIB_AUTH=("${ADMIN_AUTH[@]}")
+[ -n "$WS_SIBLING_TOK" ] && SIB_AUTH=(-H "Authorization: Bearer $WS_SIBLING_TOK")
+
+# ── (1) SEND end-to-end via a Slack Incoming-Webhook channel ────────────
+echo
+echo "--- (1) channel SEND → mock upstream receives serialized payload ---"
+
+# Create a slack channel whose webhook_url points at our mock. If the
+# platform wasn't started with the webhook test-base, ValidateConfig
+# rejects this URL → loud_skip / RED. chat_id is required by SendOutbound.
+SLACK_CFG=$(python3 -c "import json,sys; print(json.dumps({
+  'webhook_url': sys.argv[1] + 'services/T000/B000/e2e',
+  'chat_id': 'mock-chat'}))" "$WEBHOOK_BASE")
+CREATE=$(curl -s -X POST "$BASE/workspaces/$WS_TARGET/channels" "${WS_AUTH[@]}" \
+  -H "Content-Type: application/json" \
+  -d "{\"channel_type\":\"slack\",\"config\":$SLACK_CFG,\"enabled\":true}")
+CH_ID=$(printf '%s' "$CREATE" | json_field id)
+if [ -z "$CH_ID" ]; then
+  case "$CREATE" in
+    *"invalid channel config"*)
+      loud_skip "platform rejected mock webhook_url (MOLECULE_CHANNELS_TEST_WEBHOOK_BASE not set on platform): $CREATE" ;;
+    *)
+      fail "create slack channel" "$CREATE" ;;
+  esac
+else
+  pass "create slack channel pointed at mock upstream (id=$CH_ID)"
+
+  SEND_TEXT="hello from e2e $$"
+  # Send route: wsAuth.POST /workspaces/:id/channels/:channelId/send (the
+  # handler keys off :channelId; :id scopes the workspace bearer).
+  SEND=$(curl -s -w $'\n%{http_code}' -X POST \
+    "$BASE/workspaces/$WS_TARGET/channels/$CH_ID/send" "${WS_AUTH[@]}" \
+    -H "Content-Type: application/json" \
+    -d "{\"text\":\"$SEND_TEXT\"}")
+  SEND_CODE=$(printf '%s' "$SEND" | tail -n1)
+  if [ "$SEND_CODE" = "200" ]; then
+    pass "POST /channels/:id/send returned 200"
+  else
+    fail "POST /channels/:id/send" "code=$SEND_CODE body=$(printf '%s' "$SEND" | sed '$d')"
+  fi
+
+  # Give the async-free SendOutbound a beat to land at the mock.
+  RECEIVED=""
+  for _ in $(seq 1 30); do
+    if [ -s "$WORK_DIR/slack_body.json" ]; then RECEIVED=1; break; fi
+    sleep 0.1
+  done
+  if [ -n "$RECEIVED" ]; then
+    pass "mock upstream RECEIVED an outbound POST"
+    GOT_TEXT=$(python3 -c "import json,sys; print(json.load(open(sys.argv[1])).get('text',''))" \
+      "$WORK_DIR/slack_body.json" 2>/dev/null || true)
+    if [ "$GOT_TEXT" = "$SEND_TEXT" ]; then
+      pass "mock received correctly-serialized {\"text\":...} payload (text matches end-to-end)"
+    else
+      fail "serialized payload mismatch" "want=[$SEND_TEXT] got=[$GOT_TEXT] raw=$(cat "$WORK_DIR/slack_body.json")"
+    fi
+  else
+    fail "mock upstream never received the outbound POST" "send path did not serialize+POST to the configured endpoint"
+  fi
+fi
+
+# ── (2) DISCOVER via the Telegram mock Bot API ──────────────────────────
+echo
+echo "--- (2) POST /channels/discover (telegram) → mock Bot API ---"
+# A token matching the telegramTokenRegex (\d+:[A-Za-z0-9_-]{30,}).
+DISC_TOKEN="424242:AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
+DISC=$(curl -s -w $'\n%{http_code}' -X POST "$BASE/channels/discover" \
+  "${ADMIN_AUTH[@]}" -H "Content-Type: application/json" \
+  -d "{\"channel_type\":\"telegram\",\"bot_token\":\"$DISC_TOKEN\",\"workspace_id\":\"$WS_TARGET\"}")
+DISC_CODE=$(printf '%s' "$DISC" | tail -n1)
+DISC_BODY=$(printf '%s' "$DISC" | sed '$d')
+if [ "$DISC_CODE" = "200" ]; then
+  pass "POST /channels/discover returned 200"
+  if printf '%s' "$DISC_BODY" | grep -qF '"bot_username":"e2e_mock_bot"'; then
+    pass "discover round-tripped the mock bot username"
+  else
+    fail "discover bot_username" "$DISC_BODY"
+  fi
+  if printf '%s' "$DISC_BODY" | grep -qF '"chat_id":"-1009876543210"'; then
+    pass "discover round-tripped the mock chat id"
+  else
+    fail "discover chat list" "$DISC_BODY"
+  fi
+else
+  case "$DISC_BODY" in
+    *"Cannot reach Telegram"*|*"Invalid bot token"*|*"Failed to connect"*)
+      # Platform reached the REAL api.telegram.org (seam not set) → can't prove.
+      loud_skip "discover hit real Telegram, not the mock (MOLECULE_CHANNELS_TEST_TELEGRAM_API_BASE not set on platform): code=$DISC_CODE $DISC_BODY" ;;
+    *)
+      fail "POST /channels/discover" "code=$DISC_CODE body=$DISC_BODY" ;;
+  esac
+fi
+
+# ── (3) Data-prune (RFC #734): purge removes prunable data, sibling survives
+echo
+echo "--- (3) data-prune: purge target's child data, sibling survives ---"
+
+# Seed prunable child data on BOTH workspaces: a channel (already on target)
+# + a secret on each. We assert via GET /channels which lists workspace_channels.
+seed_secret() {
+  local wid="$1"; shift
+  curl -s -o /dev/null -X POST "$BASE/workspaces/$wid/secrets" "$@" \
+    -H "Content-Type: application/json" \
+    -d '{"key":"E2E_PRUNE_PROBE","value":"v"}'
+}
+seed_secret "$WS_TARGET" "${WS_AUTH[@]}"
+# Sibling gets its OWN channel so we can prove its rows survive the target purge.
+SIB_SLACK_CFG=$(python3 -c "import json,sys; print(json.dumps({
+  'webhook_url': sys.argv[1] + 'services/T111/B111/sib',
+  'chat_id': 'sib-chat'}))" "$WEBHOOK_BASE")
+SIB_CH=$(curl -s -X POST "$BASE/workspaces/$WS_SIBLING/channels" "${SIB_AUTH[@]}" \
+  -H "Content-Type: application/json" \
+  -d "{\"channel_type\":\"slack\",\"config\":$SIB_SLACK_CFG,\"enabled\":true}")
+SIB_CH_ID=$(printf '%s' "$SIB_CH" | json_field id)
+
+# Pre-purge: confirm both workspaces have >=1 channel row.
+TGT_CH_PRE=$(curl -s "$BASE/workspaces/$WS_TARGET/channels" "${WS_AUTH[@]}")
+SIB_CH_PRE=$(curl -s "$BASE/workspaces/$WS_SIBLING/channels" "${SIB_AUTH[@]}")
+TGT_PRE_N=$(printf '%s' "$TGT_CH_PRE" | python3 -c "import sys,json; print(len(json.load(sys.stdin)))" 2>/dev/null || echo 0)
+SIB_PRE_N=$(printf '%s' "$SIB_CH_PRE" | python3 -c "import sys,json; print(len(json.load(sys.stdin)))" 2>/dev/null || echo 0)
+if [ "${TGT_PRE_N:-0}" -ge 1 ] && [ "${SIB_PRE_N:-0}" -ge 1 ]; then
+  pass "pre-purge: target ($TGT_PRE_N) and sibling ($SIB_PRE_N) both have channel data"
+else
+  fail "pre-purge seed" "target=$TGT_PRE_N sibling=$SIB_PRE_N (need >=1 each)"
+fi
+
+# Permanent delete WITH purge — the RFC #734 prune of durable child data.
+# DELETE /workspaces/:id is AdminAuth-gated (router.go:167); Tier-2b rejects a
+# workspace bearer when ADMIN_TOKEN is set, so this MUST use the admin bearer.
+# X-Confirm-Name must equal the workspace name (the destructive-delete guard).
+PURGE_AUTH=("${ADMIN_AUTH[@]}")
+[ ${#PURGE_AUTH[@]} -eq 0 ] && [ -n "$WS_TARGET_TOK" ] && PURGE_AUTH=(-H "Authorization: Bearer $WS_TARGET_TOK")
+PURGE=$(curl -s -w $'\n%{http_code}' -X DELETE \
+  "$BASE/workspaces/$WS_TARGET?confirm=true&purge=true" \
+  -H "X-Confirm-Name: e2e-chan-target-$$" "${PURGE_AUTH[@]}")
+PURGE_CODE=$(printf '%s' "$PURGE" | tail -n1)
+PURGE_BODY=$(printf '%s' "$PURGE" | sed '$d')
+if [ "$PURGE_CODE" = "200" ] && printf '%s' "$PURGE_BODY" | grep -qF '"status":"purged"'; then
+  pass "DELETE ?purge=true returned purged"
+else
+  fail "DELETE ?purge=true" "code=$PURGE_CODE body=$PURGE_BODY"
+fi
+# Target was purged → its token is revoked; query its channels with admin
+# bearer. The purge hard-deletes workspace_channels rows for the target.
+TGT_CH_POST=$(curl -s "$BASE/workspaces/$WS_TARGET/channels" "${ADMIN_AUTH[@]}")
+TGT_POST_N=$(printf '%s' "$TGT_CH_POST" | python3 -c "import sys,json
+try:
+  d=json.load(sys.stdin); print(len(d) if isinstance(d,list) else -1)
+except Exception:
+  print(-1)" 2>/dev/null || echo -1)
+if [ "${TGT_POST_N:-1}" = "0" ]; then
+  pass "post-purge: target's prunable channel data is GONE (0 rows)"
+else
+  fail "prune did not remove target channel data" "post-purge target rows=$TGT_POST_N body=$(printf '%s' "$TGT_CH_POST" | head -c 200)"
+fi
+# Drop the target from cleanup's purge list only when the purge request
+# succeeded. After a failed purge the id is kept so cleanup gets one more
+# attempt and the workspace does not leak into later runs.
+if [ "$PURGE_CODE" = "200" ]; then
+  WS_TARGET=""  # purged; don't re-delete in cleanup
+fi
+
+# Sibling (NON-prunable relative to the target purge) must be untouched.
+SIB_CH_POST=$(curl -s "$BASE/workspaces/$WS_SIBLING/channels" "${SIB_AUTH[@]}")
+SIB_POST_N=$(printf '%s' "$SIB_CH_POST" | python3 -c "import sys,json; print(len(json.load(sys.stdin)))" 2>/dev/null || echo -1)
+if [ "${SIB_POST_N:-0}" -ge 1 ] && printf '%s' "$SIB_CH_POST" | grep -qF "$SIB_CH_ID"; then
+  pass "post-purge: sibling's non-prunable data SURVIVED ($SIB_POST_N rows, channel $SIB_CH_ID intact)"
+else
+  fail "purge over-reached: sibling data did not survive" "sibling rows=$SIB_POST_N body=$(printf '%s' "$SIB_CH_POST" | head -c 200)"
+fi
+
+# ── verdict ─────────────────────────────────────────────────────────────
+echo
+echo "=== channels+prune e2e: $PASS passed, $FAIL failed ==="
+if [ "$FAIL" -ne 0 ]; then
+  exit 1
+fi
+# Guard against a vacuous green: every section must have produced asserts.
+if [ "$PASS" -lt 9 ]; then
+  echo "FATAL: only $PASS assertions ran — expected >=9 (send + discover + prune). Refusing to report green." >&2
+  exit 1
+fi
+echo "ALL CHANNELS + PRUNE E2E CHECKS PASSED"
@@ -7,12 +7,14 @@
 # extraction (and ongoing template work) can't silently break any
 # runtime.
 #
-# Runtimes covered: claude-code, codex, hermes, openclaw.
+# Runtimes covered: claude-code, codex, hermes, openclaw, google-adk.
 # claude-code + hermes have unique
 # provisioning quirks (claude-code OAuth, hermes 15-min cold-boot)
 # and stay first-class with their own run_<runtime> functions; the
-# OpenAI-backed runtimes share run_openai_runtime. Each phase skips cleanly
-# if its prerequisite secret is missing.
+# OpenAI-backed runtimes share run_openai_runtime. google-adk has its own
+# run_google_adk (it asserts manifest registration unconditionally, then drives
+# its AI-Studio BYOK live arm — keyless-Vertex needs platform WIF CI lacks).
+# Each phase skips cleanly if its prerequisite secret is missing.
 #
 # What this proves:
 #   1. Provisioning + container boot works for each runtime.
@@ -93,6 +95,7 @@
 #   E2E_RUNTIMES=minimax     tests/e2e/test_priority_runtimes_e2e.sh
 #   E2E_RUNTIMES=claude-code tests/e2e/test_priority_runtimes_e2e.sh
 #   E2E_RUNTIMES=hermes      tests/e2e/test_priority_runtimes_e2e.sh
+#   E2E_RUNTIMES=google-adk  tests/e2e/test_priority_runtimes_e2e.sh  # registration always; live arm needs E2E_GOOGLE_API_KEY
 #
 # Prereqs:
 #   - workspace-server on http://localhost:8080
@@ -513,6 +516,132 @@ print(json.dumps({
 run_codex()      { run_openai_runtime "codex"      "codex"; }
 run_openclaw()   { run_openai_runtime "openclaw"   "openclaw"; }

+####################################################################
+# google-adk arm — Gemini. REGISTRATION asserted always; LIVE arm is
+# REQUIRED-when-keyed, LOUD-skip-when-absent (NEVER best-effort/fail-open).
+####################################################################
+# google-adk serves Gemini two ways (providers.yaml runtimes.google-adk):
+#   * platform arm  → keyless Vertex via the Molecule LLM proxy (server-side
+#     WIF mint, platform_managed billing — the org-default PROD path). It needs
+#     a platform WIF identity that CI does NOT have, so this arm does NOT drive
+#     the keyless-Vertex path (no fail-open arm — we never green a path we can't
+#     actually exercise).
+#   * google arm   → AI Studio API-key BYOK (the tenant's OWN GOOGLE/GEMINI
+#     key), bare `gemini-2.5-pro`. This is the CI-/staging-exercisable path and
+#     is what the LIVE portion below drives when E2E_GOOGLE_API_KEY is present.
+#
+# Two-part contract (core#2332 P0.1 — google-adk previously had ZERO e2e):
+#   1. REGISTRATION (always, NO live creds): google-adk MUST be present in the
+#      deployed manifest.json's workspace_templates — that file is the SSOT the
+#      Create-handler's runtime allowlist is derived from (runtime_registry.go::
+#      loadRuntimesFromManifest). If it is absent, a google-adk create 422s
+#      RUNTIME_UNSUPPORTED, so registration is the precondition for ANY serving.
+#      Asserting it offline means even a key-less CI run proves google-adk is
+#      registered (a regression that drops it from the manifest reds the gate).
+#      This does NOT bump VALIDATED — registration is not end-to-end serving.
+#   2. LIVE (REQUIRED-when-keyed): with E2E_GOOGLE_API_KEY set, provision the
+#      AI-Studio BYOK arm end-to-end (online + non-error A2A reply). A miss here
+#      is a HARD fail() (fail-closed-if-present), exactly like the claude-code /
+#      hermes / openai arms — NOT a best-effort miss. Without the key the live
+#      portion is a LOUD skip() (dev-convenience), same as every keyed arm.
+run_google_adk() {
+  # Reads BASE, ADMIN_AUTH, WORKSPACE_MANIFEST_PATH and E2E_GOOGLE_API_KEY;
+  # appends any created workspace id to CREATED_WSIDS. Every early exit is
+  # `return 0`: outcomes are reported through the pass / fail / skip /
+  # validated helpers rather than through this function's return status.
+  echo ""
+  echo "=== google-adk (Gemini) — registration + AI-Studio BYOK happy path ==="
+
+  # ── Part 1: REGISTRATION (always; no live creds needed) ──────────────────
+  # Assert google-adk is in the manifest.json workspace_templates SSOT (the
+  # Create-handler allowlist source). WORKSPACE_MANIFEST_PATH override mirrors
+  # the server's own env (runtime_registry.go::manifestPath); otherwise resolve
+  # the monorepo-root manifest.json relative to this script (tests/e2e/ -> repo
+  # root is two levels up).
+  local manifest="${WORKSPACE_MANIFEST_PATH:-$(cd "$(dirname "$0")/../.." && pwd)/manifest.json}"
+  if [ ! -f "$manifest" ]; then
+    fail "google-adk registration" "manifest.json not found at $manifest (cannot verify the runtime allowlist SSOT)"
+    return 0
+  fi
+  # The snippet prints exactly "yes" on success; "no:<names>" or "ERR:<msg>"
+  # otherwise, which is surfaced verbatim in the failure detail below.
+  local registered
+  registered=$(python3 -c '
+import json, sys
+try:
+    m = json.load(open(sys.argv[1]))
+except Exception as e:
+    print("ERR:%s" % e); sys.exit(0)
+names = [t.get("name") for t in m.get("workspace_templates", [])]
+# loadRuntimesFromManifest strips the "-default" vanilla suffix; match the same.
+norm = {n[:-len("-default")] if isinstance(n, str) and n.endswith("-default") else n for n in names}
+print("yes" if "google-adk" in norm else "no:%s" % sorted(n for n in norm if n))
+' "$manifest")
+  if [ "$registered" != "yes" ]; then
+    fail "google-adk registered in manifest.json workspace_templates" \
+      "google-adk absent from the Create-handler runtime allowlist SSOT ($registered) — a create would 422 RUNTIME_UNSUPPORTED"
+    return 0
+  fi
+  pass "google-adk registered in manifest.json workspace_templates (Create-handler allowlist SSOT)"
+
+  # ── Part 2: LIVE arm (REQUIRED-when-keyed, LOUD-skip-when-absent) ─────────
+  # AI-Studio BYOK path: the tenant's own GOOGLE_API_KEY/GEMINI_API_KEY. The
+  # keyless-Vertex PROD path needs a platform WIF identity CI lacks, so it is
+  # NOT exercised here (no fail-open arm). Same env name the staging-full-saas
+  # google-adk arm uses (E2E_GOOGLE_API_KEY).
+  if [ -z "${E2E_GOOGLE_API_KEY:-}" ]; then
+    skip "E2E_GOOGLE_API_KEY not set (google-adk live arm needs an AI-Studio Gemini key; keyless-Vertex needs platform WIF, not available in CI)"
+    return 0
+  fi
+  local secrets
+  secrets=$(python3 -c "
+import json, os
+# The google provider (providers.yaml) reads GEMINI_API_KEY / GOOGLE_API_KEY and
+# dials generativelanguage.googleapis.com with the tenant's OWN key. Inject under
+# both names the provider accepts so the adapter resolves regardless of order.
+k = os.environ['E2E_GOOGLE_API_KEY']
+print(json.dumps({'GOOGLE_API_KEY': k, 'GEMINI_API_KEY': k}))
+")
+  local resp wsid
+  # Bare `gemini-2.5-pro` is the registered AI-Studio BYOK id for google-adk
+  # (providers.yaml runtimes.google-adk `google` arm). DeriveProvider routes the
+  # bare gemini- id to the google vendor (third_party_anthropic_compat, BYOK).
+  resp=$(curl -s -X POST "$BASE/workspaces" ${ADMIN_AUTH[@]+"${ADMIN_AUTH[@]}"} -H "Content-Type: application/json" \
+    -d "{\"name\":\"Priority E2E (google-adk)\",\"runtime\":\"google-adk\",\"tier\":1,\"model\":\"gemini-2.5-pro\",\"secrets\":$secrets}")
+  # `|| true`: an unparseable response leaves wsid empty, handled just below.
+  wsid=$(echo "$resp" | python3 -c 'import json,sys;print(json.load(sys.stdin).get("id",""))') || true
+  if [ -z "$wsid" ]; then
+    fail "create google-adk workspace" "$resp"
+    return 0
+  fi
+  CREATED_WSIDS+=("$wsid")
+  echo "  workspace=$wsid"
+
+  # google-adk runtime image cold boot ~30-90s (image already pulled).
+  # `|| true`: a non-zero exit from wait_for_status is not fatal here; the
+  # status string it printed is what the comparison below judges.
+  local final
+  final=$(wait_for_status "$wsid" "online failed" 240) || true
+  if [ "$final" != "online" ]; then
+    fail "google-adk workspace reaches online" "final status: $final"
+    return 0
+  fi
+  pass "google-adk workspace reaches online"
+
+  # Prefer the token embedded in the create response; mint one otherwise.
+  local token
+  token=$(echo "$resp" | e2e_extract_token)
+  if [ -z "$token" ]; then
+    token=$(e2e_mint_workspace_token "$wsid")
+  fi
+  if [ -z "$token" ]; then
+    fail "resolve google-adk workspace token" "no token returned"
+    return 0
+  fi
+
+  # A successful send counts as validated whether or not the reply contains
+  # the literal "PONG"; only a failed send_test_prompt is a fail().
+  local reply
+  if reply=$(send_test_prompt "$wsid" "$token"); then
+    if echo "$reply" | grep -q "PONG"; then
+      validated "google-adk reply contains PONG"
+    else
+      validated "google-adk reply non-empty (first 80 chars: ${reply:0:80})"
+    fi
+    assert_activity_logged "google-adk" "$wsid" "$token"
+  else
+    fail "google-adk reply" "${reply:-<empty or error>}"
+  fi
+}
+
 ####################################################################
 # Mock arm — the GUARANTEED, always-available REQUIRE-LIVE backbone.
 ####################################################################
@@ -742,10 +871,12 @@ print(json.dumps({'MINIMAX_API_KEY': os.environ['E2E_MINIMAX_API_KEY']}))

 # `mock` runs FIRST and by default: it is the no-key REQUIRE-LIVE backbone
 # that guarantees >=1 validation on a healthy platform (see run_mock). The
-# real-LLM arms (claude-code/codex/hermes/openclaw/minimax) run if their
-# secrets are present and add real-provider coverage on top; minimax is
-# best-effort (never reds the gate).
-WANT="${E2E_RUNTIMES:-mock claude-code codex hermes openclaw minimax}"
+# real-LLM arms (claude-code/codex/hermes/openclaw/minimax/google-adk) run if
+# their secrets are present and add real-provider coverage on top; minimax is
+# best-effort (never reds the gate). google-adk ALSO asserts its registration
+# unconditionally (no key needed), then drives its AI-Studio BYOK live arm as a
+# REQUIRED-when-keyed (fail-closed-if-present), LOUD-skip-when-absent arm.
+WANT="${E2E_RUNTIMES:-mock claude-code codex hermes openclaw minimax google-adk}"
 for r in $WANT; do
  case "$r" in
    mock)        run_mock ;;
@@ -754,7 +885,8 @@ for r in $WANT; do
    hermes)      run_hermes ;;
    openclaw)    run_openclaw ;;
    minimax)     run_minimax ;;
-    all)         run_mock; run_claude_code; run_codex; run_hermes; run_openclaw; run_minimax ;;
+    google-adk)  run_google_adk ;;
+    all)         run_mock; run_claude_code; run_codex; run_hermes; run_openclaw; run_minimax; run_google_adk ;;
    *) echo "unknown runtime in E2E_RUNTIMES: $r" >&2; exit 2 ;;
  esac
 done
@@ -1004,6 +1004,12 @@ for wid in "${WS_TO_CHECK[@]}"; do
  else
    DIAG_FAIL=$(echo "$DIAG_JSON" | python3 -c "import json,sys; d=json.load(sys.stdin); print(d.get('first_failure','unknown'))" 2>/dev/null || echo "unknown")
    DIAG_DETAIL=$(echo "$DIAG_JSON" | python3 -c "import json,sys; d=json.load(sys.stdin); s=[x for x in d.get('steps',[]) if not x.get('ok')]; step=s[0] if s else {}; print(' — '.join(x for x in [step.get('error',''), step.get('detail','')] if x))" 2>/dev/null || echo "")
+    # #767: always emit the full diagnose JSON so operators see every step's
+    # Detail field even when the Python extraction above fails or the shape
+    # drifts. The burst is bracketed like steps 2 and 4 for grep-friendly CI.
+    log "── DIAGNOSTIC BURST (step 7b — terminal diagnose for $wid) ──"
+    echo "$DIAG_JSON" | python3 -m json.tool 2>/dev/null || echo "$DIAG_JSON"
+    log "── END DIAGNOSTIC ──"
    fail "Workspace $wid terminal diagnose failed at step '$DIAG_FAIL': $DIAG_DETAIL — check tenant SG has tcp/22 from the configured EIC endpoint SG, MOLECULE_EIC_ENDPOINT_SG_ID is set in Railway, and EIC endpoint health"
  fi
 done
@@ -203,6 +203,60 @@ def test_f1_job_missing_from_sentinel_needs(drift_module, tmp_path, monkeypatch)
    assert any("F1 —" in f and "test" in f for f in findings), findings


+def test_detect_drift_403_fails_closed(drift_module, tmp_path, monkeypatch):
+    """AUTH FAILURE on branch_protections (HTTP 401/403) → RAISE (fail
+    closed). The token can't read BP, so drift is UNVERIFIABLE; greening
+    the hourly cron here would let jobs↔protection drift go silently
+    undetected — exactly the regression class this sentinel exists to
+    catch. fix/core-ci-fail-closed.
+    """
+    ci = _write_ci_yaml(
+        tmp_path,
+        jobs={"build": {"runs-on": "ubuntu-latest"}},
+        sentinel_needs=["build"],
+    )
+    audit = _write_audit_yaml(tmp_path, ["ci / build (pull_request)"])
+    _patch_paths(drift_module, monkeypatch, ci, audit)
+
+    stub = _make_stub_api({
+        ("GET", "/repos/owner/repo/branch_protections/main"): (
+            drift_module.ApiError(
+                "GET /repos/owner/repo/branch_protections/main → HTTP 403: forbidden"
+            )
+        ),
+    })
+    monkeypatch.setattr(drift_module, "api", stub)
+    with pytest.raises(drift_module.ApiError):
+        drift_module.detect_drift("main")
+
+
+def test_detect_drift_404_skips_branch(drift_module, tmp_path, monkeypatch):
+    """Authenticated 404 (branch genuinely has no protection, e.g. staging
+    pre-rollout) → tolerated skip: return ([], debug) with
+    protection_contexts_skipped True. NOT a fail-open (real read of an
+    absent resource with a valid token)."""
+    ci = _write_ci_yaml(
+        tmp_path,
+        jobs={"build": {"runs-on": "ubuntu-latest"}},
+        sentinel_needs=["build"],
+    )
+    audit = _write_audit_yaml(tmp_path, ["ci / build (pull_request)"])
+    _patch_paths(drift_module, monkeypatch, ci, audit)
+
+    stub = _make_stub_api({
+        ("GET", "/repos/owner/repo/branch_protections/staging"): (
+            drift_module.ApiError(
+                "GET /repos/owner/repo/branch_protections/staging → HTTP 404: not found"
+            )
+        ),
+    })
+    monkeypatch.setattr(drift_module, "api", stub)
+    findings, debug = drift_module.detect_drift("staging")
+    assert findings == []
+    assert debug.get("protection_contexts_skipped") is True
+    assert debug.get("protection_http_status") == 404
+
+
 def test_f1b_sentinel_needs_typo(drift_module, tmp_path, monkeypatch):
    """F1b: sentinel.needs lists a job not present in ci.yml (typo).

@@ -34,9 +34,12 @@ Test classes (per `feedback_branch_count_before_approving`):
    together, not short-circuited.
  - test_bp_empty_lints_nothing            — BP has no contexts.
    Exit 0 cleanly.
-  - test_api_403_skips_gracefully          — branch_protections endpoint
-    403s (token-scope). Exit 0 with ::error::, do NOT red-X.
-  - test_api_404_skips_gracefully          — branch has no protection.
+  - test_api_403_fails_closed              — branch_protections endpoint
+    401/403s (auth failure). FAIL CLOSED (exit 2) with ::error::.
+  - test_api_transient_fails_closed        — transient/unexpected API
+    error. FAIL CLOSED (exit 2).
+  - test_api_404_skips_gracefully          — branch has no protection
+    (authenticated absent resource). Tolerated skip with a ::warning::.
    Exit 0 cleanly.
  - test_context_event_match_required      — BP context says `(push)` and
    workflow only emits on `pull_request`. That's NOT a match — the
@@ -247,9 +250,10 @@ def test_bp_empty_lints_nothing(envset, monkeypatch, capsys):


 # ---------------------------------------------------------------------------
-# API 403 — graceful-degrade.
+# API 403 — AUTH FAILURE → FAIL CLOSED (exit 2). This is a HARD gate on a
+# protected context; a token that can't read BP must NOT green the lint.
 # ---------------------------------------------------------------------------
-def test_api_403_skips_gracefully(envset, monkeypatch, capsys):
+def test_api_403_fails_closed(envset, monkeypatch, capsys):
    _write_wf(
        envset,
        "ci.yml",
@@ -259,13 +263,30 @@ def test_api_403_skips_gracefully(envset, monkeypatch, capsys):
    m = _import_lint()
    _stub_api(monkeypatch, m, ("forbidden", None))
    rc = m.run()
-    assert rc == 0
+    assert rc == 2
    err = capsys.readouterr().err
    assert "403" in err or "scope" in err.lower() or "token" in err.lower()


 # ---------------------------------------------------------------------------
-# API 404 — branch has no protection → clean exit.
+# API transient/unexpected error → FAIL CLOSED (exit 2).
+# ---------------------------------------------------------------------------
+def test_api_transient_fails_closed(envset, monkeypatch, capsys):
+    _write_wf(
+        envset,
+        "ci.yml",
+        "name: CI\non:\n  pull_request:\n    branches: [main]\njobs:\n"
+        "  j:\n    runs-on: x\n    steps:\n      - run: echo hi\n",
+    )
+    m = _import_lint()
+    _stub_api(monkeypatch, m, ("error", None))
+    rc = m.run()
+    assert rc == 2
+
+
+# ---------------------------------------------------------------------------
+# API 404 — authenticated absent resource (branch has no protection) →
+# tolerated graceful skip (exit 0 with ::warning::), NOT a fail-open.
 # ---------------------------------------------------------------------------
 def test_api_404_skips_gracefully(envset, monkeypatch, capsys):
    _write_wf(
@@ -47,7 +47,10 @@ Test classes (per `feedback_branch_count_before_approving`):
    (the OLD context name disappears; the NEW one needs validation).
  - test_unrelated_workflow_edit_is_not_new       — edit a comment in
    an existing emitter; no new context introduced; pass.
-  - test_api_403_skips_gracefully                 — BP read 403; exit 0
+  - test_api_404_skips_gracefully                 — authenticated 404 →
+    tolerated skip (exit 0 with ::warning::).
+  - test_api_transient_fails_closed               — transient → exit 2.
+  - test_api_403_fails_closed                     — BP read 401/403 → exit 2
    with stderr ::error::.
  - test_directive_must_be_in_workflow_yml        — directive in PR
    body alone is NOT sufficient; the comment must live in the
@@ -392,9 +395,10 @@ def test_unrelated_workflow_edit_is_not_new(env, monkeypatch, capsys):


 # ---------------------------------------------------------------------------
-# BP API 403 → exit 0 with ::error::.
+# BP API 401/403 = AUTH FAILURE → FAIL CLOSED (exit 2). A new emission can't
+# be verified against BP if the token can't read BP — must not green.
 # ---------------------------------------------------------------------------
-def test_api_403_skips_gracefully(env, monkeypatch, capsys):
+def test_api_403_fails_closed(env, monkeypatch, capsys):
    m = _import_lint()
    _stub_git_and_api(
        monkeypatch,
@@ -404,11 +408,44 @@ def test_api_403_skips_gracefully(env, monkeypatch, capsys):
        bp_response=("forbidden", None),
    )
    rc = m.run()
-    assert rc == 0
+    assert rc == 2
    err = capsys.readouterr().err
    assert "403" in err or "scope" in err.lower() or "token" in err.lower()


+# ---------------------------------------------------------------------------
+# BP API transient/unexpected error → FAIL CLOSED (exit 2).
+# ---------------------------------------------------------------------------
+def test_api_transient_fails_closed(env, monkeypatch, capsys):
+    m = _import_lint()
+    _stub_git_and_api(
+        monkeypatch,
+        m,
+        base_files={".gitea/workflows/ci.yml": WF_CI_BASE},
+        head_files={".gitea/workflows/ci.yml": WF_CI_NEW_JOB},
+        bp_response=("error", None),
+    )
+    rc = m.run()
+    assert rc == 2
+
+
+# ---------------------------------------------------------------------------
+# BP API authenticated 404 (branch genuinely unprotected) → tolerated
+# graceful skip (exit 0 with ::warning::), NOT a fail-open.
+# ---------------------------------------------------------------------------
+def test_api_404_skips_gracefully(env, monkeypatch, capsys):
+    m = _import_lint()
+    _stub_git_and_api(
+        monkeypatch,
+        m,
+        base_files={".gitea/workflows/ci.yml": WF_CI_BASE},
+        head_files={".gitea/workflows/ci.yml": WF_CI_NEW_JOB},
+        bp_response=("not_found", None),
+    )
+    rc = m.run()
+    assert rc == 0
+
+
 # ---------------------------------------------------------------------------
 # Directive must be in the workflow YML, not PR body.
 # ---------------------------------------------------------------------------
@@ -527,15 +527,13 @@ def test_multi_required_one_bad_one_good_fails(
            assert "good.yml" not in ln


-def test_protection_403_treated_as_skip(lint_module, monkeypatch, capsys):
-    """If the token can't read branch_protections (HTTP 403), exit 0
-    with a clear ::error::-but-non-fatal note. Same scope-fallback shape
-    as ci-required-drift.py per the precedent.
-
-    Rationale: if the lint workflow itself can't read protection, the PR
-    can't make THIS state worse (a paths-filter PR was already addable
-    without the lint). Better to surface a token-scope problem loudly
-    than to red-X every PR until the token is fixed.
+def test_protection_403_fails_closed(lint_module, monkeypatch, capsys):
+    """AUTH FAILURE → FAIL CLOSED (exit 4). If the token can't read
+    branch_protections (HTTP 401/403), the lint CANNOT enumerate the
+    required-check set and therefore CANNOT verify the no-paths-filter
+    invariant. This is a HARD gate on a protected (same-repo PR) context,
+    so it MUST fail loud rather than green an unverifiable gate — fix the
+    token, not the lint.
    """
    stub = _make_stub_api({
        ("GET", "/repos/owner/repo/branch_protections/main"): (
@@ -546,7 +544,26 @@ def test_protection_403_treated_as_skip(lint_module, monkeypatch, capsys):
    })
    monkeypatch.setattr(lint_module, "api", stub)
    rc = lint_module.run()
-    assert rc == 0
+    assert rc == 4
    err = capsys.readouterr().err
    assert "::error::" in err
    assert "403" in err
+
+
+def test_protection_404_skips_gracefully(lint_module, monkeypatch, capsys):
+    """Authenticated 404 (branch genuinely has no protection) is the one
+    tolerated degradation: there are no required contexts to check.
+    Exit 0 with a ::warning:: — NOT a fail-open (this is a real read of an
+    absent resource with a valid token, not an auth failure)."""
+    stub = _make_stub_api({
+        ("GET", "/repos/owner/repo/branch_protections/main"): (
+            lint_module.ApiError(
+                "GET /repos/owner/repo/branch_protections/main → HTTP 404: not found"
+            )
+        ),
+    })
+    monkeypatch.setattr(lint_module, "api", stub)
+    rc = lint_module.run()
+    assert rc == 0
+    err = capsys.readouterr().err
+    assert "404" in err
@@ -21,6 +21,27 @@ const (

 var slackHTTPClient = &http.Client{Timeout: slackHTTPTimeout}

+// slackWebhookAccepted reports whether a Slack Incoming Webhook URL is allowed
+// as a send destination. Production accepts only the real hooks.slack.com host.
+//
+// TEST SEAM (gating e2e): when MOLECULE_CHANNELS_TEST_WEBHOOK_BASE is set, a
+// URL with that prefix is ALSO accepted so tests/e2e/test_channels_e2e.sh can
+// point the live Slack send path at a local mock-upstream and assert the mock
+// actually received the serialized {"text":...} payload end-to-end (the unit
+// tests can only assert the body shape — see lark_test.go's prefix-gate
+// workaround comment). The env var is NEVER set in any production/staging
+// deploy; channelsTestWebhookBase() returns "" there and only the real
+// hooks.slack.com prefix passes, so this changes no production behaviour.
+func slackWebhookAccepted(u string) bool {
+	if strings.HasPrefix(u, slackWebhookPrefix) {
+		return true
+	}
+	if base := channelsTestWebhookBase(); base != "" && strings.HasPrefix(u, base) {
+		return true
+	}
+	return false
+}
+
 // SlackAdapter implements ChannelAdapter for Slack Incoming Webhooks.
 //
 // Outbound messages are sent via Slack Incoming Webhooks (the simple,
@@ -98,7 +119,7 @@ func (s *SlackAdapter) ValidateConfig(config map[string]interface{}) error {
 			return fmt.Errorf("bot_token mode requires channel_id")
 		}
 	}
-	if webhookURL != "" && !strings.HasPrefix(webhookURL, slackWebhookPrefix) {
+	if webhookURL != "" && !slackWebhookAccepted(webhookURL) {
 		return fmt.Errorf("invalid Slack webhook URL")
 	}
 	return nil
@@ -197,7 +218,7 @@ func (s *SlackAdapter) sendWebhookMessage(ctx context.Context, config map[string
 	if webhookURL == "" {
 		return fmt.Errorf("webhook_url not configured")
 	}
-	if !strings.HasPrefix(webhookURL, slackWebhookPrefix) {
+	if !slackWebhookAccepted(webhookURL) {
 		return fmt.Errorf("invalid Slack webhook URL")
 	}

@@ -148,7 +148,18 @@ func (t *TelegramAdapter) DiscoverChats(ctx context.Context, botToken string) (*
 		return nil, errors.New("invalid bot token format")
 	}

-	bot, err := tgbotapi.NewBotAPI(botToken)
+	// TEST SEAM: when MOLECULE_CHANNELS_TEST_TELEGRAM_API_BASE is set (only in
+	// the gating channels e2e — never in prod/staging), build the bot client
+	// against a local mock API base instead of api.telegram.org so
+	// POST /channels/discover can be proven end-to-end. The format string is
+	// "<base>/bot%s/%s" (token, method), matching tgbotapi.APIEndpoint.
+	var bot *tgbotapi.BotAPI
+	var err error
+	if apiBase := channelsTestTelegramAPIBase(); apiBase != "" {
+		bot, err = tgbotapi.NewBotAPIWithAPIEndpoint(botToken, apiBase+"/bot%s/%s")
+	} else {
+		bot, err = tgbotapi.NewBotAPI(botToken)
+	}
 	if err != nil {
 		return nil, fmt.Errorf("invalid bot token: %w", err)
 	}
@@ -0,0 +1,47 @@
+package channels
+
+import "os"
+
+// Test seams for the GATING channels e2e (tests/e2e/test_channels_e2e.sh).
+//
+// Every adapter pins its outbound destination to the real vendor host
+// (hooks.slack.com, discord.com, api.telegram.org) in both ValidateConfig and
+// SendMessage. That host pin is correct for production, but it means a real
+// end-to-end test cannot point the LIVE send/discover path at a local mock
+// upstream — so today the outbound serialize+POST is only ever asserted by
+// unit tests that reconstruct the payload by hand (see lark_test.go's
+// "we can't change the prefix const" comment) and never proven through the
+// running platform.
+//
+// These two env-gated overrides close that gap WITHOUT changing any
+// production behaviour:
+//
+//   - MOLECULE_CHANNELS_TEST_WEBHOOK_BASE — when set, Slack Incoming Webhook
+//     URLs with this prefix are accepted as send destinations (in addition to
+//     the real hooks.slack.com host). Lets the e2e create a slack channel whose
+//     webhook_url points at a local httptest mock and assert the mock RECEIVED
+//     the serialized {"text":...} payload.
+//
+//   - MOLECULE_CHANNELS_TEST_TELEGRAM_API_BASE — when set, TelegramAdapter.
+//     DiscoverChats builds its bot client against this API base instead of
+//     api.telegram.org, so POST /channels/discover can be exercised against a
+//     mock that serves getMe/getUpdates and the e2e can assert the discovered
+//     chats round-trip.
+//
+// Both vars are NEVER set in any production or staging deploy. The helpers
+// return "" there, so the real vendor-host pins are the only thing that
+// passes — production behaviour is byte-for-byte unchanged. Reading os.Getenv
+// on each call (not caching) keeps the seam honest: a process that never sets
+// the var can never accidentally enable it.
+
+// channelsTestWebhookBase returns the test-only accepted webhook base prefix,
+// or "" in production. See the test-seam notes at the top of this file.
+func channelsTestWebhookBase() string {
+	return os.Getenv("MOLECULE_CHANNELS_TEST_WEBHOOK_BASE")
+}
+
+// channelsTestTelegramAPIBase returns the test-only Telegram Bot API base URL
+// (callers append "/bot%s/%s" to form the endpoint format), or "" in production.
+func channelsTestTelegramAPIBase() string {
+	return os.Getenv("MOLECULE_CHANNELS_TEST_TELEGRAM_API_BASE")
+}
@@ -9,6 +9,7 @@ import (
 	"log"
 	"net/http"
 	"os"
+	"sort"
 	"strings"
 	"time"

@@ -18,6 +19,7 @@ import (
 	dockerclient "github.com/docker/docker/client"
 	"github.com/gin-gonic/gin"

+	"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/providers"
 	"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/provisioner"
 )

@@ -41,10 +43,53 @@ func NewWorkspaceImageService(docker *dockerclient.Client) *WorkspaceImageServic
 	return &WorkspaceImageService{docker: docker}
 }

-// AllRuntimes is the canonical list mirroring docs/workspace-runtime-package.md.
-// Update both when a new template is added.
-var AllRuntimes = []string{
-	"claude-code", "codex", "hermes", "openclaw",
+// AllRuntimes is the canonical set of workspace runtimes this tenant will
+// pull/recreate template images for. It is DERIVED from the same providers
+// manifest SSOT (internal/providers/providers.yaml `runtimes:` block, mirrored
+// from CP's providers.yaml) that the rest of the platform routes against —
+// NOT a second hand-maintained list.
+//
+// Why derive instead of hardcode (controlplane#578): the old hardcoded slice
+// here ({claude-code, codex, hermes, openclaw}) silently DRIFTED from CP, which
+// already accepts `google-adk` for pin-promote/redeploy. A google-adk pin would
+// be accepted CP-side, then this tenant's POST /admin/workspace-images/refresh
+// ?runtime=google-adk rejected it 400 ("unknown runtime"), so google-adk image
+// fixes never deployed. Deriving from the manifest makes the tenant allowlist
+// and the CP allowlist provably the same set — they can't drift again.
+//
+// imageRefreshFallbackRuntimes is used ONLY if the embedded providers manifest
+// fails to load (which would be a build/CI failure caught by the providers
+// package's own tests, never a healthy prod). It preserves the historical
+// behavior — plus google-adk — so a manifest regression can never take the
+// refresh endpoint fully offline. Kept in lockstep with the providers.yaml
+// `runtimes:` keys; the drift guard in admin_workspace_images_test.go asserts
+// the two match.
+var imageRefreshFallbackRuntimes = []string{
+	"claude-code", "codex", "google-adk", "hermes", "openclaw",
+}
+
+// AllRuntimes is computed once at package init from the providers SSOT.
+var AllRuntimes = loadImageRefreshRuntimes()
+
+// loadImageRefreshRuntimes returns the sorted runtime names declared in the
+// providers manifest, falling back to imageRefreshFallbackRuntimes if the
+// manifest can't be loaded.
+func loadImageRefreshRuntimes() []string {
+	m, err := providers.LoadManifest()
+	if err != nil || len(m.Runtimes) == 0 {
+		if err != nil {
+			log.Printf("workspace-images: providers.LoadManifest failed (%v); falling back to static runtime allowlist", err)
+		}
+		out := append([]string(nil), imageRefreshFallbackRuntimes...)
+		sort.Strings(out)
+		return out
+	}
+	out := make([]string, 0, len(m.Runtimes))
+	for rt := range m.Runtimes {
+		out = append(out, rt)
+	}
+	sort.Strings(out)
+	return out
 }

 // RefreshResult is the per-call outcome surfaced to HTTP callers AND logged
@@ -197,7 +242,7 @@ func (s *WorkspaceImageService) Refresh(ctx context.Context, runtimes []string,

 // AdminWorkspaceImagesHandler serves POST /admin/workspace-images/refresh.
 //
-//	?runtime=claude-code   (optional; default = all 8 templates)
+//	?runtime=claude-code   (optional; default = all runtimes in AllRuntimes)
 //	&recreate=true|false   (default true; false = pull only)
 //
 // Returns JSON {pulled: [...], failed: [...], recreated: [...]}
@@ -3,7 +3,14 @@ package handlers
 import (
 	"encoding/base64"
 	"encoding/json"
+	"net/http"
+	"net/http/httptest"
+	"sort"
 	"testing"
+
+	"github.com/gin-gonic/gin"
+
+	"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/providers"
 )

 func TestGHCRAuthHeader_NoEnvReturnsEmpty(t *testing.T) {
@@ -92,6 +99,119 @@ func TestGHCRAuthHeader_RespectsRegistryEnv(t *testing.T) {
 	}
 }

+// runtimeListContains is a tiny membership helper for the runtime-allowlist tests.
+func runtimeListContains(s []string, v string) bool {
+	for _, x := range s {
+		if x == v {
+			return true
+		}
+	}
+	return false
+}
+
+// TestAllRuntimes_IncludesGoogleADK is the direct regression for
+// controlplane#578: a google-adk pin promote/redeploy is accepted CP-side, so
+// the tenant image-refresh allowlist MUST also accept google-adk or the image
+// fix never deploys (tenant returned 400 "unknown runtime"). google-adk lives
+// in the providers SSOT, so the derived AllRuntimes must contain it.
+func TestAllRuntimes_IncludesGoogleADK(t *testing.T) {
+	if !runtimeListContains(AllRuntimes, "google-adk") {
+		t.Fatalf("AllRuntimes must include google-adk (controlplane#578 drift); got %v", AllRuntimes)
+	}
+}
+
+// TestAllRuntimes_MatchesProvidersSSOT is the drift guard. AllRuntimes is
+// derived from providers.LoadManifest().Runtimes — assert it equals exactly the
+// runtime keys the providers manifest (mirrored from CP's providers.yaml)
+// declares. If CP adds/removes a runtime, this test fails RED until the tenant
+// re-derives, so the tenant image-refresh allowlist can never silently drift
+// from the CP pin-promote allowlist again.
+func TestAllRuntimes_MatchesProvidersSSOT(t *testing.T) {
+	m, err := providers.LoadManifest()
+	if err != nil {
+		t.Fatalf("providers.LoadManifest: %v", err)
+	}
+	want := make([]string, 0, len(m.Runtimes))
+	for rt := range m.Runtimes {
+		want = append(want, rt)
+	}
+	sort.Strings(want)
+
+	got := append([]string(nil), AllRuntimes...)
+	sort.Strings(got)
+
+	if len(got) != len(want) {
+		t.Fatalf("AllRuntimes drift: got %v, want %v (providers SSOT)", got, want)
+	}
+	for i := range want {
+		if got[i] != want[i] {
+			t.Fatalf("AllRuntimes drift at %d: got %v, want %v (providers SSOT)", i, got, want)
+		}
+	}
+}
+
+// TestImageRefreshFallbackMatchesSSOT pins the static fallback (used only when
+// the embedded manifest fails to load) to the providers SSOT. If a runtime is
+// added to providers.yaml but not to imageRefreshFallbackRuntimes, this fails
+// RED — so a manifest-load failure can't silently drop a supported runtime.
+func TestImageRefreshFallbackMatchesSSOT(t *testing.T) {
+	m, err := providers.LoadManifest()
+	if err != nil {
+		t.Fatalf("providers.LoadManifest: %v", err)
+	}
+	want := make([]string, 0, len(m.Runtimes))
+	for rt := range m.Runtimes {
+		want = append(want, rt)
+	}
+	sort.Strings(want)
+
+	got := append([]string(nil), imageRefreshFallbackRuntimes...)
+	sort.Strings(got)
+
+	if len(got) != len(want) {
+		t.Fatalf("fallback drift: got %v, want %v (providers SSOT)", got, want)
+	}
+	for i := range want {
+		if got[i] != want[i] {
+			t.Fatalf("fallback drift at %d: got %v, want %v (providers SSOT)", i, got, want)
+		}
+	}
+}
+
+// TestRefresh_RejectsUnknownRuntime asserts a genuinely unknown runtime still
+// 400s (the guard isn't removed) AND that the 400 body lists google-adk in
+// known_runtimes (proving the allowlist now advertises it). This exercises the
+// gin handler's reject branch, which runs entirely before any Docker call.
+func TestRefresh_RejectsUnknownRuntime(t *testing.T) {
+	gin.SetMode(gin.TestMode)
+
+	// nil docker client is safe: the unknown-runtime branch returns 400
+	// before svc.Refresh (which is the only path that touches Docker).
+	h := &AdminWorkspaceImagesHandler{svc: &WorkspaceImageService{}}
+
+	r := gin.New()
+	r.POST("/admin/workspace-images/refresh", h.Refresh)
+
+	req := httptest.NewRequest(http.MethodPost, "/admin/workspace-images/refresh?runtime=not-a-real-runtime", nil)
+	rec := httptest.NewRecorder()
+	r.ServeHTTP(rec, req)
+
+	if rec.Code != http.StatusBadRequest {
+		t.Fatalf("unknown runtime: got status %d, want 400; body=%s", rec.Code, rec.Body.String())
+	}
+
+	var body struct {
+		Error         string   `json:"error"`
+		KnownRuntimes []string `json:"known_runtimes"`
+	}
+	if err := json.Unmarshal(rec.Body.Bytes(), &body); err != nil {
+		t.Fatalf("decode 400 body: %v (raw=%s)", err, rec.Body.String())
+	}
+	if !runtimeListContains(body.KnownRuntimes, "google-adk") {
+		t.Errorf("400 known_runtimes must advertise google-adk (controlplane#578); got %v", body.KnownRuntimes)
+	}
+}
+
 func TestGHCRAuthHeader_TrimsWhitespace(t *testing.T) {
 	t.Setenv("MOLECULE_IMAGE_REGISTRY", "")
 	// .env lines often have trailing newlines or accidental spaces. Without
@@ -73,6 +73,7 @@ func (h *ChannelHandler) List(c *gin.Context) {
 		var config map[string]interface{}
 		if err := json.Unmarshal(configJSON, &config); err != nil {
 			log.Printf("Channels: unmarshal config for channel %s: %v", id, err)
+			config = map[string]interface{}{}
 		}
 		// #319: decrypt sensitive fields first so the mask operates on
 		// plaintext (first-4 / last-4 of the real token, not the ciphertext
@@ -94,6 +95,7 @@ func (h *ChannelHandler) List(c *gin.Context) {
 		var allowed []string
 		if err := json.Unmarshal(allowedJSON, &allowed); err != nil {
 			log.Printf("Channels: unmarshal allowed_users for channel %s: %v", id, err)
+			allowed = []string{}
 		}

 		entry := map[string]interface{}{
@@ -540,9 +542,11 @@ func (h *ChannelHandler) Webhook(c *gin.Context) {
 		}
 		if err := json.Unmarshal(configJSON, &row.Config); err != nil {
 			log.Printf("Channels: unmarshal config for webhook row %s: %v", row.ID, err)
+			row.Config = map[string]interface{}{}
 		}
 		if err := json.Unmarshal(allowedJSON, &row.AllowedUsers); err != nil {
 			log.Printf("Channels: unmarshal allowed_users for webhook row %s: %v", row.ID, err)
+			row.AllowedUsers = []string{}
 		}
 		if err := channels.DecryptSensitiveFields(row.Config); err != nil {
 			log.Printf("Channels: decrypt webhook row %s: %v", row.ID, err)
@@ -116,6 +116,56 @@ func TestChannelHandler_List(t *testing.T) {
 	}
 }

+func TestChannelHandler_List_InvalidJSON_FallsBack(t *testing.T) {
+	mock := setupTestDB(t)
+	handler := NewChannelHandler(newTestChannelManager())
+
+	rows := sqlmock.NewRows([]string{
+		"id", "workspace_id", "channel_type", "channel_config", "enabled",
+		"allowed_users", "last_message_at", "message_count", "created_at", "updated_at",
+	}).AddRow(
+		"ch-bad", "ws-1", "telegram",
+		[]byte(`{not valid json`),
+		true, []byte(`[also not json`), nil, 0, nil, nil,
+	)
+	mock.ExpectQuery("SELECT .* FROM workspace_channels WHERE workspace_id").
+		WithArgs("ws-1").
+		WillReturnRows(rows)
+
+	w := httptest.NewRecorder()
+	c, _ := gin.CreateTestContext(w)
+	c.Request, _ = http.NewRequest("GET", "/workspaces/ws-1/channels", nil)
+	c.Params = gin.Params{{Key: "id", Value: "ws-1"}}
+
+	handler.List(c)
+
+	if w.Code != 200 {
+		t.Errorf("expected 200, got %d", w.Code)
+	}
+
+	var result []map[string]interface{}
+	json.Unmarshal(w.Body.Bytes(), &result)
+	if len(result) != 1 {
+		t.Fatalf("expected 1 channel, got %d", len(result))
+	}
+
+	config, ok := result[0]["config"].(map[string]interface{})
+	if !ok {
+		t.Fatalf("expected config to be a map, got %T", result[0]["config"])
+	}
+	if len(config) != 0 {
+		t.Errorf("expected empty config after unmarshal fallback, got %v", config)
+	}
+
+	allowed, ok := result[0]["allowed_users"].([]interface{})
+	if !ok {
+		t.Fatalf("expected allowed_users to be a slice, got %T", result[0]["allowed_users"])
+	}
+	if len(allowed) != 0 {
+		t.Errorf("expected empty allowed_users after unmarshal fallback, got %v", allowed)
+	}
+}
+
 // ==================== Create ====================

 func TestChannelHandler_Create_Success(t *testing.T) {
@@ -546,6 +596,41 @@ func TestChannelHandler_Webhook_UnknownType(t *testing.T) {
 	}
 }

+// TestChannelHandler_Webhook_InvalidJSON_FallsBack verifies that when the DB
+// row contains invalid JSON for channel_config or allowed_users, the webhook
+// handler logs the error and falls back to an empty map/slice rather than
+// leaving the fields nil (which would panic on downstream code that expects
+// concrete values). With empty config there is no chat_id match, so the
+// handler returns {"status":"no_channel"}.
+func TestChannelHandler_Webhook_InvalidJSON_FallsBack(t *testing.T) {
+	mock := setupTestDB(t)
+	handler := NewChannelHandler(newTestChannelManager())
+
+	mock.ExpectQuery(`SELECT id, workspace_id, channel_type, channel_config, enabled, allowed_users FROM workspace_channels WHERE channel_type = .* AND enabled = true`).
+		WithArgs("telegram").
+		WillReturnRows(sqlmock.NewRows([]string{
+			"id", "workspace_id", "channel_type", "channel_config", "enabled", "allowed_users",
+		}).AddRow("ch-bad", "ws-1", "telegram", []byte(`{bad json`), true, []byte(`[bad json`)))
+
+	body := `{"update_id":1,"message":{"message_id":1,"from":{"id":111,"is_bot":false,"first_name":"Test","username":"testuser"},"chat":{"id":-100123,"title":"Test Group","type":"supergroup"},"date":1700000000,"text":"hello"}}`
+	w := httptest.NewRecorder()
+	c, _ := gin.CreateTestContext(w)
+	c.Request = httptest.NewRequest(http.MethodPost, "/webhooks/telegram", strings.NewReader(body))
+	c.Request.Header.Set("Content-Type", "application/json")
+	c.Params = gin.Params{{Key: "type", Value: "telegram"}}
+
+	handler.Webhook(c)
+
+	if w.Code != 200 {
+		t.Errorf("expected 200, got %d: %s", w.Code, w.Body.String())
+	}
+	var resp map[string]interface{}
+	json.Unmarshal(w.Body.Bytes(), &resp)
+	if resp["status"] != "no_channel" {
+		t.Errorf("expected status 'no_channel', got %v", resp["status"])
+	}
+}
+
 // ==================== Discover ====================

 func TestChannelHandler_Discover_MissingToken(t *testing.T) {
@@ -161,7 +161,7 @@ func (h *PluginsHandler) uninstallViaDocker(ctx context.Context, c *gin.Context,
 	// 1. Strip plugin's rule/fragment markers from CLAUDE.md (mirrors
 	//    AgentskillsAdaptor.uninstall lines 184-188). Best-effort: if
 	//    the user edited CLAUDE.md, our marker stays untouched.
-	h.stripPluginMarkersFromMemory(ctx, containerName, pluginName)
+	h.stripPluginMarkersFromMemory(ctx, workspaceID, containerName, pluginName)

 	// 2. Remove copied skill dirs declared in the plugin's plugin.yaml.
 	for _, skill := range skillNames {
@@ -171,9 +171,11 @@ func (h *PluginsHandler) uninstallViaDocker(ctx context.Context, c *gin.Context,
 			log.Printf("Plugin uninstall: skipping invalid skill name %q in %s: %v", skill, pluginName, err)
 			continue
 		}
-		_, _ = h.execAsRoot(ctx, containerName, []string{
+		if _, rmErr := h.execAsRoot(ctx, containerName, []string{
 			"rm", "-rf", "/configs/skills/" + skill,
-		})
+		}); rmErr != nil {
+			log.Printf("Plugin uninstall: failed to remove skill %s from %s: %v", skill, workspaceID, rmErr)
+		}
 	}

 	// 3. Delete the plugin directory itself (as root to handle file ownership).
@@ -393,7 +393,7 @@ func (h *PluginsHandler) readPluginSkillsFromContainer(ctx context.Context, cont
 // `# Plugin: <name> /` — mirrors AgentskillsAdaptor.uninstall's stripping
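-// logic so install/uninstall are symmetric. Best-effort: silent on read or
-// write failure, since the rest of uninstall must still succeed.
+// logic so install/uninstall are symmetric. Best-effort: an exec failure is
+// logged but not returned, since the rest of uninstall must still succeed.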
-func (h *PluginsHandler) stripPluginMarkersFromMemory(ctx context.Context, containerName, pluginName string) {
+func (h *PluginsHandler) stripPluginMarkersFromMemory(ctx context.Context, workspaceID, containerName, pluginName string) {
 	// Use sed via bash -c for atomic in-place delete: drop the marker line
 	// and the blank line that follows it (install adds a leading blank line
 	// before the marker via append_to_memory). Three sed passes mirror the
@@ -417,7 +417,9 @@ func (h *PluginsHandler) stripPluginMarkersFromMemory(ctx context.Context, conta
 		`awk 'BEGIN{skip=0; blanks=0} /^%s/{skip=1; blanks=0; next} skip==1 && /^[[:space:]]*$/{blanks++; if(blanks>=2){skip=0; print; next} next} /^# Plugin: /{if(skip==1)skip=0} skip==1{next} {print}' /configs/CLAUDE.md > /tmp/claude.new && mv /tmp/claude.new /configs/CLAUDE.md`,
 		regexpEscapeForAwk(marker),
 	)
-	_, _ = h.execAsRoot(ctx, containerName, []string{"bash", "-c", script})
+	if _, awkErr := h.execAsRoot(ctx, containerName, []string{"bash", "-c", script}); awkErr != nil {
+		log.Printf("Plugin uninstall: failed to strip markers from CLAUDE.md for %s in %s: %v", pluginName, workspaceID, awkErr)
+	}
 }

 // regexpEscapeForAwk escapes characters that have special meaning inside an
@@ -332,6 +332,7 @@ func (h *WorkspaceHandler) buildProvisionerConfig(
 		InstanceType:    payload.Compute.InstanceType,
 		DiskGB:          int32(payload.Compute.Volume.RootGB),
 		DataPersistence: payload.Compute.DataPersistence,
+		Provider:        payload.Compute.Provider,
 		Display: provisioner.WorkspaceDisplayConfig{
 			Mode:     payload.Compute.Display.Mode,
 			Width:    payload.Compute.Display.Width,
@@ -174,6 +174,11 @@ type WorkspaceCompute struct {
 	// disk (wiped each recreate — privacy); "" = auto (desktop-control persists,
 	// others follow the org flag). Forwarded verbatim to CP's data_persistence.
 	DataPersistence string `json:"data_persistence,omitempty"`
+	// Provider is the CLOUD/compute backend for this workspace box (multi-provider
+	// RFC, per-workspace): ""/"aws" = default EC2; "hetzner"/"gcp" route to the
+	// CP WorkspaceProvisioner. Distinct from the LLM/model provider. Forwarded to
+	// CP /cp/workspaces/provision `provider`.
+	Provider string `json:"provider,omitempty"`
 }

 type CreateWorkspacePayload struct {
@@ -16,7 +16,7 @@ const SchemaVersion = 1
 // Fingerprint is a stable content hash of the generated projection (schema
 // version + provider catalog + runtime native sets). It changes iff the
 // registry DATA changes (comment-only YAML edits do not churn it).
-const Fingerprint = "e457249eb0fd77a2"
+const Fingerprint = "acb3798aa8ec3cec"

 // GenProvider is the generated projection of one provider catalog entry —
 // the subset a downstream consumer needs to derive + display a provider.
@@ -56,7 +56,7 @@ var Providers = []GenProvider{
 	{Name: "kimi-coding", DisplayName: "Moonshot Kimi (coding-tuned)", Protocol: "anthropic", AuthMode: "third_party_anthropic_compat", AuthEnv: []string{"KIMI_API_KEY", "ANTHROPIC_API_KEY", "ANTHROPIC_AUTH_TOKEN"}, ModelPrefixMatch: "^kimi-", IsPlatform: false},
 	{Name: "deepseek", DisplayName: "DeepSeek", Protocol: "anthropic", AuthMode: "third_party_anthropic_compat", AuthEnv: []string{"DEEPSEEK_API_KEY", "ANTHROPIC_AUTH_TOKEN", "ANTHROPIC_API_KEY"}, ModelPrefixMatch: "^deepseek[-:/]", IsPlatform: false},
 	{Name: "google", DisplayName: "Google Gemini", Protocol: "openai", AuthMode: "third_party_anthropic_compat", AuthEnv: []string{"GEMINI_API_KEY", "GOOGLE_API_KEY"}, ModelPrefixMatch: "^gemini-", IsPlatform: false},
-	{Name: "vertex", DisplayName: "Google Vertex AI (keyless ADC)", Protocol: "openai", AuthMode: "third_party_anthropic_compat", AuthEnv: []string{"GOOGLE_APPLICATION_CREDENTIALS"}, ModelPrefixMatch: "^vertex:", IsPlatform: false},
+	{Name: "vertex", DisplayName: "Google Vertex AI (keyless ADC)", Protocol: "openai", AuthMode: "wif_adc", AuthEnv: []string{"GOOGLE_APPLICATION_CREDENTIALS"}, ModelPrefixMatch: "^vertex:", IsPlatform: false},
 	{Name: "alibaba", DisplayName: "Alibaba Qwen (DashScope)", Protocol: "openai", AuthMode: "third_party_anthropic_compat", AuthEnv: []string{"DASHSCOPE_API_KEY", "ALIBABA_API_KEY"}, ModelPrefixMatch: "(?i)^(qwen|alibaba[:/])", IsPlatform: false},
 	{Name: "nousresearch", DisplayName: "Nous Research (Hermes)", Protocol: "openai", AuthMode: "third_party_anthropic_compat", AuthEnv: []string{"NOUSRESEARCH_API_KEY"}, ModelPrefixMatch: "^nousresearch[:/]", IsPlatform: false},
 	{Name: "openrouter", DisplayName: "OpenRouter (any model)", Protocol: "openai", AuthMode: "third_party_anthropic_compat", AuthEnv: []string{"OPENROUTER_API_KEY"}, ModelPrefixMatch: "^openrouter[:/]", IsPlatform: false},
@@ -99,7 +99,7 @@ var Runtimes = map[string][]GenRuntimeRef{
 	},
 	"google-adk": {
 		{Name: "platform", Models: []string{"platform:gemini-2.5-pro", "platform:gemini-2.5-flash"}},
-		{Name: "google", Models: []string{"gemini-2.5-pro", "gemini-2.5-flash"}},
+		{Name: "google", Models: []string{"gemini-2.5-pro", "gemini-2.5-flash", "google_genai:gemini-2.5-pro", "google_genai:gemini-2.5-flash"}},
 	},
 	"hermes": {
 		{Name: "kimi-coding", Models: []string{"kimi-coding/kimi-k2"}},
@@ -28,9 +28,20 @@
 #   display_name       canvas dropdown label
 #   vendor_logo        canvas asset key
 #   protocol           openai | anthropic   (proxy wire format)
-#   auth_mode          anthropic_api | oauth | third_party_anthropic_compat
-#   base_url_template  base URL for the openai-protocol surface (null = CLI/SDK default)
+#   auth_mode          anthropic_api | oauth | third_party_anthropic_compat |
+#                      wif_adc (keyless AWS→GCP WIF server-side mint; the one
+#                      value the proxy ACTS on — triggers vertexauth.Token)
+#   base_url_template  base URL for the openai-protocol surface (null = CLI/SDK
+#                      default). MAY contain {placeholder} tokens resolved at
+#                      resolution time from endpoint_vars (RFC vertex-provider-
+#                      ssot-endpoint §Design 1) — e.g. vertex's {location}/{project}.
 #   base_url_anthropic base URL for the anthropic-protocol surface (where applicable)
+#   endpoint_vars      OPTIONAL map placeholder -> {env, default}: how each
+#                      {placeholder} in base_url_template is resolved (env when
+#                      set + non-empty, else default — the structured form of the
+#                      proxy's envOr). Empty/absent = static URL (today's shape).
+#   wire_model_prefix  OPTIONAL publisher prefix the upstream expects on the wire
+#                      model id ("google/" for vertex). Empty = unprefixed.
 #   auth_env           env var names accepted (NAMES ONLY — never secrets); any one satisfies auth
 #   auth_token_env     env var the adapter projects the vendor key INTO (default ANTHROPIC_AUTH_TOKEN)
 #   model_prefix_match RE2 regex unifying proxy inferLLMProvider prefixes +
@@ -428,15 +439,34 @@ providers:
  #
  # NOTE: display_name ("keyless ADC") and auth_env (GOOGLE_APPLICATION_CREDENTIALS)
  # are now VESTIGIAL — no consumer reads auth_env post-leak-fix, but it must stay
-  # non-empty (providers.go validate). Left as-is to keep this a comment-only,
-  # regen-free change; retiring them is a registry-regen follow-up.
+  # non-empty (providers.go validate). Retiring them is a follow-up.
+  #
+  # RFC vertex-provider-ssot-endpoint (Phase 1): the endpoint that used to live
+  # ONLY in llm_proxy.go's `case "google", "vertex":` fmt.Sprintf is now
+  # expressed HERE as a templated base_url_template + endpoint_vars, and the
+  # keyless WIF mint is declared via auth_mode: wif_adc. The proxy resolves this
+  # row through Manifest.ResolveEndpoint — the interpolated URL is BYTE-IDENTICAL
+  # to the former fmt.Sprintf (drift-gated by TestProxyEndpointsMatchManifest).
+  # wire_model_prefix replaces the proxy's inline `if !HasPrefix(wireModel,
+  # "google/")`. (Phase 2 migrates the remaining static providers; out of scope.)
  - name: vertex
    display_name: "Google Vertex AI (keyless ADC)"
    vendor_logo: "google"
    protocol: openai
-    auth_mode: third_party_anthropic_compat
-    base_url_template: null
+    # wif_adc (AuthModeWIFADC): keyless AWS→GCP Workload Identity Federation
+    # server-side token mint (internal/vertexauth.Token). The ONE auth_mode the
+    # proxy acts on — it triggers the mint instead of a hardcoded `case "vertex"`.
+    auth_mode: wif_adc
+    # Templated endpoint: {location}/{project} interpolated from endpoint_vars
+    # below. Reproduces the proxy's former
+    # fmt.Sprintf("https://%s-aiplatform.googleapis.com/v1beta1/projects/%s/locations/%s/endpoints/openapi", loc, proj, loc).
+    base_url_template: "https://{location}-aiplatform.googleapis.com/v1beta1/projects/{project}/locations/{location}/endpoints/openapi"
    base_url_anthropic: null
+    endpoint_vars:
+      location: { env: MOLECULE_VERTEX_LOCATION, default: us-central1 }
+      project: { env: MOLECULE_VERTEX_PROJECT, default: molecule-vertex }
+    # Vertex requires the publisher-prefixed model id on the wire (google/<model>).
+    wire_model_prefix: "google/"
    auth_env: [GOOGLE_APPLICATION_CREDENTIALS]
    auth_token_env: ANTHROPIC_AUTH_TOKEN
    model_prefix_match: "^vertex:"
@@ -1028,7 +1058,26 @@ runtimes:
          - platform:gemini-2.5-pro
          - platform:gemini-2.5-flash
      # API-key BYOK arm: AI Studio (the tenant's OWN GOOGLE_API_KEY).
+      #
+      # The colon-namespaced `google_genai:` ids are the BYOK spelling the
+      # template's models[] offers (template-google-adk main:
+      # `google_genai:gemini-2.5-pro` / `-flash`, AI-Studio BYOK; the default is
+      # the platform arm above). The runtime adapter (_routing.resolve_model)
+      # treats `google_genai`/`google`/`gemini` as the SAME AI-Studio prefix
+      # family — it strips the prefix to the bare `gemini-2.5-pro` and serves it
+      # via ADK LlmAgent on the AI-Studio backend (GOOGLE_API_KEY) — so both the
+      # bare and `google_genai:` forms resolve to THIS `google` arm. The bare ids
+      # stay (registry-projection / canvas form); the `google_genai:` ids are
+      # ADDED because the LIVE core check is EXACT membership in ModelsForRuntime,
+      # NOT the `^gemini-` prefix (model_registry_validation.go), so without these
+      # exact entries a template BYOK create 422s UNREGISTERED_MODEL_FOR_RUNTIME.
+      # (This corrects the template's own stale comment that `google_genai:` is
+      # covered by the `^gemini-` prefix — it is not; `^gemini-` matches only the
+      # BARE id.) Vertex was intentionally dropped from the runtime arm (cp#514);
+      # the template no longer offers `vertex:`, so no vertex arm is added here.
      - name: google
        models:
          - gemini-2.5-pro
          - gemini-2.5-flash
+          - google_genai:gemini-2.5-pro
+          - google_genai:gemini-2.5-flash
@@ -29,7 +29,7 @@ import (
 // canonicalProvidersYAMLSHA256 is the sha256 of the canonical providers.yaml as
 // synced from molecule-controlplane. Bumped deliberately on each re-sync (see
 // file doc). Cross-checked live by the sync-providers-yaml CI workflow.
-const canonicalProvidersYAMLSHA256 = "9eb6f97fc37b528c91936be4a75dd87f6c7172742b4535d76b9bb2231ee18e80"
+const canonicalProvidersYAMLSHA256 = "ab51d3faa21348696bf53cffe241ac07d0762c4074207264efe4f58f7591c4dc"

 func TestSyncedYAMLMatchesCanonicalSHA(t *testing.T) {
 	sum := sha256.Sum256(embeddedYAML)
@@ -161,6 +161,9 @@ type cpProvisionRequest struct {
 	Tier         int    `json:"tier"`
 	InstanceType string `json:"instance_type,omitempty"`
 	DiskGB       int32  `json:"disk_gb,omitempty"`
+	// Provider routes the CP to the compute backend for this workspace box
+	// (multi-provider RFC, per-workspace). Distinct from the LLM/model provider.
+	Provider string `json:"provider,omitempty"`
 	// DataPersistence is the per-workspace durable-data choice (internal#734);
 	// CP validates the enum at its provision edge and resolves the data volume
 	// from it. Empty = auto (omitted on the wire).
@@ -257,6 +260,7 @@ func (p *CPProvisioner) Start(ctx context.Context, cfg WorkspaceConfig) (string,
 		InstanceType:    cfg.InstanceType,
 		DiskGB:          cfg.DiskGB,
 		DataPersistence: cfg.DataPersistence,
+		Provider:        cfg.Provider,
 		Display:         cfg.Display,
 		PlatformURL:     cfg.PlatformURL,
 		Env:             env,
@@ -100,6 +100,7 @@ type WorkspaceConfig struct {
 	InstanceType    string // Optional CP EC2 instance type override (SaaS only)
 	DiskGB          int32  // Optional CP root volume size override in GiB (SaaS only)
 	DataPersistence string // internal#734: "persist"|"ephemeral"|"" — durable-data choice forwarded to CP (SaaS only)
+	Provider        string // multi-provider RFC: ""/"aws"|"hetzner"|"gcp" compute backend for the workspace box (per-workspace; distinct from LLM/model provider). Forwarded to CP.
 	Display         WorkspaceDisplayConfig
 	EnvVars         map[string]string // Additional env vars (API keys, etc.)
 	PlatformURL     string
@@ -0,0 +1,322 @@
+//go:build staging_e2e
+
+package staginge2e
+
+import (
+	"fmt"
+	"net/http"
+	"testing"
+	"time"
+)
+
+// TestDataVolumeSurvivesRecreate_Staging closes the data-persistence coverage
+// gap flagged in core#2332 (P0.5): "data-volume survives recreate" and
+// "snapshot-before-container-swap (/home/agent not wiped)" had NO e2e, and both
+// map to a real past incident — feedback_workspace_container_swap_wipes_home_agent:
+// on a container swap, only the /configs + /workspace binds (the durable data
+// volume, cp#326) survive; the container's own $HOME (/home/agent) is ephemeral
+// and is WIPED unless a snapshot is taken BEFORE docker stop+rm+run.
+//
+// This is the FORWARD half of that incident: prove the durable-data invariant
+// holds across a recreate so a future regression that drops the data-volume
+// reattach (or that flips a "persist" workspace to ephemeral) fails LOUD here
+// instead of silently eating a customer's /workspace state.
+//
+// What it does, end-to-end, against a real staging tenant:
+//  0. Provision a throwaway org + tenant via the CP admin API and acquire the
+//     tenant admin token (shared harness — mirrors workspace_lifecycle_test.go).
+//  1. Create a workspace with compute.data_persistence="persist" (the durable
+//     data-volume choice, internal#734) and wait for it to come ONLINE.
+//  2. Write a unique sentinel into /workspace (?root=/workspace) — the data
+//     volume per cp#326 — via the tenant Files API.
+//  3. Probe the /home/agent (container-$HOME) surface to encode the documented
+//     contract for the ephemeral side (see assertAgentHomeContract).
+//  4. Trigger a recreate / container-swap on the SAME data volume via
+//     POST /workspaces/:id/restart, and wait for ONLINE again.
+//  5. Assert the /workspace sentinel SURVIVES (data volume reattached +
+//     persisted). This is the load-bearing assertion — a wipe here is the
+//     regression we are gating.
+//
+// Guarded by the staging_e2e build tag and STAGING_E2E=1 env gate. Teardown is
+// t.Cleanup-driven (admin DELETE /cp/admin/tenants + DELETE /workspaces/:id).
+// Promote-to-required is a CTO call (infra-bound; see doc.go).
+func TestDataVolumeSurvivesRecreate_Staging(t *testing.T) {
+	cfg := requireStagingEnv(t)
+
+	// Unique-per-run sentinel so a stale prior run can never make a wiped
+	// volume look "survived" (we compare exact content, not mere existence).
+	stamp := time.Now().UnixNano()
+	relPath := fmt.Sprintf("e2e-persist/%d.sentinel", stamp)
+
+	// The org slug is derived from wall-clock seconds (mod 1e8), independently
+	// of the nanosecond stamp used for the sentinel above.
+	slug := fmt.Sprintf("e2e-persist-%d", time.Now().Unix()%100000000)
+	t.Logf("data-persistence: slug=%s", slug)
+
+	// --- Step 0: provision org + tenant, acquire token + wait TLS ready ---
+	orgID := adminCreateOrg(t, cfg, slug)
+	t.Cleanup(func() { adminDeleteTenant(t, cfg, slug) })
+	t.Logf("org created: org_id=%s", orgID)
+
+	token := tenantAdminToken(t, cfg, slug)
+	tenantHost := slug + "." + cfg.subdomainSuffix
+	waitForHTTP(t, tenantHost, http.StatusOK, 10*time.Minute, "tenant /health ready")
+	t.Logf("tenant TLS ready: %s", tenantHost)
+
+	sentinel := fmt.Sprintf("data-volume-survives-recreate stamp=%d host=%s", stamp, tenantHost)
+
+	// --- Step 1: create workspace with durable data persistence ---
+	wsID := createPersistWorkspace(t, tenantHost, token, orgID, stamp)
+	// t.Cleanup callbacks run last-registered-first, so this workspace delete
+	// runs BEFORE the tenant delete registered in Step 0.
+	t.Cleanup(func() { deletePersistWorkspace(t, tenantHost, token, orgID, wsID) })
+	t.Logf("workspace created: id=%s (data_persistence=persist)", wsID)
+
+	waitForWorkspaceOnline(t, tenantHost, token, orgID, wsID, 20*time.Minute)
+	t.Logf("workspace %s ONLINE", wsID)
+
+	// --- Step 2: write the /workspace sentinel (data volume, cp#326) ---
+	writeWorkspaceFile(t, tenantHost, token, orgID, wsID, "/workspace", relPath, sentinel)
+	t.Logf("wrote /workspace sentinel: root=/workspace path=%s", relPath)
+
+	// Read it straight back so a write that silently no-op'd can't masquerade
+	// as a survived-recreate later. This also confirms the EIC write landed on
+	// the host data volume before we swap the container out from under it.
+	if got := readWorkspaceFile(t, tenantHost, token, orgID, wsID, "/workspace", relPath); got != sentinel {
+		t.Fatalf("pre-recreate readback mismatch: wrote %q, read %q", sentinel, got)
+	}
+	t.Logf("pre-recreate readback OK")
+
+	// --- Step 3: encode the /home/agent (ephemeral container-$HOME) contract ---
+	assertAgentHomeContract(t, tenantHost, token, orgID, wsID, stamp)
+
+	// A successful Files write to a SaaS workspace can itself debounce-trigger
+	// an auto-restart (internal#624). Settle that window first so our explicit
+	// recreate below is the swap we actually measure, not a coalesced one that
+	// races our readback.
+	settleAutoRestart(t, tenantHost, token, orgID, wsID)
+
+	// --- Step 4: recreate / container-swap on the SAME data volume ---
+	// POST /restart is the recreate path: Stop (prune=false ALWAYS for restart,
+	// so the data volume is NEVER erased) -> re-provision on the same volume,
+	// templates NOT re-applied. See workspace_restart.go runRestartCycle.
+	triggerRecreate(t, tenantHost, token, orgID, wsID)
+	t.Logf("recreate (container swap) triggered via POST /restart")
+
+	// The swap flips status to 'provisioning'; wait for it to come back ONLINE.
+	waitForRecreateThenOnline(t, tenantHost, token, orgID, wsID, 20*time.Minute)
+	t.Logf("workspace %s back ONLINE after recreate", wsID)
+
+	// --- Step 5: LOAD-BEARING — the /workspace sentinel must SURVIVE ---
+	// readWorkspaceFile returns "" on a 404, so a wiped (missing) file lands in
+	// the mismatch branch below rather than failing inside the helper.
+	got := readWorkspaceFile(t, tenantHost, token, orgID, wsID, "/workspace", relPath)
+	if got != sentinel {
+		t.Fatalf("DATA-VOLUME REGRESSION: /workspace sentinel did NOT survive recreate.\n"+
+			"  wrote: %q\n  read:  %q\n"+
+			"  This is the cp#326 durable-data-volume invariant: a 'persist' workspace's\n"+
+			"  /workspace MUST survive a container swap. A wipe here means the data volume\n"+
+			"  was not reattached (or a persist→ephemeral regression). See\n"+
+			"  feedback_workspace_container_swap_wipes_home_agent.", sentinel, got)
+	}
+	t.Logf("PASS: /workspace sentinel SURVIVED recreate — data-volume invariant holds (cp#326)")
+}
+
+// assertAgentHomeContract encodes the CORRECT, documented expectation for the
+// /home/agent (container-$HOME) side of the incident.
+//
+// The Files API exposes the container's own $HOME via ?root=/agent-home (the
+// docker-exec backend, internal#425 RFC). That backend is intentionally STUBBED
+// today: every verb returns 501 Not Implemented. So there is NO supported
+// platform write path into the container's /home/agent — which is precisely
+// because that directory is EPHEMERAL: it lives inside the container, not on the
+// durable data volume, and is WIPED on every container swap unless a snapshot is
+// taken first (the incident's snapshot-before-stop+rm+run rule, which is a
+// CP-side provisioner concern, not a tenant ws-server file-API surface).
+//
+// This assertion is the regression tripwire for that contract: if a future
+// change wires /agent-home to a path WITHOUT also making it data-volume-backed,
+// the 501 flips to a success status and the test fails LOUD — forcing whoever
+// lit up the surface to first answer "is /home/agent now durable, and was the
+// snapshot hook added?" rather than silently shipping a wipe-on-recreate
+// surface.
+//
+// Outcomes of the probe PUT:
+//   - 501:            documented stub contract, logged as OK.
+//   - any 2xx:        the write was accepted, so the surface is live — fatal.
+//                     (ANY success status counts, not only 200: a handler that
+//                     answers 201 Created or 204 No Content has accepted the
+//                     write just the same, and must not slip through as
+//                     "acceptable".)
+//   - other 5xx:      a real server bug — fatal.
+//   - everything else (in practice 4xx such as 400/404): still not a durable
+//     write surface, logged as acceptable.
+//
+// We do NOT write-then-recreate-then-expect-wipe on /home/agent: asserting a
+// WIPE as a pass would be fail-open (a no-op write would also "pass"). Pinning
+// the 501 contract is the fail-closed encoding.
+func assertAgentHomeContract(t *testing.T, host, token, orgID, wsID string, stamp int64) {
+	t.Helper()
+	rel := fmt.Sprintf("e2e-persist/%d.home.sentinel", stamp)
+	url := fmt.Sprintf("https://%s/workspaces/%s/files/%s?root=%s",
+		host, wsID, rel, "/agent-home")
+	status, body := doTenantJSON(t, "PUT", url, token, orgID, fmt.Sprintf(`{"content":%q}`, "x"))
+
+	switch {
+	case status == http.StatusNotImplemented:
+		// Documented contract: container-$HOME browse/write is stubbed BECAUSE
+		// it is ephemeral. No durable surface to assert survival on. Good.
+		t.Logf("/home/agent contract OK: /agent-home is 501 (ephemeral container-$HOME, no durable write surface — snapshot-before-swap is a CP-side concern)")
+	case status >= 200 && status < 300:
+		// The stub was lit up (the write was accepted). This is a contract
+		// change that MUST be paired with data-volume backing + a
+		// snapshot-before-swap hook; until this test is extended to prove
+		// BOTH, treat the bare flip as a regression of the documented
+		// ephemeral contract.
+		t.Fatalf("CONTRACT DRIFT: PUT ?root=/agent-home returned %d — the container-$HOME surface was wired up.\n"+
+			"  Per feedback_workspace_container_swap_wipes_home_agent, /home/agent is EPHEMERAL and wiped on\n"+
+			"  container swap unless snapshotted first. If this surface is now durable, EXTEND this test to\n"+
+			"  write→recreate→assert-survival on /home/agent AND assert the snapshot-before-swap hook fired.\n"+
+			"  Do not leave a write-able-but-ephemeral surface uncovered. body=%s", status, body)
+	case status >= 500:
+		// A 5xx that ISN'T the 501 stub is a real bug on the server side.
+		t.Fatalf("/home/agent contract probe: unexpected %d (want 501 or a 4xx): %s", status, body)
+	default:
+		// A rejection such as 400/404 is acceptable — still "not a durable
+		// write surface".
+		t.Logf("/home/agent contract: ?root=/agent-home returned %d (non-durable surface) — acceptable", status)
+	}
+}
+
+// --- workspace lifecycle over the tenant API ------------------------------
+
+// createPersistWorkspace POSTs /workspaces on the tenant to create a throwaway
+// workspace and returns its id. compute.data_persistence is pinned to
+// "persist" (internal#734) instead of being left to the auto/org-flag default,
+// so the durable-volume invariant under test is unambiguous. A status other
+// than 201/200, or a response carrying no id, fails the test.
+func createPersistWorkspace(t *testing.T, host, token, orgID string, stamp int64) string {
+	t.Helper()
+	name := fmt.Sprintf("e2e-persist-%d", stamp%100000000)
+	payload := fmt.Sprintf(
+		`{"name":%q,"runtime":%q,"tier":%d,"compute":{"data_persistence":%q}}`,
+		name, "claude-code", 1, "persist",
+	)
+	code, raw := doTenantJSON(t, "POST", "https://"+host+"/workspaces", token, orgID, payload)
+	if !(code == http.StatusCreated || code == http.StatusOK) {
+		t.Fatalf("create workspace: HTTP %d: %s", code, raw)
+	}
+	if wsID := jsonField(raw, "id"); wsID != "" {
+		return wsID
+	}
+	t.Fatalf("create workspace: no id in response: %s", raw)
+	return "" // unreachable: Fatalf stops the test goroutine
+}
+
+// deletePersistWorkspace is the best-effort t.Cleanup teardown for the
+// workspace: it issues a plain DELETE (no prune) and only LOGS the outcome, so
+// it can never fail the test. 200/202/204/404 count as "deleted"; any other
+// status is logged as a warning so an operator can clean up by hand. (The
+// org/tenant itself is torn down separately via adminDeleteTenant.)
+func deletePersistWorkspace(t *testing.T, host, token, orgID, wsID string) {
+	t.Helper()
+	code, raw := doTenantJSON(t, "DELETE", "https://"+host+"/workspaces/"+wsID, token, orgID, "")
+	switch code {
+	case http.StatusOK, http.StatusAccepted, http.StatusNoContent, http.StatusNotFound:
+		t.Logf("teardown: deleted workspace %s (HTTP %d)", wsID, code)
+	default:
+		t.Logf("WARNING: teardown DELETE workspace %s returned HTTP %d: %s (manual cleanup may be needed)", wsID, code, raw)
+	}
+}
+
+// waitForWorkspaceOnline polls GET /workspaces/:id every 10s until the
+// reported status is "online", failing the test once timeout has elapsed.
+// Non-200 polls are ignored (they do not update the last-seen status).
+func waitForWorkspaceOnline(t *testing.T, host, token, orgID, wsID string, timeout time.Duration) {
+	t.Helper()
+	endpoint := "https://" + host + "/workspaces/" + wsID
+	giveUpAt := time.Now().Add(timeout)
+	var seen string
+	for time.Now().Before(giveUpAt) {
+		if code, raw := doTenantJSON(t, "GET", endpoint, token, orgID, ""); code == http.StatusOK {
+			if seen = jsonField(raw, "status"); seen == "online" {
+				return
+			}
+		}
+		time.Sleep(10 * time.Second)
+	}
+	t.Fatalf("workspace %s did not reach status=online within %s (last=%q)", wsID, timeout, seen)
+}
+
+// triggerRecreate kicks off the recreate / container-swap by POSTing
+// /workspaces/:id/restart. Per the handler (workspace_restart.go's
+// cpStopWithRetryErr), restart stops with prune=false and re-provisions on the
+// SAME data volume, so a recreate can NEVER erase it. Any status other than
+// 200/202 fails the test.
+func triggerRecreate(t *testing.T, host, token, orgID, wsID string) {
+	t.Helper()
+	endpoint := "https://" + host + "/workspaces/" + wsID + "/restart"
+	switch code, raw := doTenantJSON(t, "POST", endpoint, token, orgID, ""); code {
+	case http.StatusOK, http.StatusAccepted:
+		// Restart accepted.
+	default:
+		t.Fatalf("trigger recreate (POST /restart): HTTP %d: %s", code, raw)
+	}
+}
+
+// waitForRecreateThenOnline rides out the container swap in two phases.
+//
+// Phase 1 (best-effort, up to 90s, 3s poll): watch for the status to LEAVE
+// "online" so a stale pre-swap "online" is not mistaken for recovery. Missing
+// the dip (fast swap) is fine.
+//
+// Phase 2 (10s poll): wait for status "online" again. The overall timeout is
+// measured from function entry, so phase 1 consumes part of it. The
+// load-bearing assertion is the caller's sentinel read, not this transient
+// state machine.
+func waitForRecreateThenOnline(t *testing.T, host, token, orgID, wsID string, timeout time.Duration) {
+	t.Helper()
+	endpoint := "https://" + host + "/workspaces/" + wsID
+	// Fixed up front so the dip window below counts against the total budget.
+	giveUpAt := time.Now().Add(timeout)
+
+	// Phase 1: try to observe the provisioning dip (not required).
+	for dipEnd := time.Now().Add(90 * time.Second); time.Now().Before(dipEnd); time.Sleep(3 * time.Second) {
+		code, raw := doTenantJSON(t, "GET", endpoint, token, orgID, "")
+		if code != http.StatusOK {
+			continue
+		}
+		if jsonField(raw, "status") != "online" {
+			break
+		}
+	}
+
+	// Phase 2: wait for the workspace to report online again.
+	var seen string
+	for time.Now().Before(giveUpAt) {
+		if code, raw := doTenantJSON(t, "GET", endpoint, token, orgID, ""); code == http.StatusOK {
+			if seen = jsonField(raw, "status"); seen == "online" {
+				return
+			}
+		}
+		time.Sleep(10 * time.Second)
+	}
+	t.Fatalf("workspace %s did not return to status=online after recreate within %s (last=%q)", wsID, timeout, seen)
+}
+
+// settleAutoRestart absorbs the internal#624 file-write→restart debounce so the
+// explicit recreate we measure isn't coalesced with an implicit one. The
+// debounce window is 15s + a restart cycle; we poll back to a stable online.
+func settleAutoRestart(t *testing.T, host, token, orgID, wsID string) {
+	t.Helper()
+	// Give the debounce window time to fire (or not) ...
+	time.Sleep(20 * time.Second)
+	// ... then ensure we're back to a stable online before the measured swap.
+	waitForWorkspaceOnline(t, host, token, orgID, wsID, 10*time.Minute)
+}
+
+// --- tenant Files API ------------------------------------------------------
+
+// writeWorkspaceFile PUTs a file via the tenant Files API into the given root
+// and fails the test on any status other than 200.
+// root="/workspace" is the literal data-volume path (cp#326); relPath is the
+// file path relative to that root (no leading slash).
+func writeWorkspaceFile(t *testing.T, host, token, orgID, wsID, root, relPath, content string) {
+	t.Helper()
+	url := fmt.Sprintf("https://%s/workspaces/%s/files/%s?root=%s",
+		host, wsID, relPath, root)
+	status, body := doTenantJSON(t, "PUT", url, token, orgID, fmt.Sprintf(`{"content":%q}`, content))
+	if status != http.StatusOK {
+		// root and relPath are joined with "/" so the reported path is
+		// readable ("/workspace/e2e-persist/…", not "/workspacee2e-persist/…").
+		t.Fatalf("write %s/%s: HTTP %d: %s", root, relPath, status, body)
+	}
+}
+
+// readWorkspaceFile GETs a file via the tenant Files API and returns its
+// content.
+//
+// A 404 is NOT fatal here: it is returned as "" so the caller's exact-content
+// compare can emit the DATA-VOLUME REGRESSION message (a not-found after a
+// recreate is the wipe we are gating). Any other non-200 — e.g. a
+// transport/auth failure — fails the test loud right here.
+func readWorkspaceFile(t *testing.T, host, token, orgID, wsID, root, relPath string) string {
+	t.Helper()
+	url := fmt.Sprintf("https://%s/workspaces/%s/files/%s?root=%s",
+		host, wsID, relPath, root)
+	status, body := doTenantJSON(t, "GET", url, token, orgID, "")
+	if status == http.StatusNotFound {
+		// Surface the not-found as empty content; the caller's exact-content
+		// compare turns this into the DATA-VOLUME REGRESSION message.
+		return ""
+	}
+	if status != http.StatusOK {
+		// root and relPath are joined with "/" so the reported path is
+		// readable ("/workspace/e2e-persist/…", not "/workspacee2e-persist/…").
+		t.Fatalf("read %s/%s: HTTP %d: %s", root, relPath, status, body)
+	}
+	return jsonField(body, "content")
+}
@@ -0,0 +1,27 @@
+// Package staginge2e holds live, against-real-staging-infra end-to-end tests
+// for molecule-core's workspace-server that are NOT part of the normal
+// `go test ./...` run and NOT part of any unit/httptest suite.
+//
+// Every test here is guarded by the `staging_e2e` build tag AND skips itself
+// at runtime unless the required staging credentials are present in the
+// environment (see requireStagingEnv). So:
+//
+//	go test ./...                      # compiles nothing here (tag absent)
+//	go test -tags=staging_e2e ./...    # compiles; skips LOUD if creds absent
+//	STAGING_E2E=1 CP_BASE_URL=... CP_ADMIN_API_TOKEN=... \
+//	  go test -tags=staging_e2e -run TestWorkspaceLifecycle_Staging \
+//	  -timeout 40m ./internal/staginge2e/
+//
+// These tests provision a REAL throwaway tenant (real EC2-backed workspace on
+// staging) via the CP admin API, drive the workspace lifecycle endpoints
+// against the live tenant ws-server, and assert OBSERVABLE container-state
+// transitions (status + serve reachability) — not just HTTP 200. Teardown is
+// t.Cleanup-driven (admin DELETE /cp/admin/tenants).
+//
+// Run them from the operator host (or CI on dispatch/schedule) where the
+// staging CP admin surface + tenant DNS are reachable.
+//
+// This suite is advisory-by-infra: it needs a live staging tenant, so it is
+// NOT a merge-blocking required check. Promotion to required is a separate CTO
+// decision (mirrors the cp internal/staginge2e suite, cp#386).
+package staginge2e
@@ -0,0 +1,596 @@
+//go:build staging_e2e
+
+package staginge2e
+
+import (
+	"fmt"
+	"net/http"
+	"os"
+	"strings"
+	"testing"
+	"time"
+)
+
+// TestWorkspaceLifecycle_Staging is the live, against-real-staging end-to-end
+// test for core#2332 P1.10 — workspace lifecycle (soft-restart / pause / resume
+// / hibernate) coverage.
+//
+// What it proves that the handler unit tests (httptest in
+// internal/handlers/*_test.go) cannot: that against a REAL EC2-backed tenant
+// workspace, the lifecycle endpoints actually transition the CONTAINER state
+// and recover — not just flip a DB flag or return HTTP 200.
+//
+// Pipeline:
+//
+//  1. Provision a throwaway org + tenant via the CP admin API.
+//
+//  2. Acquire the tenant admin token (accepted by ws-server WorkspaceAuth as
+//     ADMIN_TOKEN — see middleware/wsauth_middleware.go).
+//
+//  3. Create a workspace via the tenant ws-server; wait for status=online with
+//     a routable url (the real boot→register signal).
+//
+//  4. Drive each lifecycle endpoint and assert OBSERVABLE state:
+//
+//     soft restart (POST /restart):
+//     online → provisioning → online, and a post-restart serve probe (A2A
+//     round-trip) succeeds — proves the container came back serveable, not
+//     just that the row flipped.
+//
+//     pause (POST /pause):
+//     → paused, AND the container is genuinely stopped — observed via the
+//     tenant API as: url cleared + the workspace no longer serves A2A
+//     (a stopped EC2/container is unreachable; a mere flag would still serve).
+//     resume (POST /resume):
+//     paused → provisioning → online + serveable again.
+//
+//     hibernate (POST /hibernate?force=true):
+//     online → hibernated, container stopped (url cleared, unserveable).
+//     wake (next A2A message):
+//     hibernated → online (auto-wake-on-message; Resume only handles paused).
+//
+// Status is read from the live DB-backed GET /workspaces/:id (canvas) endpoint
+// — the response body of the lifecycle POST could lie; the GET proves the row.
+//
+// The three subtests run sequentially against the SAME workspace: each one
+// ends by waiting for online+routable and a successful serve probe, which is
+// the starting state the next subtest assumes.
+//
+// Guarded by the staging_e2e build tag and STAGING_E2E=1 env gate. Teardown is
+// t.Cleanup-driven (admin DELETE /cp/admin/tenants).
+func TestWorkspaceLifecycle_Staging(t *testing.T) {
+	cfg := requireStagingEnv(t)
+
+	// The slug is derived from the Unix time in seconds (mod 1e8), so two runs
+	// started within the same second would produce the same slug.
+	slug := fmt.Sprintf("e2e-life-%d", time.Now().Unix()%100000000)
+	t.Logf("workspace-lifecycle: slug=%s", slug)
+
+	// --- Step 1: provision org via admin API ---
+	// NOTE: the teardown below is registered only after adminCreateOrg has
+	// returned; if adminCreateOrg fails the test first, this function has not
+	// registered any teardown for the slug.
+	orgID := adminCreateOrg(t, cfg, slug)
+	t.Cleanup(func() { adminDeleteTenant(t, cfg, slug) })
+	t.Logf("org created: org_id=%s", orgID)
+
+	// --- Step 1b: acquire tenant admin token + wait for tenant TLS ready ---
+	token := tenantAdminToken(t, cfg, slug)
+	// The tenant ws-server is addressed as <slug>.<subdomainSuffix>.
+	tenantHost := slug + "." + cfg.subdomainSuffix
+	waitForHTTP(t, tenantHost, http.StatusOK, 10*time.Minute, "tenant /health ready")
+	t.Logf("tenant TLS ready: %s", tenantHost)
+
+	// --- Step 2: create workspace + wait online (routable) ---
+	wsID := tenantCreateWorkspace(t, cfg, tenantHost, token, orgID)
+	waitForWorkspaceOnlineRoutable(t, tenantHost, token, orgID, wsID, 15*time.Minute, "initial boot")
+	t.Logf("workspace %s online + routable", wsID)
+
+	// Baseline: the freshly-online workspace must actually serve A2A.
+	assertServes(t, tenantHost, token, orgID, wsID, "baseline (post-boot)")
+
+	// ── soft restart ────────────────────────────────────────────────────────
+	// online → provisioning → online; container must come back serveable.
+	t.Run("restart", func(t *testing.T) {
+		status, body := postLifecycle(t, tenantHost, token, orgID, wsID, "/restart")
+		if status != http.StatusOK {
+			t.Fatalf("restart: HTTP %d: %s", status, body)
+		}
+		if st := jsonField(body, "status"); st != "provisioning" {
+			t.Fatalf("restart: body status=%q (expected provisioning): %s", st, body)
+		}
+		// The endpoint flips status→provisioning synchronously (before the HTTP
+		// response) then re-provisions in a goroutine. We don't hard-assert
+		// observing the intermediate 'provisioning' via GET: on a fast box the
+		// row can race back to online before our first poll, so requiring to
+		// CATCH provisioning would be a false-negative flake. The body already
+		// proved the synchronous flip; the load-bearing observable is the
+		// eventual online+routable + a successful serve probe below.
+		waitForWorkspaceOnlineRoutable(t, tenantHost, token, orgID, wsID, 15*time.Minute, "restart→online")
+		// Post-restart liveness/serve probe — proves the container is actually
+		// back, not just that the status row says online.
+		assertServes(t, tenantHost, token, orgID, wsID, "post-restart")
+		t.Logf("restart VERIFIED: online → provisioning → online + serveable")
+	})
+
+	// ── pause → resume ──────────────────────────────────────────────────────
+	t.Run("pause_resume", func(t *testing.T) {
+		// pause → paused, container genuinely stopped.
+		status, body := postLifecycle(t, tenantHost, token, orgID, wsID, "/pause")
+		if status != http.StatusOK {
+			t.Fatalf("pause: HTTP %d: %s", status, body)
+		}
+		if st := jsonField(body, "status"); st != "paused" {
+			t.Fatalf("pause: body status=%q (expected paused): %s", st, body)
+		}
+		waitForWorkspaceStatus(t, tenantHost, token, orgID, wsID, "paused", 3*time.Minute, "pause→paused")
+		// Genuinely-stopped assertion: the canvas GET clears url on pause
+		// (Pause SETs url=''), and a stopped container no longer serves A2A.
+		// A handler that only flipped a flag without stopping the container
+		// would still be reachable here — so this is the real-stop signal.
+		assertURLCleared(t, tenantHost, token, orgID, wsID, 3*time.Minute, "pause")
+		assertNotServing(t, tenantHost, token, orgID, wsID, "pause")
+		t.Logf("pause VERIFIED: paused + url cleared + container unserveable (genuinely stopped)")
+
+		// resume → provisioning → online + serveable again.
+		status, body = postLifecycle(t, tenantHost, token, orgID, wsID, "/resume")
+		if status != http.StatusOK {
+			t.Fatalf("resume: HTTP %d: %s", status, body)
+		}
+		if st := jsonField(body, "status"); st != "provisioning" {
+			t.Fatalf("resume: body status=%q (expected provisioning): %s", st, body)
+		}
+		waitForWorkspaceOnlineRoutable(t, tenantHost, token, orgID, wsID, 15*time.Minute, "resume→online")
+		assertServes(t, tenantHost, token, orgID, wsID, "post-resume")
+		t.Logf("resume VERIFIED: paused → provisioning → online + serveable")
+	})
+
+	// ── hibernate → wake ────────────────────────────────────────────────────
+	t.Run("hibernate_wake", func(t *testing.T) {
+		// hibernate (force, since a fresh online ws may carry no active tasks
+		// but we don't want a transient active_tasks>0 to 409 the test).
+		status, body := postLifecycle(t, tenantHost, token, orgID, wsID, "/hibernate?force=true")
+		if status != http.StatusOK {
+			t.Fatalf("hibernate: HTTP %d: %s", status, body)
+		}
+		if st := jsonField(body, "status"); st != "hibernated" {
+			t.Fatalf("hibernate: body status=%q (expected hibernated): %s", st, body)
+		}
+		// Confirm it settled at 'hibernated' (not stuck mid-'hibernating') and
+		// the container is genuinely stopped (url cleared + unserveable).
+		waitForWorkspaceStatus(t, tenantHost, token, orgID, wsID, "hibernated", 3*time.Minute, "hibernate→hibernated")
+		assertURLCleared(t, tenantHost, token, orgID, wsID, 3*time.Minute, "hibernate")
+		assertNotServing(t, tenantHost, token, orgID, wsID, "hibernate")
+		t.Logf("hibernate VERIFIED: hibernated + url cleared + container unserveable")
+
+		// wake: a hibernated workspace auto-wakes on the next incoming A2A
+		// message (NOT /resume — Resume only handles status=paused). The wake
+		// A2A itself may return transient 5xx while the container re-provisions;
+		// the load-bearing contract is the STATUS transition back to online.
+		sendWakeA2A(t, tenantHost, token, orgID, wsID)
+		waitForWorkspaceOnlineRoutable(t, tenantHost, token, orgID, wsID, 15*time.Minute, "hibernate→wake→online")
+		assertServes(t, tenantHost, token, orgID, wsID, "post-wake")
+		t.Logf("wake VERIFIED: hibernated → online via auto-wake A2A + serveable")
+	})
+}
+
+// ---------------------------------------------------------------------------
+// lifecycle drivers + observable-state assertions
+// ---------------------------------------------------------------------------
+
+// postLifecycle sends a body-less POST to /workspaces/<wsID><pathAndQuery> on
+// the tenant ws-server (pathAndQuery carries the endpoint suffix plus any
+// ?query). It authenticates with the tenant admin token and returns the HTTP
+// status code together with the raw response body.
+func postLifecycle(t *testing.T, host, token, orgID, wsID, pathAndQuery string) (int, string) {
+	t.Helper()
+	endpoint := fmt.Sprintf("https://%s/workspaces/%s%s", host, wsID, pathAndQuery)
+	return doTenantJSON(t, "POST", endpoint, token, orgID, "")
+}
+
+// workspaceStatusAndURL fetches the canvas GET /workspaces/:id and extracts the
+// "status" and "url" string fields from the response body. url comes back as
+// "" when the workspace is not routable (pause/hibernate clear it). The HTTP
+// status code is returned as well so a caller can tell a 404/Gone apart from a
+// real row.
+func workspaceStatusAndURL(t *testing.T, host, token, orgID, wsID string) (httpStatus int, status, url string) {
+	t.Helper()
+	var payload string
+	httpStatus, payload = doTenantJSON(t, "GET", "https://"+host+"/workspaces/"+wsID, token, orgID, "")
+	status = jsonField(payload, "status")
+	url = jsonField(payload, "url")
+	return httpStatus, status, url
+}
+
+// waitForWorkspaceStatus re-reads the canvas GET every 10 seconds until the
+// reported status equals want, logging each status change along the way. It
+// fails the test if want is not observed before timeout elapses.
+func waitForWorkspaceStatus(t *testing.T, host, token, orgID, wsID, want string, timeout time.Duration, why string) {
+	t.Helper()
+	previous := ""
+	for stopAt := time.Now().Add(timeout); time.Now().Before(stopAt); time.Sleep(10 * time.Second) {
+		_, current, _ := workspaceStatusAndURL(t, host, token, orgID, wsID)
+		if current != previous {
+			// Only log transitions so a long wait does not flood the output.
+			t.Logf("    [%s] status → %q", why, current)
+			previous = current
+		}
+		if current == want {
+			return
+		}
+	}
+	t.Fatalf("%s: workspace %s never reached status=%q within %s (last=%q)", why, wsID, want, timeout, previous)
+}
+
+// waitForWorkspaceOnlineRoutable blocks until the canvas GET reports
+// status=online together with a non-empty url, polling every 10 seconds. The
+// url is the "agent is reachable" signal the SDK uses, so an online row that
+// still has no url does not count. Fails the test when timeout elapses first.
+func waitForWorkspaceOnlineRoutable(t *testing.T, host, token, orgID, wsID string, timeout time.Duration, why string) {
+	t.Helper()
+	seenStatus, seenRoutable := "", false
+	for stopAt := time.Now().Add(timeout); time.Now().Before(stopAt); time.Sleep(10 * time.Second) {
+		_, current, url := workspaceStatusAndURL(t, host, token, orgID, wsID)
+		routable := url != ""
+		if current != seenStatus || routable != seenRoutable {
+			// Log only when the status or the routable-ness changes.
+			t.Logf("    [%s] status=%q routable=%v", why, current, routable)
+			seenStatus, seenRoutable = current, routable
+		}
+		if routable && current == "online" {
+			return
+		}
+	}
+	t.Fatalf("%s: workspace %s never reached online+routable within %s (last status=%q, url-set=%v)",
+		why, wsID, timeout, seenStatus, seenRoutable)
+}
+
+// assertURLCleared asserts that the canvas GET reports an empty url within
+// timeout, polling every 5 seconds. Pause/Hibernate SET url='' as part of
+// stopping the container; a non-empty url means the workspace is still routable
+// (container not stopped).
+//
+// An empty url only counts when the GET itself returned HTTP 200. A failed
+// read (404, 5xx, WAF page, ...) has no "url" field either, and treating that
+// as "cleared" would let the assertion pass without having observed the row.
+//
+// Fails the test if no 200 response with an empty url is seen before timeout.
+func assertURLCleared(t *testing.T, host, token, orgID, wsID string, timeout time.Duration, why string) {
+	t.Helper()
+	deadline := time.Now().Add(timeout)
+	var lastHTTP int
+	var lastURL string
+	for time.Now().Before(deadline) {
+		hs, _, url := workspaceStatusAndURL(t, host, token, orgID, wsID)
+		lastHTTP, lastURL = hs, url
+		if hs == http.StatusOK && url == "" {
+			return
+		}
+		time.Sleep(5 * time.Second)
+	}
+	t.Fatalf("%s: workspace %s url never cleared within %s (last http=%d, url-set=%v) — container may not have actually stopped",
+		why, wsID, timeout, lastHTTP, lastURL != "")
+}
+
+// serveProbe fires a single A2A message/send at the workspace and reports
+// whether it was served. served is true only for a 2xx response, which means a
+// live container handled the request; a transport error (reported as code 0),
+// a 4xx or a 5xx all count as "did not serve".
+func serveProbe(t *testing.T, host, token, orgID, wsID string) (served bool, code int) {
+	t.Helper()
+	// A fresh messageId per probe, derived from the current time in nanoseconds.
+	messageID := fmt.Sprintf("e2e-probe-%d", time.Now().UnixNano())
+	payload := fmt.Sprintf(`{"jsonrpc":"2.0","method":"message/send","id":"e2e-probe","params":{"message":{"role":"user","messageId":%q,"parts":[{"kind":"text","text":"platform lifecycle e2e serve probe — reply with the single token: PONG"}]}}}`,
+		messageID)
+	req, err := http.NewRequest("POST", "https://"+host+"/workspaces/"+wsID+"/a2a", strings.NewReader(payload))
+	if err != nil {
+		t.Fatalf("build serve probe: %v", err)
+	}
+	for _, header := range [][2]string{
+		{"Authorization", "Bearer " + token},
+		{"X-Molecule-Org-Id", orgID},
+		{"Origin", "https://" + host},
+		{"Content-Type", "application/json"},
+	} {
+		req.Header.Set(header[0], header[1])
+	}
+	resp, err := (&http.Client{Timeout: 90 * time.Second}).Do(req)
+	if err != nil {
+		// Unreachable / timed out: not served, and there is no status code.
+		return false, 0
+	}
+	defer resp.Body.Close()
+	drain(resp)
+	code = resp.StatusCode
+	served = code >= 200 && code < 300
+	return served, code
+}
+
+// assertServes demands one successful A2A round-trip within a 5 minute
+// readiness window, probing every 15 seconds. The workspace may only just have
+// turned online, so a brief warmup and transient cold 5xx responses are
+// tolerated (the same edge class the shell harness tolerates). Fails the test
+// if no probe is served inside the window.
+func assertServes(t *testing.T, host, token, orgID, wsID, why string) {
+	t.Helper()
+	lastCode := 0
+	for stopAt := time.Now().Add(5 * time.Minute); time.Now().Before(stopAt); time.Sleep(15 * time.Second) {
+		ok, code := serveProbe(t, host, token, orgID, wsID)
+		if ok {
+			return
+		}
+		lastCode = code
+	}
+	t.Fatalf("%s: workspace %s never served an A2A round-trip within 5m (last http=%d) — online but not serveable",
+		why, wsID, lastCode)
+}
+
+// assertNotServing checks that the workspace no longer serves A2A. With no
+// AWS/SSM access from core, this is the observable proxy (through the tenant
+// API) for "the container is genuinely stopped" as opposed to merely flagged
+// paused/hibernated.
+//
+// NOTE: a hibernated workspace auto-wakes on the NEXT A2A message, so probing
+// can itself trigger a wake. The check is therefore a SINGLE probe, taken after
+// the caller has already seen status/url settle to stopped; it is deliberately
+// not retried. A live, serving container answers 2xx immediately, and that is
+// the regression this catches.
+//
+// TODO(core#2332): the strongest "container stopped" signal is the EC2/Docker
+// state itself (instance stopped), which is only observable from the CP side
+// (AWS/SSM) — not reachable from the core ws-server module without importing the
+// CP client surface. This asserts the strongest signal available here (url
+// cleared + immediate non-serve). If/when a CP-side admin endpoint surfaces the
+// instance power-state to the tenant API, tighten this to assert it directly.
+func assertNotServing(t *testing.T, host, token, orgID, wsID string, why string) {
+	t.Helper()
+	// Exactly one probe: a retry loop could auto-wake a hibernated workspace.
+	served, code := serveProbe(t, host, token, orgID, wsID)
+	if !served {
+		t.Logf("    [%s] workspace unserveable after stop (probe http=%d) — container genuinely stopped", why, code)
+		return
+	}
+	t.Fatalf("%s: workspace %s STILL serves A2A (http=%d) after status settled to stopped — container was not actually stopped (handler flipped the flag only)",
+		why, wsID, code)
+}
+
+// sendWakeA2A nudges a hibernated workspace awake by sending it A2A messages.
+// It is best-effort: up to 12 probes, 15 seconds apart, stopping at the first
+// one that is served. It never fails the test itself — the caller asserts the
+// real contract (status → online) afterwards.
+func sendWakeA2A(t *testing.T, host, token, orgID, wsID string) {
+	t.Helper()
+	const maxAttempts = 12
+	for n := 1; n <= maxAttempts; n++ {
+		ok, code := serveProbe(t, host, token, orgID, wsID)
+		if !ok {
+			// 5xx / 0 (conn refused while the container is down) are expected
+			// during a cold wake. The message has still reached the ProxyA2A
+			// handler, which triggers the re-provision; only the synchronous
+			// 2xx is missing. Keep nudging until the status assertion takes over.
+			t.Logf("    wake A2A attempt %d/12: http=%d (cold restart) — retrying", n, code)
+			time.Sleep(15 * time.Second)
+			continue
+		}
+		t.Logf("    wake A2A served (http=%d) on attempt %d", code, n)
+		return
+	}
+	t.Logf("    wake A2A did not return 2xx within retries — relying on status→online assertion to confirm wake")
+}
+
+// drain consumes and throws away the response body so the connection can be
+// reused / closed cleanly. It stops at the first read error (including EOF) or
+// once more than 1 MiB has been read.
+func drain(resp *http.Response) {
+	const limit = 1 << 20
+	scratch := make([]byte, 4096)
+	for consumed := 0; consumed <= limit; {
+		n, err := resp.Body.Read(scratch)
+		consumed += n
+		if err != nil {
+			return
+		}
+	}
+}
+
+// ---------------------------------------------------------------------------
+// harness (self-contained — this package is excluded from the default build).
+// Mirrors the idioms of cp's internal/staginge2e (cp#386): STAGING_E2E=1 gate,
+// CP_ADMIN_API_TOKEN admin surface, provision→wait-online→assert, t.Cleanup
+// teardown. Core has no CP client packages, so these are HTTP-only.
+// ---------------------------------------------------------------------------
+
+// stagingCfg is the environment-derived configuration for the live staging
+// suite; requireStagingEnv builds it.
+type stagingCfg struct {
+	// cpBase is CP_BASE_URL with surrounding whitespace and trailing slashes
+	// removed; CP admin paths ("/cp/admin/...") are appended to it.
+	cpBase          string
+	// adminToken is CP_ADMIN_API_TOKEN, sent as the bearer token on CP admin calls.
+	adminToken      string
+	// subdomainSuffix is STAGING_TENANT_SUBDOMAIN_SUFFIX (default
+	// "staging.moleculesai.app"); the tenant host is "<slug>.<subdomainSuffix>".
+	subdomainSuffix string
+}
+
+// requireStagingEnv gates the suite and returns the resolved configuration.
+//
+// STAGING_E2E != 1 SKIPs (the suite's contract — advisory-by-infra, not
+// fail-open within a run). With STAGING_E2E=1 but CP_BASE_URL or
+// CP_ADMIN_API_TOKEN absent it also skips, LOUD: the skip message names every
+// missing variable so a misconfigured CI run is visible in the output rather
+// than silently passing zero assertions.
+//
+// The missing variables are reported in a fixed order (ranging over a map
+// would make the message differ from run to run).
+func requireStagingEnv(t *testing.T) stagingCfg {
+	t.Helper()
+	if os.Getenv("STAGING_E2E") != "1" {
+		t.Skip("STAGING_E2E != 1 — skipping live staging e2e (set STAGING_E2E=1 + CP_BASE_URL + CP_ADMIN_API_TOKEN to run)")
+	}
+	get := func(k string) string { return strings.TrimSpace(os.Getenv(k)) }
+	cfg := stagingCfg{
+		// Trailing slashes are trimmed so "/cp/admin/..." can be appended as-is.
+		cpBase:          strings.TrimRight(get("CP_BASE_URL"), "/"),
+		adminToken:      get("CP_ADMIN_API_TOKEN"),
+		subdomainSuffix: envOr("STAGING_TENANT_SUBDOMAIN_SUFFIX", "staging.moleculesai.app"),
+	}
+	required := []struct{ name, value string }{
+		{"CP_BASE_URL", cfg.cpBase},
+		{"CP_ADMIN_API_TOKEN", cfg.adminToken},
+	}
+	var missing []string
+	for _, r := range required {
+		if r.value == "" {
+			missing = append(missing, r.name)
+		}
+	}
+	if len(missing) > 0 {
+		t.Skipf("STAGING_E2E=1 but missing required env: %s — skipping LOUD (not a silent pass)", strings.Join(missing, ", "))
+	}
+	return cfg
+}
+
+// envOr returns the whitespace-trimmed value of environment variable k, or def
+// when the variable is unset or blank.
+func envOr(k, def string) string {
+	v := strings.TrimSpace(os.Getenv(k))
+	if v == "" {
+		return def
+	}
+	return v
+}
+
+// adminCreateOrg provisions a throwaway org via the CP admin API and waits (up
+// to 7 minutes, polling the org list every 15 seconds) for its instance to
+// reach instance_status=running — provisioning is async. It returns the org id
+// from the create response.
+//
+// Any failure fails the test via t.Fatalf. If the failure happens AFTER the
+// create call succeeded (no id in the response, provisioning timeout, or a
+// transport error while polling), the org is deleted before the test stops.
+// The caller has no id/teardown yet at that point, so without this a real
+// staging tenant would be left behind.
+func adminCreateOrg(t *testing.T, cfg stagingCfg, slug string) (orgID string) {
+	t.Helper()
+	body := fmt.Sprintf(`{"slug":%q,"name":%q,"owner_user_id":%q}`, slug, "E2E Workspace Lifecycle", "e2e-runner:"+slug)
+	status, resp := doJSON(t, "POST", cfg.cpBase+"/cp/admin/orgs", cfg.adminToken, body)
+	if status != http.StatusCreated && status != http.StatusOK {
+		t.Fatalf("AdminCreate org: HTTP %d: %s", status, resp)
+	}
+
+	// From here on the org exists. t.Fatalf ends the test goroutine via
+	// runtime.Goexit, which still runs deferred calls, so this covers every
+	// failure path below.
+	ready := false
+	defer func() {
+		if !ready {
+			t.Logf("org %s was created but provisioning did not complete — deleting it so the tenant is not leaked", slug)
+			adminDeleteTenant(t, cfg, slug)
+		}
+	}()
+
+	id := jsonField(resp, "id")
+	if id == "" {
+		t.Fatalf("AdminCreate org: no id in response: %s", resp)
+	}
+	deadline := time.Now().Add(7 * time.Minute)
+	for time.Now().Before(deadline) {
+		st, list := doJSON(t, "GET", cfg.cpBase+"/cp/admin/orgs", cfg.adminToken, "")
+		if st == http.StatusOK && strings.Contains(list, `"slug":"`+slug+`"`) &&
+			orgInstanceStatus(list, slug) == "running" {
+			ready = true
+			return id
+		}
+		time.Sleep(15 * time.Second)
+	}
+	t.Fatalf("org %s did not reach instance_status=running within timeout", slug)
+	return ""
+}
+
+// adminDeleteTenant tears the tenant down with an admin DELETE on
+// /cp/admin/tenants/<slug>, sending the slug back as the "confirm" value.
+// HTTP 200, 202 and 404 are treated as a completed teardown; any other status
+// is only logged as a warning (manual cleanup may be needed) and does not fail
+// the test.
+func adminDeleteTenant(t *testing.T, cfg stagingCfg, slug string) {
+	t.Helper()
+	confirmation := fmt.Sprintf(`{"confirm":%q}`, slug)
+	code, reply := doJSON(t, "DELETE", cfg.cpBase+"/cp/admin/tenants/"+slug, cfg.adminToken, confirmation)
+	switch code {
+	case http.StatusOK, http.StatusAccepted, http.StatusNotFound:
+		t.Logf("teardown: deleted tenant %s (HTTP %d)", slug, code)
+	default:
+		t.Logf("WARNING: teardown DELETE tenant %s returned HTTP %d: %s (manual cleanup may be needed)", slug, code, reply)
+	}
+}
+
+// tenantAdminToken retrieves the per-tenant admin token from the CP admin
+// surface. The token only becomes available once the tenant platform has
+// finished provisioning, so the endpoint is polled every 5 seconds for up to
+// 7 minutes; the test fails if no non-empty token shows up in that time.
+func tenantAdminToken(t *testing.T, cfg stagingCfg, slug string) string {
+	t.Helper()
+	endpoint := fmt.Sprintf("%s/cp/admin/orgs/%s/admin-token", cfg.cpBase, slug)
+	for stopAt := time.Now().Add(7 * time.Minute); time.Now().Before(stopAt); time.Sleep(5 * time.Second) {
+		code, payload := doJSON(t, "GET", endpoint, cfg.adminToken, "")
+		if code != http.StatusOK {
+			continue
+		}
+		if tok := jsonField(payload, "admin_token"); tok != "" {
+			return tok
+		}
+	}
+	t.Fatalf("tenant admin token not available for %s within timeout", slug)
+	return ""
+}
+
+// tenantCreateWorkspace asks the tenant ws-server to create a workspace
+// (POST /workspaces), which exercises the full tenant → CP provisioner → EC2
+// path, and returns the new workspace id. A status other than 201/200, or a
+// response without an id, fails the test. cfg is accepted but not used.
+func tenantCreateWorkspace(t *testing.T, cfg stagingCfg, host, token, orgID string) string {
+	t.Helper()
+	spec := fmt.Sprintf(
+		`{"name":%q,"runtime":%q,"tier":%d,"model":%q,"billing_mode":%q,"provider":%q}`,
+		"core2332-life-e2e", "claude-code", 1, "moonshot/kimi-k2.6", "platform_managed", "platform",
+	)
+	code, payload := doTenantJSON(t, "POST", "https://"+host+"/workspaces", token, orgID, spec)
+	switch code {
+	case http.StatusCreated, http.StatusOK:
+		// accepted — fall through to id extraction
+	default:
+		t.Fatalf("tenant workspace create: HTTP %d: %s", code, payload)
+	}
+	wsID := jsonField(payload, "id")
+	if wsID == "" {
+		t.Fatalf("tenant workspace create: no id in response: %s", payload)
+	}
+	return wsID
+}
+
+// --- reachability ----------------------------------------------------------
+
+// waitForHTTP polls https://<host>/health every 10 seconds (15 second timeout
+// per request) until it answers with HTTP status want, and fails the test if
+// that does not happen within timeout. Transport errors (DNS/TLS not ready
+// yet, connection refused) are expected while the tenant comes up and are
+// retried; the last one is included in the failure message so a timeout can be
+// told apart from a wrong status code.
+//
+// A request that cannot even be built (malformed host) fails the test
+// immediately instead of handing a nil request to the client.
+func waitForHTTP(t *testing.T, host string, want int, timeout time.Duration, why string) {
+	t.Helper()
+	url := "https://" + host + "/health"
+	client := &http.Client{Timeout: 15 * time.Second}
+	deadline := time.Now().Add(timeout)
+	var last int
+	var lastErr error
+	for time.Now().Before(deadline) {
+		req, err := http.NewRequest("GET", url, nil)
+		if err != nil {
+			t.Fatalf("%s: build GET %s: %v", why, url, err)
+		}
+		resp, err := client.Do(req)
+		lastErr = err
+		if err == nil {
+			last = resp.StatusCode
+			resp.Body.Close()
+			if last == want {
+				return
+			}
+		}
+		time.Sleep(10 * time.Second)
+	}
+	t.Fatalf("%s: %s never returned HTTP %d within %s (last=%d, last err=%v)", why, url, want, timeout, last, lastErr)
+}
+
+// --- HTTP helpers ----------------------------------------------------------
+
+// doJSON performs one request against the CP admin surface: bearer admin
+// token, JSON content type, no tenant headers, 150 second timeout. It returns
+// the HTTP status code and the response body; a request that cannot be built
+// or sent fails the test.
+func doJSON(t *testing.T, method, url, token, body string) (int, string) {
+	t.Helper()
+	req, err := http.NewRequest(method, url, strings.NewReader(body))
+	if err != nil {
+		t.Fatalf("build %s %s: %v", method, url, err)
+	}
+	req.Header.Set("Content-Type", "application/json")
+	req.Header.Set("Authorization", "Bearer "+token)
+
+	httpClient := http.Client{Timeout: 150 * time.Second}
+	resp, err := httpClient.Do(req)
+	if err != nil {
+		t.Fatalf("%s %s: %v", method, url, err)
+	}
+	defer resp.Body.Close()
+
+	payload := readBody(resp)
+	return resp.StatusCode, payload
+}
+
+// doTenantJSON performs one request against the tenant ws-server (90 second
+// timeout) and returns the HTTP status code and response body. Besides the
+// JSON content type it sets the three headers the SaaS auth chain requires:
+//
+//   - Authorization: the tenant admin token
+//   - X-Molecule-Org-Id: the tenant guard 404s anything without it
+//   - Origin: Cloudflare WAF rejects a mismatched/absent Origin with 404
+//
+// Origin is derived from the request URL: "https://" plus everything between
+// the scheme and the first "/". A request that cannot be built or sent fails
+// the test.
+func doTenantJSON(t *testing.T, method, url, token, orgID, body string) (int, string) {
+	t.Helper()
+	req, err := http.NewRequest(method, url, strings.NewReader(body))
+	if err != nil {
+		t.Fatalf("build %s %s: %v", method, url, err)
+	}
+
+	authority := strings.TrimPrefix(url, "https://")
+	if slash := strings.IndexByte(authority, '/'); slash >= 0 {
+		authority = authority[:slash]
+	}
+	for _, header := range [][2]string{
+		{"Authorization", "Bearer " + token},
+		{"X-Molecule-Org-Id", orgID},
+		{"Origin", "https://" + authority},
+		{"Content-Type", "application/json"},
+	} {
+		req.Header.Set(header[0], header[1])
+	}
+
+	httpClient := http.Client{Timeout: 90 * time.Second}
+	resp, err := httpClient.Do(req)
+	if err != nil {
+		t.Fatalf("%s %s: %v", method, url, err)
+	}
+	defer resp.Body.Close()
+
+	payload := readBody(resp)
+	return resp.StatusCode, payload
+}
+
+// readBody returns the response body as a string. It reads in 4 KiB chunks
+// and stops at the first read error (including EOF) or once more than 1 MiB
+// has been collected, so the result can exceed 1 MiB by at most one chunk.
+func readBody(resp *http.Response) string {
+	var collected strings.Builder
+	chunk := make([]byte, 4096)
+	for {
+		n, err := resp.Body.Read(chunk)
+		collected.Write(chunk[:n])
+		if err != nil || collected.Len() > 1<<20 {
+			break
+		}
+	}
+	return collected.String()
+}
+
+// jsonField does a flat, dependency-free extraction of a string field value
+// ("key":"value") — sufficient for the id/status/url fields we read. It returns
+// the value of the FIRST occurrence of key that is followed by a string value,
+// at any nesting depth, or "" when there is none.
+//
+// It deliberately does not parse JSON: callers also pass fragments that are
+// not valid documents on their own (see orgInstanceStatus). Two things it does
+// tolerate:
+//
+//   - whitespace around the colon ("key": "value"), as emitted by
+//     pretty-printing encoders;
+//   - escaped characters inside the value: an escaped quote (\") does not end
+//     the value. The value is returned RAW, i.e. escape sequences are not
+//     decoded.
+func jsonField(body, key string) string {
+	const jsonSpace = " \t\r\n"
+	quotedKey := `"` + key + `"`
+	from := 0
+	for {
+		i := strings.Index(body[from:], quotedKey)
+		if i < 0 {
+			return ""
+		}
+		rest := strings.TrimLeft(body[from+i+len(quotedKey):], jsonSpace)
+		// If this occurrence turns out not to be `"key": "<string>"` (e.g. the
+		// text is a value, or the member holds null/number/object), resume the
+		// search just past its opening quote.
+		from += i + 1
+		if !strings.HasPrefix(rest, ":") {
+			continue
+		}
+		rest = strings.TrimLeft(rest[1:], jsonSpace)
+		if !strings.HasPrefix(rest, `"`) {
+			continue
+		}
+		rest = rest[1:]
+		for j := 0; j < len(rest); j++ {
+			switch rest[j] {
+			case '\\':
+				j++ // skip the escaped byte so \" does not terminate the value
+			case '"':
+				return rest[:j]
+			}
+		}
+		// Opening quote without a closing one: truncated input.
+		return ""
+	}
+}
+
+// orgInstanceStatus finds the instance_status for a given slug in a
+// /cp/admin/orgs list response by scanning the JSON object that contains the
+// slug. It returns "" when the slug is not present or the object carries no
+// string instance_status.
+//
+// The object is located by brace matching outward from the `"slug":"<slug>"`
+// marker, so the status of a NEIGHBOURING org in the list is never picked up
+// and objects of any size are handled. Braces inside string values are not
+// recognised as such (this is a scan, not a JSON parser). If no enclosing
+// braces are found — e.g. the caller passed a fragment — it falls back to a
+// window of up to 600 bytes on either side of the marker.
+func orgInstanceStatus(listBody, slug string) string {
+	marker := `"slug":"` + slug + `"`
+	i := strings.Index(listBody, marker)
+	if i < 0 {
+		return ""
+	}
+
+	// Walk left to the '{' that opens the object holding the marker, skipping
+	// over any complete nested objects that precede the slug member.
+	lo := -1
+	for p, depth := i-1, 0; p >= 0 && lo < 0; p-- {
+		switch listBody[p] {
+		case '}':
+			depth++
+		case '{':
+			if depth == 0 {
+				lo = p
+			} else {
+				depth--
+			}
+		}
+	}
+	// Walk right to the matching '}', skipping nested objects after the slug.
+	hi := -1
+	for p, depth := i+len(marker), 0; p < len(listBody) && hi < 0; p++ {
+		switch listBody[p] {
+		case '{':
+			depth++
+		case '}':
+			if depth == 0 {
+				hi = p + 1
+			} else {
+				depth--
+			}
+		}
+	}
+	if lo >= 0 && hi >= 0 {
+		return jsonField(listBody[lo:hi], "instance_status")
+	}
+
+	// Fallback for input without enclosing braces: fixed window around the marker.
+	lo = i - 600
+	if lo < 0 {
+		lo = 0
+	}
+	hi = i + 600
+	if hi > len(listBody) {
+		hi = len(listBody)
+	}
+	return jsonField(listBody[lo:hi], "instance_status")
+}