chore(dead-code): remove unused QueueDepth function

QueueDepth was added for Phase 2/3 busy-return response visibility but was never wired to a caller. The inline depth query in EnqueueA2A serves today's enqueue response, making this function dead code. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
Merge PR #2469 via Gitea merge queue
2026-06-09 04:03:55 +00:00 · 2026-06-09 03:14:14 +00:00 · 2026-06-09 03:05:21 +00:00 · 2026-06-09 02:56:25 +00:00 · 2026-06-09 02:39:55 +00:00 · 2026-06-09 02:37:45 +00:00
47 changed files with 3700 additions and 305 deletions
@@ -0,0 +1,247 @@
+#!/usr/bin/env python3
+"""
+SSOT fail-closed approval validator (SEV-1 internal#812).
+
+This module is the SINGLE source of truth for whether a Gitea review counts
+as a "genuine" approval. Both consumers must call into it — they MUST NOT
+duplicate the predicate:
+
+  - .gitea/scripts/gitea-merge-queue.py (Python) — imports directly.
+  - .gitea/scripts/review-check.sh (bash, jq) — calls the Python helper
+    at .gitea/scripts/_review_check_filter.py, which in turn calls this
+    module. There is no separate jq / bash copy of the predicate; a
+    reviewer who wants to weaken the gate has to weaken this one file.
+
+# The fail-closed contract
+
+A review counts as a GENUINE APPROVED on the current head ONLY IF ALL hold:
+
+  1. state == "APPROVED"
+  2. official == true
+  3. dismissed != true
+  4. stale != true
+  5. commit_id is present and equals the PR's current head SHA
+
+ANY failure of any of the above → REJECT.
+
+# The bug this fixes
+
+The previous gitea-merge-queue.py predicate had a `if isinstance(commit_id,
+str) and commit_id and headsha:` guard that *skipped* the commit_id check
+when the review carried no commit_id. The previous review-check.sh jq
+filter required `commit_id == $head`, which is also implicitly fail-closed
+on missing commit_id (null != head), but only one of the two consumers
+behaved correctly — a code-drift trap.
+
+Both behaviors are now defined here, as a single fail-closed predicate.
+A MISSING commit_id is the Gitea row signature of a spoofed or pre-commit
+review: a real reviewer cannot have submitted against a commit that
+doesn't exist. Accepting these is exactly the fail-open that SEV-1
+internal#812 describes and the re-opened path that closed #843 (with CR2
+and Researcher both flagging it) addresses.
+
+# Mutation-resistance
+
+The unit tests in tests/test_approval_validator.py assert rejection
+explicitly for each fail-closed case (missing commit_id, stale head,
+non-official, dismissed, etc.). A reviewer who tries to weaken the
+predicate by removing the commit_id check, by re-introducing the
+"no commit_id is accepted" escape hatch, or by changing `!=` to `==`
+in the head comparison will trip those tests in CI.
+"""
+
+from __future__ import annotations
+
+from typing import Iterable, Optional, Tuple
+
+# ---------------------------------------------------------------------------
+# Canonical Gitea review-state enum (EXACT match -- no case coercion).
+# ---------------------------------------------------------------------------
+#
+# Gitea's reviews API emits review.state as one of a fixed set of
+# UPPERCASE string constants: "APPROVED", "REQUEST_CHANGES",
+# "REQUEST_REVIEW", "COMMENT", "PENDING", "DISMISSED" (verified
+# against the live API across real molecule-core PRs). They are ALWAYS
+# uppercase on the wire.
+#
+# FAIL-CLOSED: we compare review.state to these constants with EXACT
+# equality. The previous code used str(state or "").upper(), which
+# coerced a lowercase/mixed-case "approved" or "request_changes" into
+# the canonical value and ACCEPTED it. A real Gitea row never carries a
+# lowercase state, so a case-variant value is the signature of a
+# hand-forged / spoofed row, not a legitimate review. Coercing it was a
+# residual fail-open (SEV-1 internal#812, RCs 9849/9851/9852). We reject
+# anything that is not byte-for-byte the canonical constant.
+STATE_APPROVED = "APPROVED"
+STATE_REQUEST_CHANGES = "REQUEST_CHANGES"
+
+
+# ---------------------------------------------------------------------------
+# Shared predicate — fail-closed on every condition
+# ---------------------------------------------------------------------------
+
+
+def is_official_current_head(review: object, headsha: object) -> bool:
+    """Return True iff `review` is official, not dismissed, not stale, and
+    bound to the PR's current head SHA.
+
+    EVERY condition is mandatory and fail-closed. Both is_genuine_approval
+    and is_open_request_changes build on this so the rule cannot drift
+    between the two cases. This function does NOT look at `state`; the
+    per-verdict predicates check that themselves.
+
+    Args:
+        review: a single review row. Anything that is not a dict is rejected.
+        headsha: the PR's current head SHA. Anything that is not a non-empty
+            str is rejected.
+
+    Returns:
+        True only when every check below passes; False on the first failure.
+
+    `official` is checked with `is not True` (NOT `not review.get("official")`).
+    A plain truthiness test would let any truthy non-boolean value through —
+    for example the string "false" or the integer 1 — and treat the row as
+    official, which is exactly the fail-open surface being closed here.
+    Gitea emits a real boolean, so the stricter check rejects anything that
+    isn't literally True.
+    """
+    if not isinstance(review, dict):
+        return False
+    if review.get("official") is not True:
+        return False
+    # `dismissed` and `stale` use truthiness: ANY truthy value (including a
+    # non-boolean one) rejects the row; only absent/falsy values pass.
+    if review.get("dismissed"):
+        return False
+    if review.get("stale"):
+        return False
+    commit_id = review.get("commit_id")
+    # FAIL-CLOSED: a missing/empty/non-string commit_id is REJECTED. The
+    # previous code had `if isinstance(commit_id, str) and commit_id and
+    # headsha:` which SKIPPED the check when the review carried no
+    # commit_id. That was the spoof-bug surface.
+    if not isinstance(commit_id, str) or not commit_id:
+        return False
+    # FAIL-CLOSED: a present-but-wrong commit_id is also REJECTED. Stale
+    # reviews (on a previous head) cannot count. A missing/empty/non-string
+    # headsha rejects as well, so an unknown head can never match.
+    if not isinstance(headsha, str) or not headsha or commit_id != headsha:
+        return False
+    return True
+
+
+# ---------------------------------------------------------------------------
+# Per-verdict predicates
+# ---------------------------------------------------------------------------
+
+
+def is_genuine_approval(
+    review: object,
+    *,
+    headsha: str,
+    reviewer_set: Optional[Iterable[str]] = None,
+) -> bool:
+    """Return True iff `review` is a genuine APPROVED on the current head.
+
+    When `reviewer_set` is provided, the review's `user.login` must be in
+    the set (the merge-queue uses this to count only "recognised"
+    reviewers for the 2-genuine floor; review-check.sh applies its own
+    team-membership probe separately and so does not pass a set).
+
+    Args:
+        review: a single review row; non-dict values are rejected.
+        headsha: the PR's current head SHA.
+        reviewer_set: optional allow-list of reviewer logins. None disables
+            the login check entirely.
+
+    Returns:
+        True only when the state is exactly APPROVED, the shared
+        is_official_current_head predicate passes, and (when a reviewer_set
+        is given) the login is a str contained in it. Never raises on a
+        malformed row — a malformed row is simply rejected.
+    """
+    if not isinstance(review, dict):
+        return False
+    # EXACT-ENUM (fail-closed): no .upper()/.strip() coercion. A
+    # case-variant or whitespace-padded state is a forged row and is
+    # rejected, not normalised into APPROVED.
+    if review.get("state") != STATE_APPROVED:
+        return False
+    if not is_official_current_head(review, headsha):
+        return False
+    if reviewer_set is not None:
+        # FAIL-CLOSED without crashing: a forged row may carry a non-dict
+        # `user` (a string, a number, a list). Calling .get() on it would
+        # raise AttributeError and abort the caller instead of rejecting
+        # this one row, so the shape is checked first.
+        user_obj = review.get("user")
+        login = user_obj.get("login") if isinstance(user_obj, dict) else None
+        if not isinstance(login, str) or login not in set(reviewer_set):
+            return False
+    return True
+
+
+def is_open_request_changes(review: object, *, headsha: str) -> bool:
+    """Tell whether `review` is an open, official REQUEST_CHANGES bound to
+    the PR's current head.
+
+    The contract mirrors is_genuine_approval: the row must be a dict, its
+    state must be the exact REQUEST_CHANGES constant, and it must pass the
+    shared is_official_current_head predicate. A row without a commit_id
+    is REJECTED rather than being treated as 'still blocking the merge
+    from an old head'.
+    """
+    # EXACT-ENUM (fail-closed): a lowercase/mixed-case "request_changes" is
+    # never coerced into a match; only the exact constant qualifies. Each
+    # operand is a bool, so the chained `and` yields a bool as well.
+    return (
+        isinstance(review, dict)
+        and review.get("state") == STATE_REQUEST_CHANGES
+        and is_official_current_head(review, headsha)
+    )
+
+
+# ---------------------------------------------------------------------------
+# Consumer-facing reducer (returns what the two call sites need)
+# ---------------------------------------------------------------------------
+
+
+def classify_reviews(
+    reviews: Iterable[object],
+    *,
+    headsha: str,
+    reviewer_set: Optional[Iterable[str]] = None,
+) -> Tuple[set[str], list[str]]:
+    """Reduce a PR's reviews to (approvers, request_changes) on the CURRENT head.
+
+    approvers: distinct logins whose LATEST official review on the current
+        head is APPROVED.
+    request_changes: distinct logins whose LATEST official review on the
+        current head is REQUEST_CHANGES.
+
+    Gitea returns reviews oldest-first. We keep the latest *VALID*
+    submission per user (later VALID entries overwrite earlier ones; an
+    invalid later row — a COMMENT, or a review with a null/old commit_id —
+    is ignored and can NOT overwrite or erase a genuine review). See the
+    inline VALIDATE-BEFORE-REDUCE note below for the exploit this closes.
+
+    Malformed rows (non-dict row, non-dict `user`, missing/empty/non-string
+    login) are skipped; they never raise and never count.
+    """
+    reviewer_set_set = set(reviewer_set) if reviewer_set is not None else None
+
+    # VALIDATE-BEFORE-REDUCE (SEV-1 internal#812 follow-up).
+    #
+    # The earlier implementation reduced FIRST (latest row per user, keyed
+    # only on state in {APPROVED, REQUEST_CHANGES}) and validated the single
+    # surviving row AFTER. That is reduce-before-validate, and it is
+    # exploitable: a user posts a genuine current-head APPROVED (or
+    # REQUEST_CHANGES), then posts a LATER row that fails the fail-closed
+    # predicate (a COMMENT, or an APPROVED with a null/old commit_id). The
+    # later INVALID row overwrote the genuine one in latest_by_user, so a
+    # real approval was masked, and — worse — a real current-head
+    # REQUEST_CHANGES could be erased and the block silently evaporate.
+    #
+    # The fix: filter to VALID reviews FIRST (each row must pass
+    # is_official_current_head AND carry an APPROVED/REQUEST_CHANGES state),
+    # and only then reduce to the latest VALID review per user. An invalid
+    # later row is never eligible to become a user's "latest" state, so it
+    # cannot overwrite or erase a genuine review. A user's verdict is the
+    # state of their latest VALID (official, current-head, non-dismissed,
+    # non-stale, commit_id-present-and-matching) review.
+    latest_valid_by_user: dict = {}
+    for review in reviews:
+        if not isinstance(review, dict):
+            continue
+        # A forged row may carry a non-dict `user` (string, number, list).
+        # `.get()` on such a value raises AttributeError, and because this
+        # lookup runs BEFORE validation a single malformed row would abort
+        # the whole reduction. Check the shape and skip the row instead.
+        user_obj = review.get("user")
+        user = user_obj.get("login") if isinstance(user_obj, dict) else None
+        # An empty login cannot identify a reviewer; skip it, matching the
+        # `not user` guard in _review_check_filter.py.
+        if not isinstance(user, str) or not user:
+            continue
+        if reviewer_set_set is not None and user not in reviewer_set_set:
+            continue
+        # EXACT-ENUM (fail-closed): exact constants only, no coercion. A
+        # case-coerced row must not become eligible to overwrite/erase a
+        # genuine per-user verdict in the reduce below.
+        state = review.get("state")
+        if state not in (STATE_APPROVED, STATE_REQUEST_CHANGES):
+            continue
+        # Fail-closed predicate BEFORE the reduce: official, not dismissed,
+        # not stale, commit_id present AND == head. Invalid rows are dropped
+        # here and so can never become the per-user "latest".
+        if not is_official_current_head(review, headsha):
+            continue
+        latest_valid_by_user[user] = review
+
+    approvers: set[str] = set()
+    request_changes: list[str] = []
+    for user, review in latest_valid_by_user.items():
+        # Each surviving review already passed is_official_current_head, so
+        # the state alone determines the verdict. We still go through the
+        # per-verdict SSOT predicates so the rule cannot drift.
+        if is_genuine_approval(review, headsha=headsha, reviewer_set=None):
+            approvers.add(user)
+        elif is_open_request_changes(review, headsha=headsha):
+            request_changes.append(user)
+    return approvers, request_changes
@@ -0,0 +1,74 @@
+#!/usr/bin/env python3
+"""
+Helper for review-check.sh: applies the SSOT approval predicate to a
+PR's reviews and prints the candidate approver logins on stdout (one per
+line, de-duplicated, author excluded).
+
+review-check.sh uses this in place of its previous inline jq filter so the
+predicate is single-sourced. The jq filter is gone; if you want to change
+the predicate, edit .gitea/scripts/_approval_validator.py, not this file.
+
+Usage:
+  python3 _review_check_filter.py <reviews.json> <head-sha> <author-login>
+
+Output:
+  - Candidate approver logins, one per line, de-duplicated, sorted.
+  - Excludes `author-login` (the PR author cannot approve their own PR).
+  - Empty output → review-check.sh interprets as "no candidates" and exits 1
+    after the team-membership probe.
+"""
+
+from __future__ import annotations
+
+import json
+import sys
+from pathlib import Path
+
+# Same-dir import — script lives next to _approval_validator.py
+sys.path.insert(0, str(Path(__file__).resolve().parent))
+from _approval_validator import is_genuine_approval  # noqa: E402
+
+
+def main(argv: list[str]) -> int:
+    """Print the candidate approver logins for a PR, one per line.
+
+    Args:
+        argv: [prog, <reviews.json>, <head-sha>, <author-login>].
+
+    Returns:
+        0 on success (possibly with empty output), 2 on a usage error or
+        when the reviews file cannot be read / parsed / is not a JSON list.
+    """
+    if len(argv) != 4:
+        print(
+            f"usage: {argv[0] if argv else '_review_check_filter.py'} "
+            "<reviews.json> <head-sha> <author-login>",
+            file=sys.stderr,
+        )
+        return 2
+    reviews_path = Path(argv[1])
+    headsha = argv[2]
+    author = argv[3]
+
+    try:
+        reviews = json.loads(reviews_path.read_text(encoding="utf-8"))
+    except (OSError, json.JSONDecodeError) as exc:
+        print(f"::error::could not read reviews JSON: {exc}", file=sys.stderr)
+        return 2
+    if not isinstance(reviews, list):
+        print("::error::reviews JSON was not a list", file=sys.stderr)
+        return 2
+
+    candidates: set[str] = set()
+    for review in reviews:
+        # We pass reviewer_set=None here because review-check.sh applies its
+        # own team-membership probe (CURL_AUTH_FILE + 200/204/403/404 logic)
+        # separately. The SSOT predicate enforces only the fail-closed
+        # commit_id / state / official / dismissed / stale contract here.
+        if not is_genuine_approval(review, headsha=headsha, reviewer_set=None):
+            continue
+        # With reviewer_set=None the predicate never inspects `user`, so a
+        # forged row can reach this point with a non-dict `user` (string,
+        # number, list). `.get()` on that would raise AttributeError and kill
+        # the helper with a traceback; reject the row instead.
+        user_obj = review.get("user")
+        user = user_obj.get("login") if isinstance(user_obj, dict) else None
+        if not isinstance(user, str) or not user:
+            continue
+        if user == author:
+            continue
+        candidates.add(user)
+
+    for user in sorted(candidates):
+        print(user)
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main(sys.argv))
@@ -116,28 +116,65 @@ fi
 # 3. Status-check state at the PR HEAD (where checks ran). The merge
 #    commit doesn't get its own checks; we evaluate the PR's last
 #    commit, which is what branch protection compared against.
-# Fail-closed: verify HTTP 200. A 401/403/404 means the status is
-# unreadable — we must NOT treat that as "no statuses" and skip checks.
-STATUS_TMP=$(mktemp)
-STATUS_HTTP=$(curl -sS -o "$STATUS_TMP" -w '%{http_code}' -H "$AUTH" \
-  "${API}/repos/${OWNER}/${NAME}/commits/${HEAD_SHA}/status")
-STATUS=$(cat "$STATUS_TMP")
-rm -f "$STATUS_TMP"
-if [ "$STATUS_HTTP" != "200" ]; then
-  echo "::error::GET /commits/${HEAD_SHA}/status returned HTTP ${STATUS_HTTP} — cannot evaluate required checks."
-  exit 1
-fi
-# FAIL-CLOSED: a 200 status response missing the 'statuses' array, or with
-# 'statuses' set to a non-array type (null/string/object), must NOT be treated
-# as "no checks" — that would silently declare all checks green.
-if ! echo "$STATUS" | jq -e '(.statuses | type) == "array"' >/dev/null; then
-  echo "::error::GET /commits/${HEAD_SHA}/status returned HTTP 200 but 'statuses' is missing or not an array — cannot evaluate required checks."
-  exit 1
-fi
+#
+# Pagination (status-pagination RCA, #2440-family): the combined
+# /commits/{sha}/status endpoint caps its embedded `statuses` array at the
+# Gitea default page size (~30). On a high-churn PR an older-but-still-current
+# required-context SUCCESS row is pushed PAST that cap, so reading the combined
+# view would record that context as `missing` and emit a FALSE-POSITIVE
+# force-merge. We instead page through the dedicated /commits/{sha}/statuses
+# list to EXHAUSTION (until an EMPTY page), accumulating every row.
+#
+# Termination is on an EMPTY page, never on a merely SHORT one: Gitea clamps
+# the requested `limit` to the server's [api] MAX_RESPONSE_ITEMS (default 50),
+# so a limit=100 request can come back with 50 rows on a page that is NOT the
+# last. A "fewer than PER_PAGE rows ⇒ last page" test would stop after page 1
+# and silently truncate the list — the exact truncation this loop exists to
+# remove. The cost is one extra (empty) request per run.
+#
+# Fail-closed is preserved end to end: any non-200 page, or a page whose body
+# is not a JSON array, aborts with exit 1 (we never treat an unreadable/partial
+# page as "no checks"). A genuinely-absent required context appears on NO page,
+# so CHECK_STATE has no entry for it → `${...:-missing}` below keeps it
+# `missing` → it is still counted as not-green. No fail-open path is added.
+PER_PAGE=100
+# Runaway guard: if the server ignores `page` and keeps returning rows we must
+# not loop forever. Hitting the cap is treated as unreadable → exit 1.
+MAX_PAGES=1000
+page=1
+ALL_STATUSES_TMP=$(mktemp)
+printf '[]' > "$ALL_STATUSES_TMP"   # accumulator: a single JSON array of rows
+while :; do
+  if [ "$page" -gt "$MAX_PAGES" ]; then
+    rm -f "$ALL_STATUSES_TMP"
+    echo "::error::GET /commits/${HEAD_SHA}/statuses did not terminate after ${MAX_PAGES} pages — cannot evaluate required checks."
+    exit 1
+  fi
+  STATUS_TMP=$(mktemp)
+  STATUS_HTTP=$(curl -sS -o "$STATUS_TMP" -w '%{http_code}' -H "$AUTH" \
+    "${API}/repos/${OWNER}/${NAME}/commits/${HEAD_SHA}/statuses?page=${page}&limit=${PER_PAGE}")
+  PAGE_BODY=$(cat "$STATUS_TMP")
+  rm -f "$STATUS_TMP"
+  if [ "$STATUS_HTTP" != "200" ]; then
+    rm -f "$ALL_STATUSES_TMP"
+    echo "::error::GET /commits/${HEAD_SHA}/statuses?page=${page} returned HTTP ${STATUS_HTTP} — cannot evaluate required checks."
+    exit 1
+  fi
+  # FAIL-CLOSED: the /statuses endpoint returns a bare JSON array. A non-array
+  # body (null/object/string) means the response is malformed — we must NOT
+  # treat that as "no checks", which would silently declare all checks green.
+  if ! echo "$PAGE_BODY" | jq -e 'type == "array"' >/dev/null 2>&1; then
+    rm -f "$ALL_STATUSES_TMP"
+    echo "::error::GET /commits/${HEAD_SHA}/statuses?page=${page} returned HTTP 200 but body is not a JSON array — cannot evaluate required checks."
+    exit 1
+  fi
+  PAGE_COUNT=$(echo "$PAGE_BODY" | jq 'length')
+  # Empty page ⇒ we are past the last row ⇒ stop. (Do NOT compare against
+  # PER_PAGE: the server may cap the page size below what we asked for.)
+  if [ "$PAGE_COUNT" -eq 0 ]; then
+    break
+  fi
+  # Append this page's rows to the accumulator (insertion order is preserved
+  # but NOT relied upon — the collapse below selects max-by-id per context).
+  COMBINED=$(jq -s '.[0] + .[1]' "$ALL_STATUSES_TMP" <(echo "$PAGE_BODY"))
+  printf '%s' "$COMBINED" > "$ALL_STATUSES_TMP"
+  page=$((page + 1))
+done
+STATUS=$(cat "$ALL_STATUSES_TMP")
+rm -f "$ALL_STATUSES_TMP"
 declare -A CHECK_STATE
+# Gitea's /commits/{sha}/statuses is roughly newest-first but NOT strictly
+# monotonic by id (observed first ids 157,155,156,… — local inversions from
+# re-runs and page boundaries), so neither first- nor last-occurrence reliably
+# yields the current row. Select the MAX-id row per context explicitly
+# (order-independent), matching prod-auto-deploy.py's latest_status_for_context.
 while IFS=$'\t' read -r ctx state; do
  [ -n "$ctx" ] && CHECK_STATE[$ctx]="$state"
-done < <(echo "$STATUS" | jq -r '.statuses | .[] | "\(.context)\t\(.status)"')
+done < <(echo "$STATUS" | jq -r 'group_by(.context) | map(max_by(.id)) | .[] | "\(.context)\t\(.status)"')

 # 4. For each required check, was it green at merge? YAML block scalars
 #    (`|`) leave a trailing newline; skip blank/whitespace-only lines.
@@ -30,6 +30,11 @@ PROFILES: dict[str, dict[str, str]] = {
            # workflow (they reuse its migrated Postgres), so changes to the
            # scheduler package must trigger the job too.
            r"|^workspace-server/internal/scheduler/"
+            # #2150: the db package's real-PG migration-replay-from-scratch
+            # + InitPostgres ping tests also run in this same workflow (they
+            # reuse its sibling Postgres, against a separate `molecule_replay`
+            # database). Changes to db must trigger the job too.
+            r"|^workspace-server/internal/db/"
            r"|^workspace-server/migrations/"
            r"|^\.gitea/workflows/handlers-postgres-integration\.yml$"
        ),
@@ -105,6 +105,12 @@ import urllib.parse
 import urllib.request
 from typing import Any

+# SSOT fail-closed approval predicate (SEV-1 internal#812). review-check.sh
+# consumes the same module via _review_check_filter.py — do NOT duplicate
+# the predicate here. See _approval_validator.py for the fail-closed contract.
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+from _approval_validator import classify_reviews as _classify_reviews_ssot  # noqa: E402
+

 def _env(key: str, *, default: str = "") -> str:
    return os.environ.get(key, default)
@@ -148,10 +154,23 @@ OPT_OUT_LABELS = {
 # branch-protection configuration. These are the uniform-gate checks that
 # must pass before any PR can merge (SOP tier removal makes them mandatory
 # for all PRs, not just tier:medium/tier:high).
+#
+# Context names use the (pull_request_target) suffix (not pull_request)
+# to match the workflow event_type that actually emits them — verified
+# live against PR#2419/#2331/etc.: the qa-review/security-review
+# workflows run on pull_request_target (their `on:` block uses
+# pull_request_target, not pull_request), and sop-checklist's
+# all-items-acked job also uses pull_request_target. The previous
+# (pull_request) suffix never matched the live emitted contexts,
+# which is what was painting ~16 ready PRs red (gate appeared
+# "missing" qa-review/security-review even after both passed).
+# Verified against the lint-bp-context-emit-match test which already
+# asserts (pull_request_target) for these names. No requirement
+# dropped; just a name correction.
 GOVERNANCE_REQUIRED_CONTEXTS = [
-    "qa-review / approved (pull_request)",
-    "security-review / approved (pull_request)",
-    "sop-checklist / all-items-acked (pull_request)",
+    "qa-review / approved (pull_request_target)",
+    "security-review / approved (pull_request_target)",
+    "sop-checklist / all-items-acked (pull_request_target)",
 ]
 REQUIRED_CONTEXTS_RAW = _env(
    "REQUIRED_CONTEXTS",
@@ -411,57 +430,26 @@ def get_branch_protection(branch: str) -> BranchProtection:
 def genuine_approvals(
    reviews: list[dict],
    *,
-    head_sha: str,
+    headsha: str,
    reviewer_set: set[str],
 ) -> tuple[set[str], list[str]]:
-    """Reduce a PR's reviews to genuine official approvals on the CURRENT head.
+    """Thin wrapper over the SSOT predicate in _approval_validator.py.

-    Returns (approvers, request_changes) where:
-      - approvers is the set of distinct logins (in reviewer_set) whose LATEST
-        review on the current head is an official, non-stale, non-dismissed
-        APPROVED, and
-      - request_changes is the list of logins (in reviewer_set) whose latest
-        official review on the current head is REQUEST_CHANGES.
+    All logic — the per-review commit_id / state / official / dismissed /
+    stale contract — lives in _approval_validator.classify_reviews. This
+    wrapper exists only to keep the call site (and external readers of
+    the symbol) stable. Do NOT add any per-review logic here; if you need
+    to change the predicate, edit _approval_validator.py.

-    "Current head" is enforced two ways, because Gitea exposes both signals:
-    a review must be `official` and NOT `stale`/`dismissed`, AND when the
-    review carries a commit_id it must equal head_sha. A review with no
-    commit_id but stale=False/dismissed=False is accepted (older Gitea rows).
-    We take each reviewer's LATEST submission (reviews arrive oldest-first), so
-    a later REQUEST_CHANGES correctly supersedes an earlier APPROVED and vice
-    versa.
+    See _approval_validator.py for the full fail-closed contract
+    (SEV-1 internal#812). The previous inline implementation had a
+    `if isinstance(commit_id, str) and commit_id and headsha:` guard that
+    silently accepted reviews with no commit_id; that fail-open surface is
+    now closed at the SSOT.
    """
-    latest_by_user: dict[str, dict] = {}
-    for review in reviews:
-        if not isinstance(review, dict):
-            continue
-        user = (review.get("user") or {}).get("login")
-        if not isinstance(user, str) or user not in reviewer_set:
-            continue
-        state = str(review.get("state") or "").upper()
-        if state not in {"APPROVED", "REQUEST_CHANGES"}:
-            continue  # ignore COMMENT/PENDING/DISMISSED-state rows
-        # reviews are returned oldest-first; later entries overwrite → latest wins
-        latest_by_user[user] = review
-
-    approvers: set[str] = set()
-    request_changes: list[str] = []
-    for user, review in latest_by_user.items():
-        if not review.get("official"):
-            continue
-        if review.get("stale") or review.get("dismissed"):
-            continue
-        commit_id = review.get("commit_id")
-        if isinstance(commit_id, str) and commit_id and head_sha:
-            if commit_id != head_sha:
-                continue  # review was on a previous head
-        state = str(review.get("state") or "").upper()
-        if state == "APPROVED":
-            approvers.add(user)
-        elif state == "REQUEST_CHANGES":
-            request_changes.append(user)
-    return approvers, request_changes
-
+    return _classify_reviews_ssot(
+        reviews, headsha=headsha, reviewer_set=reviewer_set
+    )

 def get_pull_reviews(pr_number: int) -> list[dict]:
    _, body = api("GET", f"/repos/{OWNER}/{NAME}/pulls/{pr_number}/reviews")
@@ -1134,7 +1122,7 @@ def _evaluate_candidate(

    reviews = get_pull_reviews(pr_number)
    approvers, request_changes = genuine_approvals(
-        reviews, head_sha=head_sha, reviewer_set=REVIEWER_SET
+        reviews, headsha=head_sha, reviewer_set=REVIEWER_SET
    )

    decision = evaluate_merge_readiness(
@@ -95,17 +95,27 @@ def build_plan(env: dict[str, str]) -> dict:


 def latest_status_for_context(statuses: list[dict], context: str) -> dict | None:
-    """Return the first matching status.
+    """Return the NEWEST status row for ``context`` (highest ``id``).

-    Gitea's combined-status response is newest-first in practice. The merge
-    queue relies on the same contract; keeping the selector explicit makes
-    stale duplicate contexts easy to test.
+    This must work for ANY input ordering. The combined ``/status`` view is
+    newest-first, while the exhaustively-paginated ``/statuses`` list (see
+    ``fetch_all_statuses``) concatenates several pages and is not guaranteed
+    to be strictly ordered by ``id``, so neither the first nor the last
+    occurrence is reliably the current row. Selecting by max ``id`` collapses
+    duplicate context rows to the current one regardless of input order, so a
+    stale earlier run can never shadow the latest result. Rows without an
+    integer ``id`` are treated as oldest (id -1) so a well-formed newer row
+    always wins.
    """
-
+    newest: dict | None = None
+    newest_id = -1
    for status in statuses:
-        if status.get("context") == context:
-            return status
-    return None
+        if status.get("context") != context:
+            continue
+        raw_id = status.get("id")
+        sid = raw_id if isinstance(raw_id, int) else -1
+        if newest is None or sid >= newest_id:
+            newest = status
+            newest_id = sid
+    return newest


 def ci_context_state(statuses: list[dict], context: str) -> str:
@@ -351,6 +361,55 @@ def _api_json(url: str, token: str) -> dict:
        raise RuntimeError(f"GET {url} -> HTTP {exc.code}: {body}") from exc


+def _api_json_list(url: str, token: str) -> list:
+    """Fetch ``url`` from Gitea with token auth and return the decoded JSON array.
+
+    Counterpart of ``_api_json`` for list endpoints. Fail-closed: an HTTP
+    error is re-raised as ``RuntimeError`` (carrying at most the first 500
+    characters of the error body), and a body that decodes to anything other
+    than a list also raises ``RuntimeError`` — so a caller can never mistake
+    an unreadable page for "no more statuses" and silently truncate the
+    required-context scan.
+    """
+    request = urllib.request.Request(url, headers={"Authorization": f"token {token}"})
+    try:
+        with urllib.request.urlopen(request, timeout=20) as response:
+            payload = json.loads(response.read())
+    except urllib.error.HTTPError as exc:
+        snippet = exc.read().decode("utf-8", errors="replace")[:500]
+        raise RuntimeError(f"GET {url} -> HTTP {exc.code}: {snippet}") from exc
+    if isinstance(payload, list):
+        return payload
+    raise RuntimeError(f"GET {url} -> expected JSON array, got {type(payload).__name__}")
+
+
+def fetch_all_statuses(host: str, repo: str, sha: str, token: str, page_size: int = 100) -> list[dict]:
+    """Return EVERY commit-status row for ``sha``, paginating to exhaustion.
+
+    The combined ``/commits/{sha}/status`` endpoint caps its embedded
+    ``statuses`` array at the Gitea default page size (~30). On a high-churn
+    commit, an older-but-still-current required-context SUCCESS row is pushed
+    PAST that cap, so a reader of the combined view sees the required context
+    as ``missing`` and either blocks (force-merge audit) or waits forever
+    (this deploy gate). We instead walk ``/commits/{sha}/statuses`` page by
+    page until an EMPTY page, accumulating ALL rows.
+
+    Termination is on an empty page, never on a merely short one: Gitea
+    clamps the requested ``limit`` to the server's ``[api]
+    MAX_RESPONSE_ITEMS`` (default 50), so a ``limit=100`` request can return
+    50 rows on a page that is NOT the last. Treating "fewer than
+    ``page_size`` rows" as the end would stop after page 1 and silently
+    truncate the list — the exact failure this function exists to prevent.
+
+    Fail-closed: any page that errors or is not a list raises (see
+    ``_api_json_list``) — we never degrade to a partial list and call a deploy
+    green. A genuinely-absent required context simply never appears on ANY
+    page, so the caller's ``ci_context_state`` still reports ``missing`` and
+    the gate stays closed.
+
+    Raises:
+        RuntimeError: from ``_api_json_list`` on an HTTP error or non-list
+            body, or here if pagination does not terminate within the
+            runaway-guard page cap.
+    """
+    base = f"https://{host}/api/v1/repos/{repo}/commits/{sha}/statuses"
+    # Runaway guard: a server that ignores ``page`` would otherwise keep this
+    # loop alive forever. Exceeding the cap is treated as unreadable.
+    max_pages = 1000
+    results: list[dict] = []
+    page = 1
+    while True:
+        if page > max_pages:
+            raise RuntimeError(f"GET {base} -> pagination did not terminate after {max_pages} pages")
+        page_url = f"{base}?page={page}&limit={page_size}"
+        rows = _api_json_list(page_url, token)
+        if not rows:
+            # Empty page ⇒ past the last row. Do NOT compare len(rows) with
+            # page_size: the server may cap the page below what was requested.
+            break
+        results.extend(r for r in rows if isinstance(r, dict))
+        page += 1
+    return results
+
+
 def _api_json_optional(url: str, token: str) -> tuple[int, dict | None]:
    req = urllib.request.Request(url, headers={"Authorization": f"token {token}"})
    try:
@@ -472,12 +531,19 @@ def wait_for_ci_context(env: dict[str, str]) -> str:
    if not token:
        raise ValueError("GITEA_TOKEN is required to wait for CI status")

-    url = f"https://{host}/api/v1/repos/{repo}/commits/{sha}/status"
    deadline = time.time() + timeout
    last_states: dict[str, str] = {}
    while time.time() <= deadline:
-        body = _api_json(url, token)
-        statuses = body.get("statuses") or []
+        # Read the FULL, exhaustively-paginated /statuses list — NOT the
+        # combined /status view, whose embedded `statuses` array is capped at
+        # the Gitea page size (~30). On a high-churn commit a required-context
+        # SUCCESS row lands past that cap and the combined view would report
+        # it `missing`, so this gate would wait until timeout and refuse a
+        # legitimate prod deploy. Fetching every page closes that hole.
+        # Fail-closed is preserved: a genuinely-absent required context is on
+        # NO page, so ci_context_state() still returns "missing" → never
+        # satisfied → the deploy stays blocked.
+        statuses = fetch_all_statuses(host, repo, sha, token)
        states = {context: ci_context_state(statuses, context) for context in contexts}
        for context, state in states.items():
            if state != last_states.get(context):
@@ -197,17 +197,13 @@ if [ "$HTTP_CODE" != "200" ]; then
  exit 1
 fi

-# Filter: state=APPROVED, official=true, not-dismissed, non-author,
-# commit_id matches current PR head. All conditions are mandatory.
-JQ_FILTER='.[]
-  | select(.state == "APPROVED")
-  | select(.official == true)
-  | select(.dismissed != true)
-  | select(.user.login != $author)
-  | select(.commit_id == $head)
-  | .user.login'
-
-REVIEW_CANDIDATES=$(jq -r --arg author "$PR_AUTHOR" --arg head "$PR_HEAD_SHA" "$JQ_FILTER" "$REVIEWS_JSON" | sort -u)
+# Filter via the SSOT fail-closed predicate in _approval_validator.py
+# (same module gitea-merge-queue.py imports). The jq filter is gone
+# entirely — any change to the predicate must be made in
+# _approval_validator.py. See SEV-1 internal#812 for the fail-closed
+# contract this closes.
+SCRIPT_DIR_HERE="$(cd "$(dirname "$0")" && pwd)"
+REVIEW_CANDIDATES=$(python3 "$SCRIPT_DIR_HERE/_review_check_filter.py" "$REVIEWS_JSON" "$PR_HEAD_SHA" "$PR_AUTHOR")
 debug "candidate non-author approvers: $(echo "$REVIEW_CANDIDATES" | tr '\n' ' ')"

 if [ -z "$REVIEW_CANDIDATES" ]; then
@@ -134,6 +134,14 @@ class Handler(http.server.BaseHTTPRequestHandler):
                return self._json(200, [
                    {"state": "APPROVED", "dismissed": False, "user": {"login": "core-devops"}, "commit_id": "deadbeef0000111122223333444455556666"},
                ])
+            if sc == "T23_missing_commit_id":
+                # APPROVED review with NO commit_id field — the SEV-1
+                # internal#812 / closed-#843 spoof-bug signature. The
+                # fail-closed SSOT must REJECT (not silently accept as
+                # "older Gitea row" the way the old pre-fix code did).
+                return self._json(200, [
+                    {"state": "APPROVED", "official": True, "dismissed": False, "user": {"login": "core-devops"}},
+                ])
            # Default: one non-author APPROVED (current head, official)
            return self._json(200, [
                {"state": "APPROVED", "dismissed": False, "official": True, "user": {"login": "core-devops"}, "commit_id": "deadbeef0000111122223333444455556666"},
@@ -0,0 +1,610 @@
+#!/usr/bin/env python3
+"""
+Mutation-verified unit tests for the SSOT fail-closed approval predicate
+in _approval_validator.py (SEV-1 internal#812).
+
+Each test asserts REJECTION explicitly. A reviewer who weakens the
+predicate — e.g., by removing the commit_id check, by reintroducing the
+"no commit_id is accepted" escape hatch, by changing `!=` to `==` in the
+head comparison, or by allowing official == false — will trip these
+tests in CI.
+
+Run:
+  cd .gitea/scripts
+  python3 -m unittest tests.test_approval_validator -v
+  # or
+  python3 tests/test_approval_validator.py
+"""
+
+from __future__ import annotations
+
+import os
+import sys
+import unittest
+
+# Parent-dir import — test lives in tests/, one level below _approval_validator.py
+sys.path.insert(
+    0,
+    os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
+)
+from _approval_validator import (  # noqa: E402
+    classify_reviews,
+    is_genuine_approval,
+    is_official_current_head,
+    is_open_request_changes,
+)
+
+HEAD = "0123456789abcdef0123456789abcdef01234567"
+OTHER_HEAD = "fedcba9876543210fedcba9876543210fedcba98"
+
+
+def _review(
+    *,
+    state: str = "APPROVED",
+    official: bool = True,
+    dismissed: bool = False,
+    stale: bool = False,
+    commit_id: object = HEAD,
+    user: str = "reviewer-1",
+    body: str = "",
+) -> dict:
+    """Build a minimal review row shaped like the Gitea reviews API."""
+    return {
+        "id": 1,
+        "user": {"login": user, "id": 1},
+        "body": body,
+        "state": state,
+        "official": official,
+        "dismissed": dismissed,
+        "stale": stale,
+        "commit_id": commit_id,
+    }
+
+
+# ---------------------------------------------------------------------------
+# Hard contract: every fail-closed branch must reject
+# ---------------------------------------------------------------------------
+
+
+class IsOfficialCurrentHeadFailClosed(unittest.TestCase):
+    """is_official_current_head is the common predicate. EVERY condition
+    is mandatory. The tests below assert REJECTION for every possible
+    failure of any condition."""
+
+    def test_accepts_canonical_review(self):
+        self.assertTrue(is_official_current_head(_review(), HEAD))
+
+    def test_rejects_non_dict(self):
+        for bad in [None, "string", 42, [], (), object()]:
+            with self.subTest(bad=bad):
+                self.assertFalse(is_official_current_head(bad, HEAD))
+
+    def test_rejects_when_official_is_false(self):
+        for v in [False, None, 0, "false"]:
+            with self.subTest(v=v):
+                self.assertFalse(
+                    is_official_current_head(_review(official=v), HEAD)
+                )
+
+    def test_rejects_when_dismissed(self):
+        for v in [True, "true", 1]:
+            with self.subTest(v=v):
+                self.assertFalse(
+                    is_official_current_head(_review(dismissed=v), HEAD)
+                )
+
+    def test_rejects_when_stale(self):
+        for v in [True, "true", 1]:
+            with self.subTest(v=v):
+                self.assertFalse(
+                    is_official_current_head(_review(stale=v), HEAD)
+                )
+
+    def test_rejects_when_commit_id_missing(self):
+        """FAIL-CLOSED #1: missing commit_id is REJECTED.
+        This is the spoof signature that closed #843 (with CR2 + Researcher
+        both flagging it)."""
+        for bad in [None, "", 0, False, [], {}, ()]:
+            with self.subTest(commit_id=bad):
+                self.assertFalse(
+                    is_official_current_head(_review(commit_id=bad), HEAD),
+                    f"commit_id={bad!r} must reject (fail-closed)",
+                )
+
+    def test_rejects_when_commit_id_wrong_type(self):
+        for bad in [123, 1.5, True, ["abc"], {"sha": HEAD}, ("tuple",)]:
+            with self.subTest(commit_id=bad):
+                self.assertFalse(
+                    is_official_current_head(_review(commit_id=bad), HEAD)
+                )
+
+    def test_rejects_when_commit_id_stale(self):
+        """FAIL-CLOSED #2: present-but-wrong commit_id is REJECTED. Stale
+        reviews on a previous head cannot count."""
+        self.assertFalse(
+            is_official_current_head(_review(commit_id=OTHER_HEAD), HEAD)
+        )
+
+    def test_rejects_when_head_missing(self):
+        for bad in [None, "", 0, False]:
+            with self.subTest(head=bad):
+                self.assertFalse(
+                    is_official_current_head(_review(), bad)
+                )
+
+    def test_rejects_when_head_wrong_type(self):
+        self.assertFalse(is_official_current_head(_review(), 123))
+        self.assertFalse(is_official_current_head(_review(), ["x"]))
+
+
+# ---------------------------------------------------------------------------
+# is_genuine_approval
+# ---------------------------------------------------------------------------
+
+
+class IsGenuineApprovalContract(unittest.TestCase):
+    def test_accepts_canonical_approval(self):
+        self.assertTrue(
+            is_genuine_approval(_review(state="APPROVED"), headsha=HEAD)
+        )
+
+    def test_rejects_non_approved_states(self):
+        for state in ("REQUEST_CHANGES", "COMMENT", "PENDING", "DISMISSED", "approve", "", "bogus"):
+            with self.subTest(state=state):
+                self.assertFalse(
+                    is_genuine_approval(_review(state=state), headsha=HEAD)
+                )
+
+    def test_rejects_case_coerced_approved_states(self):
+        """EXACT-ENUM fail-closed (RCs 9849/9851/9852): Gitea always emits
+        the canonical UPPERCASE "APPROVED". A lowercase/mixed-case/padded
+        value is the signature of a forged row and MUST be rejected, not
+        coerced via .upper() into an accepted APPROVED. Each of these was
+        ACCEPTED before the exact-enum fix."""
+        for state in (
+            "approved", "Approved", "ApProVeD", "APPROVED ", " APPROVED",
+            "approved\n", "\tAPPROVED",
+        ):
+            with self.subTest(state=state):
+                self.assertFalse(
+                    is_genuine_approval(_review(state=state), headsha=HEAD),
+                    f"case-coerced/padded state {state!r} must NOT count as "
+                    "a genuine approval",
+                )
+
+    def test_rejects_non_official_approval(self):
+        """Comment-based / non-official 'APPROVED' is REJECTED.
+        PM: 'reject comment-based / non-official reviews'."""
+        self.assertFalse(
+            is_genuine_approval(
+                _review(state="APPROVED", official=False), headsha=HEAD
+            )
+        )
+
+    def test_rejects_dismissed_approval(self):
+        self.assertFalse(
+            is_genuine_approval(
+                _review(state="APPROVED", dismissed=True), headsha=HEAD
+            )
+        )
+
+    def test_rejects_stale_head_approval(self):
+        """commit_id != head is REJECTED. Stale-on-old-head approvals cannot
+        count, even if they were official and not dismissed."""
+        self.assertFalse(
+            is_genuine_approval(
+                _review(state="APPROVED", commit_id=OTHER_HEAD), headsha=HEAD
+            )
+        )
+
+    def test_rejects_missing_commit_id_approval(self):
+        """FAIL-CLOSED #3: the SEV-1 case. A APPROVED review with NO
+        commit_id is the spoof-bug signature. Reject."""
+        for bad in [None, "", 0, False]:
+            with self.subTest(commit_id=bad):
+                self.assertFalse(
+                    is_genuine_approval(
+                        _review(state="APPROVED", commit_id=bad), headsha=HEAD
+                    ),
+                    f"missing commit_id={bad!r} must reject",
+                )
+
+    def test_reviewer_set_filters_users(self):
+        self.assertTrue(
+            is_genuine_approval(
+                _review(user="alice"),
+                headsha=HEAD,
+                reviewer_set={"alice", "bob"},
+            )
+        )
+        self.assertFalse(
+            is_genuine_approval(
+                _review(user="carol"),
+                headsha=HEAD,
+                reviewer_set={"alice", "bob"},
+            )
+        )
+
+    def test_reviewer_set_none_skips_check(self):
+        # None means "no team filter at this layer" (e.g., review-check.sh
+        # applies its own team-membership probe separately).
+        self.assertTrue(
+            is_genuine_approval(
+                _review(user="anyone"),
+                headsha=HEAD,
+                reviewer_set=None,
+            )
+        )
+
+
+# ---------------------------------------------------------------------------
+# is_open_request_changes
+# ---------------------------------------------------------------------------
+
+
+class IsOpenRequestChangesContract(unittest.TestCase):
+    def test_accepts_canonical_request_changes(self):
+        self.assertTrue(
+            is_open_request_changes(
+                _review(state="REQUEST_CHANGES"), headsha=HEAD
+            )
+        )
+
+    def test_rejects_non_request_changes_states(self):
+        for state in ("APPROVED", "COMMENT", "PENDING", "DISMISSED"):
+            with self.subTest(state=state):
+                self.assertFalse(
+                    is_open_request_changes(
+                        _review(state=state), headsha=HEAD
+                    )
+                )
+
+    def test_rejects_case_coerced_request_changes_states(self):
+        """EXACT-ENUM fail-closed: a lowercase/mixed-case "request_changes"
+        must NOT be coerced into an open-block match. Before the exact-enum
+        fix, .upper() accepted these as REQUEST_CHANGES."""
+        for state in (
+            "request_changes", "Request_Changes", "REQUEST_CHANGES ",
+            " REQUEST_CHANGES", "request_changes\n",
+        ):
+            with self.subTest(state=state):
+                self.assertFalse(
+                    is_open_request_changes(
+                        _review(state=state), headsha=HEAD
+                    ),
+                    f"case-coerced/padded state {state!r} must NOT count as "
+                    "an open REQUEST_CHANGES",
+                )
+
+    def test_rejects_when_dismissed(self):
+        self.assertFalse(
+            is_open_request_changes(
+                _review(state="REQUEST_CHANGES", dismissed=True), headsha=HEAD
+            )
+        )
+
+    def test_rejects_when_stale_head(self):
+        self.assertFalse(
+            is_open_request_changes(
+                _review(state="REQUEST_CHANGES", commit_id=OTHER_HEAD),
+                headsha=HEAD,
+            )
+        )
+
+    def test_rejects_when_missing_commit_id(self):
+        for bad in [None, "", 0]:
+            with self.subTest(commit_id=bad):
+                self.assertFalse(
+                    is_open_request_changes(
+                        _review(state="REQUEST_CHANGES", commit_id=bad),
+                        headsha=HEAD,
+                    )
+                )
+
+
+# ---------------------------------------------------------------------------
+# classify_reviews — the merge-queue consumer
+# ---------------------------------------------------------------------------
+
+
+class ClassifyReviewsContract(unittest.TestCase):
+    def test_basic_approvers_and_request_changes(self):
+        reviews = [
+            _review(user="alice", state="APPROVED", commit_id=HEAD),
+            _review(user="bob", state="REQUEST_CHANGES", commit_id=HEAD),
+        ]
+        approvers, request_changes = classify_reviews(reviews, headsha=HEAD)
+        self.assertEqual(approvers, {"alice"})
+        self.assertEqual(request_changes, ["bob"])
+
+    def test_reviewer_set_filters_early(self):
+        reviews = [
+            _review(user="alice", state="APPROVED", commit_id=HEAD),
+            _review(user="carol", state="APPROVED", commit_id=HEAD),
+        ]
+        approvers, _ = classify_reviews(
+            reviews, headsha=HEAD, reviewer_set={"alice"}
+        )
+        self.assertEqual(approvers, {"alice"})
+
+    def test_latest_review_per_user_wins(self):
+        # alice's REQUEST_CHANGES (latest) supersedes her earlier APPROVED.
+        reviews = [
+            _review(user="alice", state="APPROVED", commit_id=HEAD),
+            _review(user="alice", state="REQUEST_CHANGES", commit_id=HEAD),
+        ]
+        approvers, request_changes = classify_reviews(reviews, headsha=HEAD)
+        self.assertNotIn("alice", approvers)
+        self.assertIn("alice", request_changes)
+
+    def test_stale_head_approval_excluded(self):
+        reviews = [
+            _review(user="alice", state="APPROVED", commit_id=OTHER_HEAD),
+        ]
+        approvers, _ = classify_reviews(reviews, headsha=HEAD)
+        self.assertEqual(approvers, set())
+
+    def test_missing_commit_id_approval_excluded(self):
+        """The SEV-1 fail-open surface. APPROVED + no commit_id → must NOT
+        count toward approvers, even with stale=False/dismissed=False."""
+        reviews = [
+            _review(user="alice", state="APPROVED", commit_id=None),
+            _review(user="bob", state="APPROVED", commit_id=""),
+        ]
+        approvers, _ = classify_reviews(reviews, headsha=HEAD)
+        self.assertEqual(approvers, set())
+
+    def test_dismissed_approval_excluded(self):
+        reviews = [
+            _review(user="alice", state="APPROVED", dismissed=True, commit_id=HEAD),
+        ]
+        approvers, _ = classify_reviews(reviews, headsha=HEAD)
+        self.assertEqual(approvers, set())
+
+    def test_non_official_approval_excluded(self):
+        reviews = [
+            _review(user="alice", state="APPROVED", official=False, commit_id=HEAD),
+        ]
+        approvers, _ = classify_reviews(reviews, headsha=HEAD)
+        self.assertEqual(approvers, set())
+
+    def test_comment_state_excluded(self):
+        reviews = [
+            _review(user="alice", state="COMMENT", commit_id=HEAD),
+        ]
+        approvers, _ = classify_reviews(reviews, headsha=HEAD)
+        self.assertEqual(approvers, set())
+
+    def test_case_coerced_approved_not_counted(self):
+        """EXACT-ENUM via the reducer: a lowercase 'approved' (otherwise
+        valid official current-head row) must NOT be counted as an approver.
+        Before the fix, classify_reviews coerced it via .upper()."""
+        for state in ("approved", "Approved", "APPROVED "):
+            with self.subTest(state=state):
+                reviews = [
+                    _review(user="alice", state=state, commit_id=HEAD),
+                ]
+                approvers, request_changes = classify_reviews(
+                    reviews, headsha=HEAD
+                )
+                self.assertEqual(approvers, set())
+                self.assertEqual(request_changes, [])
+
+    def test_case_coerced_request_changes_not_silently_dropped(self):
+        """EXACT-ENUM via the reducer: a lowercase 'request_changes' must be
+        rejected (not coerced into a block). Crucially, it must NOT silently
+        erase a SAME-USER genuine current-head REQUEST_CHANGES posted
+        earlier — the case-variant later row is invalid and is ignored, so
+        the genuine block stands."""
+        reviews = [
+            _review(user="bob", state="REQUEST_CHANGES", commit_id=HEAD),
+            _review(user="bob", state="request_changes", commit_id=HEAD),
+        ]
+        approvers, request_changes = classify_reviews(reviews, headsha=HEAD)
+        self.assertIn("bob", request_changes)
+        self.assertNotIn("bob", approvers)
+
+    def test_stale_head_request_changes_excluded(self):
+        # A REQUEST_CHANGES on a previous head must NOT block the current head.
+        reviews = [
+            _review(user="bob", state="REQUEST_CHANGES", commit_id=OTHER_HEAD),
+        ]
+        _, request_changes = classify_reviews(reviews, headsha=HEAD)
+        self.assertEqual(request_changes, [])
+
+    # -----------------------------------------------------------------
+    # VALIDATE-BEFORE-REDUCE regression tests (SEV-1 internal#812 follow-up).
+    #
+    # The bug: classify_reviews reduced to the LATEST row per user FIRST and
+    # validated AFTER. A later INVALID row (a COMMENT, or APPROVED/
+    # REQUEST_CHANGES with a null/old commit_id) from the same user could
+    # overwrite a genuine current-head review — masking an approval or
+    # ERASING a REQUEST_CHANGES block. The fix validates before the reduce,
+    # so an invalid later row is never eligible to be a user's "latest".
+    # -----------------------------------------------------------------
+
+    def test_genuine_approval_not_masked_by_later_comment(self):
+        """A genuine current-head APPROVED followed by a LATER COMMENT from
+        the SAME user must STILL count as an approval. A later non-
+        APPROVED/RC row (COMMENT) must not erase the approval. This is the
+        reduce-before-validate masking bug."""
+        reviews = [
+            _review(user="alice", state="APPROVED", commit_id=HEAD),
+            _review(user="alice", state="COMMENT", commit_id=HEAD),
+        ]
+        approvers, request_changes = classify_reviews(reviews, headsha=HEAD)
+        self.assertIn("alice", approvers)
+        self.assertEqual(request_changes, [])
+
+    def test_genuine_approval_not_masked_by_later_null_commit_id(self):
+        """A genuine current-head APPROVED followed by a LATER APPROVED with
+        a null commit_id (the spoof/invalid signature) from the SAME user
+        must STILL count. The invalid later row must be ignored, not allowed
+        to overwrite the valid earlier approval."""
+        for bad in [None, ""]:
+            with self.subTest(commit_id=bad):
+                reviews = [
+                    _review(user="alice", state="APPROVED", commit_id=HEAD),
+                    _review(user="alice", state="APPROVED", commit_id=bad),
+                ]
+                approvers, _ = classify_reviews(reviews, headsha=HEAD)
+                self.assertIn(
+                    "alice", approvers,
+                    f"later invalid commit_id={bad!r} must not mask the "
+                    "genuine current-head approval",
+                )
+
+    def test_genuine_approval_not_masked_by_later_stale_commit_id(self):
+        """A genuine current-head APPROVED followed by a LATER APPROVED on a
+        STALE (old) head from the SAME user must STILL count toward
+        approvers — the stale later row is invalid and must be ignored."""
+        reviews = [
+            _review(user="alice", state="APPROVED", commit_id=HEAD),
+            _review(user="alice", state="APPROVED", commit_id=OTHER_HEAD),
+        ]
+        approvers, _ = classify_reviews(reviews, headsha=HEAD)
+        self.assertIn("alice", approvers)
+
+    def test_request_changes_not_erased_by_later_comment(self):
+        """A genuine current-head REQUEST_CHANGES followed by a LATER COMMENT
+        from the SAME user must STILL block. The later invalid row must not
+        erase the REQUEST_CHANGES — this is the worse, silently-evaporating-
+        block variant of the bug."""
+        reviews = [
+            _review(user="bob", state="REQUEST_CHANGES", commit_id=HEAD),
+            _review(user="bob", state="COMMENT", commit_id=HEAD),
+        ]
+        approvers, request_changes = classify_reviews(reviews, headsha=HEAD)
+        self.assertIn("bob", request_changes)
+        self.assertNotIn("bob", approvers)
+
+    def test_request_changes_not_erased_by_later_null_commit_id(self):
+        """A genuine current-head REQUEST_CHANGES followed by a LATER
+        REQUEST_CHANGES with a null/old commit_id from the SAME user must
+        STILL block. The invalid later row must be ignored, not allowed to
+        relocate the user's verdict off the current head."""
+        for bad in [None, "", OTHER_HEAD]:
+            with self.subTest(commit_id=bad):
+                reviews = [
+                    _review(user="bob", state="REQUEST_CHANGES", commit_id=HEAD),
+                    _review(user="bob", state="REQUEST_CHANGES", commit_id=bad),
+                ]
+                _, request_changes = classify_reviews(reviews, headsha=HEAD)
+                self.assertIn(
+                    "bob", request_changes,
+                    f"later invalid commit_id={bad!r} must not erase the "
+                    "genuine current-head REQUEST_CHANGES block",
+                )
+
+    def test_request_changes_not_erased_by_later_approved_invalid(self):
+        """A genuine current-head REQUEST_CHANGES followed by a LATER
+        INVALID APPROVED (null commit_id) from the SAME user must STILL
+        block AND must NOT count the user as an approver. The invalid
+        approval must not flip a real block into a pass."""
+        reviews = [
+            _review(user="bob", state="REQUEST_CHANGES", commit_id=HEAD),
+            _review(user="bob", state="APPROVED", commit_id=None),
+        ]
+        approvers, request_changes = classify_reviews(reviews, headsha=HEAD)
+        self.assertIn("bob", request_changes)
+        self.assertNotIn("bob", approvers)
+
+    def test_genuine_request_changes_still_supersedes_genuine_approval(self):
+        """Sanity: a genuine LATER current-head REQUEST_CHANGES still
+        supersedes an earlier genuine APPROVED from the same user (the
+        valid-row supersession we MUST preserve — only INVALID later rows
+        are ignored). Guards against an over-correction that ignores all
+        later rows."""
+        reviews = [
+            _review(user="alice", state="APPROVED", commit_id=HEAD),
+            _review(user="alice", state="REQUEST_CHANGES", commit_id=HEAD),
+        ]
+        approvers, request_changes = classify_reviews(reviews, headsha=HEAD)
+        self.assertNotIn("alice", approvers)
+        self.assertIn("alice", request_changes)
+
+    def test_genuine_approval_still_supersedes_genuine_request_changes(self):
+        """Sanity: a genuine LATER current-head APPROVED supersedes an
+        earlier genuine REQUEST_CHANGES from the same user."""
+        reviews = [
+            _review(user="alice", state="REQUEST_CHANGES", commit_id=HEAD),
+            _review(user="alice", state="APPROVED", commit_id=HEAD),
+        ]
+        approvers, request_changes = classify_reviews(reviews, headsha=HEAD)
+        self.assertIn("alice", approvers)
+        self.assertEqual(request_changes, [])
+
+    def test_two_valid_approvers_plus_one_invalid_later_row(self):
+        """Two distinct users with valid current-head approvals + a third
+        user whose ONLY genuine approval is followed by an invalid later
+        row → all three real approvers are counted; the invalid later row
+        does not drop the third user."""
+        reviews = [
+            _review(user="alice", state="APPROVED", commit_id=HEAD),
+            _review(user="bob", state="APPROVED", commit_id=HEAD),
+            _review(user="carol", state="APPROVED", commit_id=HEAD),
+            _review(user="carol", state="COMMENT", commit_id=HEAD),
+        ]
+        approvers, request_changes = classify_reviews(reviews, headsha=HEAD)
+        self.assertEqual(approvers, {"alice", "bob", "carol"})
+        self.assertEqual(request_changes, [])
+
+
+# ---------------------------------------------------------------------------
+# Mutation-resistance smoke checks
+#
+# These tests document the mutations a reviewer would have to apply to
+# weaken the gate. They do not mutate the source (we can't do that in
+# CI); each one re-asserts today's behavior for a known softening
+# mutation and names, in its docstring, the other tests in this file
+# that the mutation would also fail. The suite is meant to be dense
+# enough that any loosening of the predicate fails multiple cases
+# rather than slipping through.
+# ---------------------------------------------------------------------------
+
+
+class MutationResistance(unittest.TestCase):
+    def test_documented_mutation_remove_commit_id_check_fails(self):
+        """If a reviewer removes the commit_id check (e.g., reverts to
+        the pre-fix `if isinstance(commit_id, str) and commit_id and
+        headsha:` guard, or replaces `commit_id != headsha` with True),
+        the missing-commit_id tests above (test_rejects_when_commit_id_missing
+        in IsOfficialCurrentHeadFailClosed, test_rejects_missing_commit_id_approval
+        in IsGenuineApprovalContract, test_missing_commit_id_approval_excluded
+        in ClassifyReviewsContract) would all fail. The reviewer would have
+        to weaken all three test categories to slip the SEV-1 surface in."""
+        # Sanity: every missing-commit_id case is a False today.
+        for bad in [None, "", 0, False]:
+            with self.subTest(commit_id=bad):
+                self.assertFalse(
+                    is_official_current_head(_review(commit_id=bad), HEAD)
+                )
+                self.assertFalse(
+                    is_genuine_approval(
+                        _review(commit_id=bad), headsha=HEAD
+                    )
+                )
+
+    def test_documented_mutation_change_neq_to_eq_fails(self):
+        """If a reviewer changes `commit_id != headsha` to `commit_id == headsha`
+        in the head comparison (inverting the check), the stale-head tests
+        (test_rejects_when_commit_id_stale, test_stale_head_approval_excluded)
+        would fail because the wrong head would now match."""
+        self.assertFalse(
+            is_official_current_head(_review(commit_id=OTHER_HEAD), HEAD)
+        )
+
+    def test_documented_mutation_drop_official_check_fails(self):
+        """If a reviewer drops the `if not review.get('official')` check, the
+        non-official tests (test_rejects_when_official_is_false,
+        test_rejects_non_official_approval, test_non_official_approval_excluded)
+        would all fail."""
+        self.assertFalse(
+            is_genuine_approval(
+                _review(state="APPROVED", official=False), headsha=HEAD
+            )
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
@@ -115,5 +115,79 @@ T16=$(validate_required_checks_json "main" '{"main":"CI / all-required"}')
 [ "$T16" = "false" ] || fail "T16: string branch entry should fail"
 pass "T16: string branch entry fails"

+# ---------------------------------------------------------------------------
+# T17+ — /statuses pagination (status-pagination RCA, #2440-family).
+# The reader now pages /commits/{sha}/statuses to exhaustion instead of reading
+# the capped combined /status view. These lock the page-accumulation,
+# newest-wins collapse, short-page stop, and fail-closed contracts.
+# ---------------------------------------------------------------------------
+
+# Page-body type validator used per page (bare array, not an object).
+validate_page_is_array() { jq -e 'type == "array"' >/dev/null 2>&1 && echo true || echo false; }
+
+# newest-wins collapse: mirror the script's max-by-id jq (order-independent).
+collapse_newest_per_context() {
+  declare -A CS
+  while IFS=$'\t' read -r ctx state; do
+    [ -n "$ctx" ] && CS[$ctx]="$state"
+  done < <(jq -r 'group_by(.context) | map(max_by(.id)) | .[] | "\(.context)\t\(.status)"')
+  state="${CS[CI / all-required (push)]:-missing}"
+  echo "$state"
+}
+
+# T17 — a bare JSON array page passes the per-page array check.
+T17=$(echo '[{"context":"c1","status":"success"}]' | validate_page_is_array)
+[ "$T17" = "true" ] || fail "T17: bare array page should pass array check"
+pass "T17: bare array page passes array check"
+
+# T18 — a non-array page (object) fails the per-page array check → fail-closed.
+T18=$(echo '{"statuses":[]}' | validate_page_is_array)
+[ "$T18" = "false" ] || fail "T18: object page should fail array check (fail-closed)"
+pass "T18: object page fails array check (fail-closed)"
+
+# T19 — required SUCCESS on PAGE 2 is FOUND after accumulation (not missing).
+#   page1: 100 noise rows (older ids); page2: the required-context success.
+PAGE1=$(jq -nc '[range(0;100) | {id:., context:("noise-\(.) (push)"), status:"pending"}]')
+PAGE2='[{"id":200,"context":"CI / all-required (push)","status":"success"}]'
+# Accumulation matching the script: two-arg `jq -s '.[0] + .[1]'` over the
+# running accumulator and the new page.
+ACCUM=$(jq -s '.[0] + .[1]' <(echo "$PAGE1") <(echo "$PAGE2"))
+LEN=$(echo "$ACCUM" | jq 'length')
+[ "$LEN" = "101" ] || fail "T19: accumulated length should be 101, got $LEN"
+RESULT=$(echo "$ACCUM" | collapse_newest_per_context)
+[ "$RESULT" = "success" ] || fail "T19: required success on page2 must be FOUND, got '$RESULT'"
+pass "T19: required success on page2 is found after pagination"
+
+# T20 — genuinely-absent required context across all pages stays 'missing'
+#       → fail-closed (counted as not-green, flags the force-merge).
+ABSENT=$(jq -nc '[range(0;100) | {id:., context:("noise-\(.) (push)"), status:"success"}]')
+RESULT2=$(echo "$ABSENT" | collapse_newest_per_context)
+[ "$RESULT2" = "missing" ] || fail "T20: absent required context must stay 'missing', got '$RESULT2'"
+pass "T20: genuinely-absent required context stays missing (fail-closed)"
+
+# T21 — non-monotonic order: the newest id (157) is neither first nor last in
+#       the list, yet its success row must win (max-by-id, order-independent).
+DUP='[{"id":155,"context":"CI / all-required (push)","status":"pending"},
+      {"id":157,"context":"CI / all-required (push)","status":"success"},
+      {"id":125,"context":"CI / all-required (push)","status":"failure"}]'
+RESULT3=$(echo "$DUP" | collapse_newest_per_context)
+[ "$RESULT3" = "success" ] || fail "T21: newest (success) must win over older (failure), got '$RESULT3'"
+pass "T21: newest row per context wins after pagination collapse"
+
+# T22 — short-page stop condition: a page with fewer than PER_PAGE rows ends
+#       the loop. Emulate the numeric comparison the script uses.
+PER_PAGE=100
+PAGE_COUNT=$(echo "$PAGE2" | jq 'length')   # 1 row
+if [ "$PAGE_COUNT" -lt "$PER_PAGE" ]; then SHORT=stop; else SHORT=continue; fi
+[ "$SHORT" = "stop" ] || fail "T22: short page should stop pagination"
+pass "T22: short page stops pagination loop"
+
+# T23 — a full page (== PER_PAGE) continues the loop.
+FULL=$(jq -nc '[range(0;100) | {id:., context:"x", status:"success"}]')
+FULL_COUNT=$(echo "$FULL" | jq 'length')
+if [ "$FULL_COUNT" -lt "$PER_PAGE" ]; then CONT=stop; else CONT=continue; fi
+[ "$CONT" = "continue" ] || fail "T23: full page should continue pagination"
+pass "T23: full page continues pagination loop"
+
 echo
 echo "ALL AUDIT-FORCE-MERGE CHECKS PASSED"
@@ -14,35 +14,35 @@ spec.loader.exec_module(mq)
 def test_latest_statuses_dedupes_by_context_newest_first():
    statuses = [
        {"context": "CI / all-required (pull_request)", "status": "failure"},
-        {"context": "sop-checklist / all-items-acked (pull_request)", "state": "success"},
+        {"context": "sop-checklist / all-items-acked (pull_request_target)", "state": "success"},
        {"context": "CI / all-required (pull_request)", "status": "success"},
    ]

    latest = mq.latest_statuses_by_context(statuses)

    assert latest["CI / all-required (pull_request)"]["status"] == "failure"
-    assert latest["sop-checklist / all-items-acked (pull_request)"]["state"] == "success"
+    assert latest["sop-checklist / all-items-acked (pull_request_target)"]["state"] == "success"


 def test_required_contexts_green_rejects_missing_and_pending():
    latest = mq.latest_statuses_by_context([
        {"context": "CI / all-required (pull_request)", "status": "success"},
-        {"context": "sop-checklist / all-items-acked (pull_request)", "status": "pending"},
+        {"context": "sop-checklist / all-items-acked (pull_request_target)", "status": "pending"},
    ])

    ok, missing_or_bad = mq.required_contexts_green(
        latest,
        [
            "CI / all-required (pull_request)",
-            "sop-checklist / all-items-acked (pull_request)",
-            "qa-review / approved (pull_request)",
+            "sop-checklist / all-items-acked (pull_request_target)",
+            "qa-review / approved (pull_request_target)",
        ],
    )

    assert ok is False
    assert missing_or_bad == [
-        "sop-checklist / all-items-acked (pull_request)=pending",
-        "qa-review / approved (pull_request)=missing",
+        "sop-checklist / all-items-acked (pull_request_target)=pending",
+        "qa-review / approved (pull_request_target)=missing",
    ]


@@ -56,7 +56,7 @@ def test_required_contexts_green_rejects_volume_skipped():
    latest = mq.latest_statuses_by_context([
        {"context": "CI / all-required (pull_request)", "status": "success"},
        {
-            "context": "sop-checklist / all-items-acked (pull_request)",
+            "context": "sop-checklist / all-items-acked (pull_request_target)",
            "status": "pending",
            "description": "[volume-skipped] comment-cap=1000 hit; please file ...",
        },
@@ -66,12 +66,12 @@ def test_required_contexts_green_rejects_volume_skipped():
        latest,
        [
            "CI / all-required (pull_request)",
-            "sop-checklist / all-items-acked (pull_request)",
+            "sop-checklist / all-items-acked (pull_request_target)",
        ],
    )

    assert ok is False
-    assert "sop-checklist / all-items-acked (pull_request)=pending" in missing_or_bad
+    assert "sop-checklist / all-items-acked (pull_request_target)=pending" in missing_or_bad


 def test_choose_next_pr_sorts_by_queue_label_timestamp_then_number():
@@ -129,16 +129,16 @@ def _ready_kwargs(**overrides):
            "state": "success",
            "statuses": [
                {"context": "CI / all-required (pull_request)", "status": "success"},
-                {"context": "qa-review / approved (pull_request)", "status": "success"},
-                {"context": "security-review / approved (pull_request)", "status": "success"},
-                {"context": "sop-checklist / all-items-acked (pull_request)", "status": "success"},
+                {"context": "qa-review / approved (pull_request_target)", "status": "success"},
+                {"context": "security-review / approved (pull_request_target)", "status": "success"},
+                {"context": "sop-checklist / all-items-acked (pull_request_target)", "status": "success"},
            ],
        },
        required_contexts=[
            "CI / all-required (pull_request)",
-            "qa-review / approved (pull_request)",
-            "security-review / approved (pull_request)",
-            "sop-checklist / all-items-acked (pull_request)",
+            "qa-review / approved (pull_request_target)",
+            "security-review / approved (pull_request_target)",
+            "sop-checklist / all-items-acked (pull_request_target)",
        ],
        required_approvals=2,
        approvers={"agent-reviewer-cr2", "agent-researcher"},
@@ -248,7 +248,7 @@ def test_genuine_approvals_counts_two_distinct_on_current_head():
        {"state": "APPROVED", "user": {"login": "agent-reviewer-cr2"},
         "official": True, "stale": False, "dismissed": False, "commit_id": "HEAD"},
    ]
-    approvers, rc = mq.genuine_approvals(reviews, head_sha="HEAD", reviewer_set=REVIEWERS)
+    approvers, rc = mq.genuine_approvals(reviews, headsha="HEAD", reviewer_set=REVIEWERS)
    assert approvers == {"agent-researcher", "agent-reviewer-cr2"}
    assert rc == []

@@ -265,7 +265,7 @@ def test_genuine_approvals_ignores_stale_dismissed_and_wrong_head():
        {"state": "APPROVED", "user": {"login": "agent-reviewer"},
         "official": True, "stale": False, "dismissed": False, "commit_id": "OLD"},
    ]
-    approvers, rc = mq.genuine_approvals(reviews, head_sha="HEAD", reviewer_set=REVIEWERS)
+    approvers, rc = mq.genuine_approvals(reviews, headsha="HEAD", reviewer_set=REVIEWERS)
    assert approvers == set()
    assert rc == []

@@ -279,7 +279,7 @@ def test_genuine_approvals_ignores_unofficial_and_outsiders():
        {"state": "APPROVED", "user": {"login": "hongming-codex-laptop"},
         "official": True, "stale": False, "dismissed": False, "commit_id": "HEAD"},
    ]
-    approvers, rc = mq.genuine_approvals(reviews, head_sha="HEAD", reviewer_set=REVIEWERS)
+    approvers, rc = mq.genuine_approvals(reviews, headsha="HEAD", reviewer_set=REVIEWERS)
    assert approvers == set()


@@ -291,7 +291,7 @@ def test_genuine_approvals_latest_review_supersedes_earlier():
        {"state": "REQUEST_CHANGES", "user": {"login": "agent-reviewer-cr2"},
         "official": True, "stale": False, "dismissed": False, "commit_id": "HEAD"},
    ]
-    approvers, rc = mq.genuine_approvals(reviews, head_sha="HEAD", reviewer_set=REVIEWERS)
+    approvers, rc = mq.genuine_approvals(reviews, headsha="HEAD", reviewer_set=REVIEWERS)
    assert approvers == set()
    assert rc == ["agent-reviewer-cr2"]

@@ -321,9 +321,9 @@ def test_governance_red_blocks_merge():
        "state": "failure",
        "statuses": [
            {"context": "CI / all-required (pull_request)", "status": "success"},
-            {"context": "qa-review / approved (pull_request)", "status": "failure"},
-            {"context": "security-review / approved (pull_request)", "status": "pending"},
-            {"context": "sop-checklist / all-items-acked (pull_request)", "status": "failure"},
+            {"context": "qa-review / approved (pull_request_target)", "status": "failure"},
+            {"context": "security-review / approved (pull_request_target)", "status": "pending"},
+            {"context": "sop-checklist / all-items-acked (pull_request_target)", "status": "failure"},
            {"context": "Staging SaaS / e2e (pull_request)", "status": "failure"},
        ],
    }
@@ -361,9 +361,9 @@ def test_non_required_advisory_red_does_not_block_merge():
        "state": "failure",  # combined polluted by advisory non-required reds
        "statuses": [
            {"context": "CI / all-required (pull_request)", "status": "success"},
-            {"context": "qa-review / approved (pull_request)", "status": "success"},
-            {"context": "security-review / approved (pull_request)", "status": "success"},
-            {"context": "sop-checklist / all-items-acked (pull_request)", "status": "success"},
+            {"context": "qa-review / approved (pull_request_target)", "status": "success"},
+            {"context": "security-review / approved (pull_request_target)", "status": "success"},
+            {"context": "sop-checklist / all-items-acked (pull_request_target)", "status": "success"},
            {"context": "Staging SaaS / e2e (pull_request)", "status": "failure"},
        ],
    }
@@ -471,9 +471,9 @@ def test_process_once_holds_pr_on_permanent_merge_error(monkeypatch):
            return {"state": "success", "statuses": [{"context": "CI / all-required (push)", "status": "success"}]}
        return {"state": "success", "statuses": [
            {"context": "CI / all-required (pull_request)", "status": "success"},
-            {"context": "qa-review / approved (pull_request)", "status": "success"},
-            {"context": "security-review / approved (pull_request)", "status": "success"},
-            {"context": "sop-checklist / all-items-acked (pull_request)", "status": "success"},
+            {"context": "qa-review / approved (pull_request_target)", "status": "success"},
+            {"context": "security-review / approved (pull_request_target)", "status": "success"},
+            {"context": "sop-checklist / all-items-acked (pull_request_target)", "status": "success"},
        ]}
    monkeypatch.setattr(mq, "get_combined_status", fake_combined)

@@ -544,9 +544,9 @@ def _fully_ready_process_once_monkeypatch(monkeypatch, mergeable, calls):
            return {"state": "success", "statuses": [{"context": "CI / all-required (push)", "status": "success"}]}
        return {"state": "success", "statuses": [
            {"context": "CI / all-required (pull_request)", "status": "success"},
-            {"context": "qa-review / approved (pull_request)", "status": "success"},
-            {"context": "security-review / approved (pull_request)", "status": "success"},
-            {"context": "sop-checklist / all-items-acked (pull_request)", "status": "success"},
+            {"context": "qa-review / approved (pull_request_target)", "status": "success"},
+            {"context": "security-review / approved (pull_request_target)", "status": "success"},
+            {"context": "sop-checklist / all-items-acked (pull_request_target)", "status": "success"},
        ]}
    monkeypatch.setattr(mq, "get_combined_status", fake_combined)

@@ -955,9 +955,9 @@ def _stale_pr_update_409_monkeypatch(monkeypatch, queued_issues, calls):
            return {"state": "success", "statuses": [{"context": "CI / all-required (push)", "status": "success"}]}
        return {"state": "success", "statuses": [
            {"context": "CI / all-required (pull_request)", "status": "success"},
-            {"context": "qa-review / approved (pull_request)", "status": "success"},
-            {"context": "security-review / approved (pull_request)", "status": "success"},
-            {"context": "sop-checklist / all-items-acked (pull_request)", "status": "success"},
+            {"context": "qa-review / approved (pull_request_target)", "status": "success"},
+            {"context": "security-review / approved (pull_request_target)", "status": "success"},
+            {"context": "sop-checklist / all-items-acked (pull_request_target)", "status": "success"},
        ]}
    monkeypatch.setattr(mq, "get_combined_status", fake_combined)

@@ -1232,9 +1232,9 @@ def _wire_ready_process_once(monkeypatch, *, issues, pr_payload, calls):
            ]}
        return {"state": "success", "statuses": [
            {"context": "CI / all-required (pull_request)", "status": "success"},
-            {"context": "qa-review / approved (pull_request)", "status": "success"},
-            {"context": "security-review / approved (pull_request)", "status": "success"},
-            {"context": "sop-checklist / all-items-acked (pull_request)", "status": "success"},
+            {"context": "qa-review / approved (pull_request_target)", "status": "success"},
+            {"context": "security-review / approved (pull_request_target)", "status": "success"},
+            {"context": "sop-checklist / all-items-acked (pull_request_target)", "status": "success"},
        ]}
    monkeypatch.setattr(mq, "get_combined_status", fake_combined)
    monkeypatch.setattr(mq, "list_candidate_issues", lambda *, auto_discover: issues)
@@ -1420,9 +1420,9 @@ def _wire_multi_candidate_process_once(monkeypatch, *, issues, pulls, reviews, c
            return {"state": "success", "statuses": [{"context": "CI / all-required (push)", "status": "success"}]}
        return {"state": "success", "statuses": [
            {"context": "CI / all-required (pull_request)", "status": "success"},
-            {"context": "qa-review / approved (pull_request)", "status": "success"},
-            {"context": "security-review / approved (pull_request)", "status": "success"},
-            {"context": "sop-checklist / all-items-acked (pull_request)", "status": "success"},
+            {"context": "qa-review / approved (pull_request_target)", "status": "success"},
+            {"context": "security-review / approved (pull_request_target)", "status": "success"},
+            {"context": "sop-checklist / all-items-acked (pull_request_target)", "status": "success"},
        ]}
    monkeypatch.setattr(mq, "get_combined_status", fake_combined)

@@ -1557,9 +1557,9 @@ def test_hol_unready_red_required_ci_is_skipped_for_ready_pr(monkeypatch):
        return {"state": state,
                "statuses": [
                    {"context": "CI / all-required (pull_request)", "status": state},
-                    {"context": "qa-review / approved (pull_request)", "status": "success"},
-                    {"context": "security-review / approved (pull_request)", "status": "success"},
-                    {"context": "sop-checklist / all-items-acked (pull_request)", "status": "success"},
+                    {"context": "qa-review / approved (pull_request_target)", "status": "success"},
+                    {"context": "security-review / approved (pull_request_target)", "status": "success"},
+                    {"context": "sop-checklist / all-items-acked (pull_request_target)", "status": "success"},
                ]}
    monkeypatch.setattr(mq, "get_combined_status", fake_combined)

@@ -105,16 +105,25 @@ def test_build_plan_disable_flag_short_circuits_before_credentials():
    assert plan["disabled_reason"] == "PROD_AUTO_DEPLOY_DISABLED=true"


-def test_latest_status_for_context_uses_first_matching_status():
+def test_latest_status_for_context_picks_newest_by_id_regardless_of_order():
+    # The exhaustively-paginated /statuses list is ascending id order
+    # (oldest-first), the opposite of the combined /status view. The selector
+    # must collapse duplicate context rows to the NEWEST (max id) so a stale
+    # earlier run never shadows the current result, whichever way they arrive.
    statuses = [
-        {"context": "CI / all-required (push)", "status": "pending"},
-        {"context": "CI / all-required (pull_request)", "status": "success"},
-        {"context": "CI / all-required (push)", "status": "success"},
+        {"id": 10, "context": "CI / all-required (push)", "status": "pending"},
+        {"id": 11, "context": "CI / all-required (pull_request)", "status": "success"},
+        {"id": 12, "context": "CI / all-required (push)", "status": "success"},
    ]

    latest = prod.latest_status_for_context(statuses, "CI / all-required (push)")

-    assert latest == {"context": "CI / all-required (push)", "status": "pending"}
+    assert latest == {"id": 12, "context": "CI / all-required (push)", "status": "success"}
+
+    # Same rows reversed (newest-first, as the combined view would deliver)
+    # must still resolve to the same newest row.
+    latest_rev = prod.latest_status_for_context(list(reversed(statuses)), "CI / all-required (push)")
+    assert latest_rev == {"id": 12, "context": "CI / all-required (push)", "status": "success"}


 def test_ci_context_state_handles_missing_and_gitea_status_key():
@@ -612,3 +621,123 @@ def test_superseded_by_none_for_latest_job_so_it_still_rolls(monkeypatch):
        )
        is None
    )
+
+
+# ---------------------------------------------------------------------------
+# /statuses pagination — required-context SUCCESS on page 2+ must be FOUND,
+# genuinely-absent context must STILL fail-closed (no fail-open).
+# Regression for the single-page-status bug (#2440-family, pagination RCA):
+# the combined /status view caps `statuses` at ~30, so on a high-churn commit
+# the still-current required-context row is pushed past page 1 and the reader
+# falsely reports it `missing`.
+# ---------------------------------------------------------------------------
+def _paged_statuses_stub(pages):
+    """Return a fake _api_json_list that serves `pages` keyed by ?page=N."""
+    def fake(url, _token):
+        # url looks like .../statuses?page=N&limit=100
+        page = 1
+        for part in url.split("?", 1)[-1].split("&"):
+            if part.startswith("page="):
+                page = int(part.split("=", 1)[1])
+        return pages.get(page, [])
+    return fake
+
+
+def test_fetch_all_statuses_finds_required_success_on_page_two(monkeypatch):
+    # Page 1 is a full 100 rows of unrelated/older churn; the required-context
+    # SUCCESS only appears on page 2. A single-page reader would miss it.
+    page1 = [
+        {"id": i, "context": f"noise-{i} (push)", "status": "pending"}
+        for i in range(100)
+    ]
+    page2 = [
+        {"id": 200, "context": "CI / all-required (push)", "status": "success"},
+        {"id": 201, "context": "Secret scan / Scan diff for credential-shaped strings (push)",
+         "status": "success"},
+    ]
+    monkeypatch.setattr(prod, "_api_json_list", _paged_statuses_stub({1: page1, 2: page2}))
+
+    rows = prod.fetch_all_statuses("git.moleculesai.app", "molecule-ai/molecule-core", "a" * 40, "tok")
+    # Must have walked to page 2 and accumulated every row.
+    assert len(rows) == 102
+    assert prod.ci_context_state(rows, "CI / all-required (push)") == "success"
+    assert (
+        prod.ci_context_state(
+            rows, "Secret scan / Scan diff for credential-shaped strings (push)"
+        )
+        == "success"
+    )
+
+
+def test_fetch_all_statuses_genuinely_absent_context_stays_missing(monkeypatch):
+    # The required context is on NO page → fail-closed: ci_context_state must
+    # report "missing", which context_is_satisfied() rejects → gate stays shut.
+    page1 = [
+        {"id": i, "context": f"noise-{i} (push)", "status": "success"}
+        for i in range(100)
+    ]
+    page2 = [{"id": 200, "context": "some-other (push)", "status": "success"}]
+    monkeypatch.setattr(prod, "_api_json_list", _paged_statuses_stub({1: page1, 2: page2}))
+
+    rows = prod.fetch_all_statuses("git.moleculesai.app", "molecule-ai/molecule-core", "b" * 40, "tok")
+    state = prod.ci_context_state(rows, "CI / all-required (push)")
+    assert state == "missing"
+    assert prod.context_is_satisfied(state) is False
+
+
+def test_fetch_all_statuses_fail_closed_on_page_error(monkeypatch):
+    # A page that raises (unreadable) must propagate, never silently truncate
+    # the scan and let the caller treat a partial list as complete.
+    def boom(url, _token):
+        if "page=2" in url:
+            raise RuntimeError("GET .../statuses?page=2 -> HTTP 502: bad gateway")
+        return [{"id": i, "context": f"n-{i}", "status": "success"} for i in range(100)]
+
+    monkeypatch.setattr(prod, "_api_json_list", boom)
+    try:
+        prod.fetch_all_statuses("h", "r", "c" * 40, "tok")
+    except RuntimeError as exc:
+        assert "502" in str(exc)
+    else:
+        raise AssertionError("expected page-2 error to propagate (fail-closed)")
+
+
+def test_wait_for_ci_context_succeeds_when_required_status_is_past_page_one(monkeypatch):
+    # End-to-end: the gate reads the EXHAUSTIVE list, so a required SUCCESS that
+    # only exists past page 1 lets the deploy proceed instead of timing out.
+    full = [
+        {"id": i, "context": f"noise-{i} (push)", "status": "success"}
+        for i in range(100)
+    ] + [
+        {"id": 500, "context": "CI / all-required (push)", "status": "success"},
+        {"id": 501, "context": "Secret scan / Scan diff for credential-shaped strings (push)",
+         "status": "success"},
+    ]
+    monkeypatch.setattr(prod, "fetch_all_statuses", lambda *a, **k: full)
+    result = prod.wait_for_ci_context(
+        {"GITHUB_SHA": "d" * 40, "GITEA_TOKEN": "tok", "CI_STATUS_TIMEOUT_SECONDS": "30"}
+    )
+    assert result == "success"
+
+
+def test_wait_for_ci_context_times_out_fail_closed_when_required_absent(monkeypatch):
+    # Genuinely-absent required context across all pages → never satisfied →
+    # the gate times out rather than green-lighting the deploy (no fail-open).
+    present_but_irrelevant = [
+        {"id": 500, "context": "some-other (push)", "status": "success"},
+    ]
+    monkeypatch.setattr(prod, "fetch_all_statuses", lambda *a, **k: present_but_irrelevant)
+    # Minimal 1s timeout + 1s poll interval → the gate gives up quickly with TimeoutError.
+    try:
+        prod.wait_for_ci_context(
+            {
+                "GITHUB_SHA": "e" * 40,
+                "GITEA_TOKEN": "tok",
+                "CI_STATUS_TIMEOUT_SECONDS": "1",
+                "CI_STATUS_POLL_INTERVAL_SECONDS": "1",
+            }
+        )
+    except TimeoutError as exc:
+        assert "missing" in str(exc)
+    else:
+        raise AssertionError("expected fail-closed TimeoutError, not a satisfied gate")
@@ -25,6 +25,11 @@
 #   T20 — ai-sop-ack APPROVED review excluded from security-review gate
 #   T21 — stale-head APPROVED review → exit 1 (commit_id mismatch)
 #   T22 — missing/non-official APPROVED review → exit 1 (official != true)
+#   T23 — missing-commit_id APPROVED review → exit 1 (SEV-1 internal#812
+#         fail-closed contract: a missing/empty commit_id is REJECTED, not
+#         silently accepted as "older Gitea row" the way the pre-fix
+#         gitea-merge-queue.py did. Closes the spoof-bug surface that
+#         #843 had.)
 #
 # Hostile-self-review (per feedback_assert_exact_not_substring):
 # this test MUST FAIL if the script is absent. Verified by running
@@ -427,6 +432,22 @@ T22_RC=$(cat "$FIX_STATE_DIR/last_rc")
 assert_eq "T22 exit code 1 (missing official rejected)" "1" "$T22_RC"
 assert_contains "T22 no candidates error" "no candidates from reviews API or issue comments" "$T22_OUT"

+# T23 — missing-commit_id APPROVED review must be rejected.
+# SEV-1 internal#812 (supersedes closed internal#843). A review with NO
+# commit_id field is the spoof-bug signature: a real reviewer cannot
+# have submitted against a commit that doesn't exist. The fail-closed
+# SSOT must REJECT — the pre-fix gitea-merge-queue.py silently accepted
+# these (the "older Gitea row" escape hatch), which is the exact surface
+# that the now-closed #843 had. The Python unit tests in
+# test_approval_validator.py cover the predicate at the unit level;
+# this T23 covers the bash + jq pipeline end-to-end.
+echo
+echo "== T23 missing commit_id APPROVED review rejected (SEV-1 fail-closed) =="
+T23_OUT=$(run_review_check "T23_missing_commit_id")
+T23_RC=$(cat "$FIX_STATE_DIR/last_rc")
+assert_eq "T23 exit code 1 (missing commit_id rejected)" "1" "$T23_RC"
+assert_contains "T23 no candidates error" "no candidates from reviews API or issue comments" "$T23_OUT"
+
 echo
 echo "------"
 echo "PASS=$PASS FAIL=$FAIL"
@@ -148,6 +148,11 @@ jobs:
        run: $(go env GOPATH)/bin/golangci-lint run --timeout 3m ./...
      - if: ${{ needs.changes.outputs.platform == 'true' }}
        name: Diagnostic — per-package verbose 60s
+        # DIAGNOSTIC ONLY (continue-on-error below): this step exists to dump
+        # verbose per-package output for triage, NOT to gate. The blocking gate
+        # is "Run tests with coverage (blocking gate)" immediately below. The
+        # `set +e` / swallowed exits here are intentional — do not "fix" them
+        # like a gate; the real gate is the next step.
        run: |
          set +e
          go test -race -v -timeout 60s ./internal/handlers/... 2>&1 | tee /tmp/test-handlers.log
@@ -244,7 +244,12 @@ jobs:
          # fail if any didn't land — that would be a real regression we
          # want loud.
          # workspace_schedules added for the #2149 scheduler integration tests.
-          for tbl in delegations workspaces activity_logs pending_uploads workspace_schedules; do
+          # workspace_auth_tokens + org_api_tokens added for the #2156
+          # registry-auth TestIntegration_ suite (#2148). Without this
+          # guard, a silently-skipped migration 020 (workspace_auth_tokens)
+          # or 035 (org_api_tokens) would let the auth tests run against
+          # missing tables and falsely green.
+          for tbl in delegations workspaces activity_logs pending_uploads workspace_schedules workspace_auth_tokens org_api_tokens; do
            if ! psql -h "${PG_HOST}" -U postgres -d molecule -tA \
                -c "SELECT 1 FROM information_schema.tables WHERE table_name = '$tbl'" \
                | grep -q 1; then
@@ -285,6 +290,33 @@ jobs:
          # / workspaces all landed by the migration replay step above).
          go test -tags=integration -timeout 5m -v ./internal/scheduler/ -run "^TestIntegration_"

+      - if: needs.detect-changes.outputs.handlers == 'true'
+        name: Migration replay-from-scratch gate (#2150)
+        env:
+          PGPASSWORD: test
+        run: |
+          # Issue #2150 (SOP internal#765): prove the FULL forward migration
+          # chain (.up + legacy .sql) replays from a blank schema via the
+          # PRODUCTION db.RunMigrations entrypoint — hard-fail on any error.
+          #
+          # This is the gap the psql apply loop above does NOT cover: that
+          # loop deliberately SKIPS failing migrations (`⊘ skipped`), so it
+          # stays green even if the chain stops replaying. The Go test below
+          # uses the real boot-time runner with hard-fail semantics, catching
+          # the #211 .down-wipe class and the 045 non-idempotent crash-loop
+          # class (it runs the chain twice).
+          #
+          # Run against a SEPARATE database so the destructive
+          # `DROP SCHEMA public CASCADE` inside the test never touches the
+          # `molecule` DB the handlers integration tests above migrated. No
+          # ordering coupling with the handlers step.
+          createdb -h "${PG_HOST}" -U postgres molecule_replay 2>/dev/null || \
+            psql -h "${PG_HOST}" -U postgres -d molecule \
+              -c "CREATE DATABASE molecule_replay" >/dev/null 2>&1 || true
+          INTEGRATION_DB_URL="postgres://postgres:test@${PG_HOST}:5432/molecule_replay?sslmode=disable" \
+            go test -tags=integration -timeout 5m -v ./internal/db/ \
+              -run '^TestIntegration_Migration|^TestIntegration_InitPostgres'
+
      - if: failure() && needs.detect-changes.outputs.handlers == 'true'
        name: Diagnostic dump on failure
        env:
@@ -74,6 +74,10 @@ jobs:
    env:
      PG_CONTAINER: pg-lpe2e-${{ github.run_id }}-${{ github.run_attempt }}
      REDIS_CONTAINER: redis-lpe2e-${{ github.run_id }}-${{ github.run_attempt }}
+      # Hard-code dev mode at the job level so the platform server ALWAYS sees it,
+      # even if the runner's $GITHUB_ENV propagation is flaky (#2468 RCA).
+      MOLECULE_ENV: development
+      SECRETS_ENCRYPTION_KEY: lpe2e-test-encryption-key-32bytes!!
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      - uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff # v5
@@ -124,12 +128,16 @@ jobs:

      - name: Configure platform env (admin token + local Docker provisioner)
        run: |
+          # Allocate an unused ephemeral port to avoid collision with concurrent
+          # jobs or stale processes from prior cancelled runs (see #2450).
+          PORT=$(python3 -c "import socket; s=socket.socket(); s.bind(('', 0)); print(s.getsockname()[1]); s.close()")
+          echo "PORT=${PORT}" >> "$GITHUB_ENV"
+          echo "BASE=http://localhost:${PORT}" >> "$GITHUB_ENV"
          # Deterministic admin token: the script sends MOLECULE_ADMIN_TOKEN as the
          # bearer; the platform checks ADMIN_TOKEN. Set both to the same value.
          T="lpe2e-admin-${{ github.run_id }}-${{ github.run_attempt }}"
          echo "ADMIN_TOKEN=${T}" >> "$GITHUB_ENV"
          echo "MOLECULE_ADMIN_TOKEN=${T}" >> "$GITHUB_ENV"
-          echo "BASE=http://localhost:8080" >> "$GITHUB_ENV"
          # MOLECULE_ENV=development: dev posture. MOLECULE_ORG_ID is left UNSET so
          # main.go wires the LOCAL Docker provisioner (not the CP provisioner), and
          # MOLECULE_IMAGE_REGISTRY is left UNSET so image resolution uses
@@ -143,21 +151,10 @@ jobs:

      - name: Kill stale platform-server before start (issue #1046)
        run: |
-          # ROOT CAUSE of the stub-gate red on docker-host: both this gating job
-          # and the advisory lifecycle-real job bind the SAME fixed host port
-          # :8080 (PORT=8080 ./platform-server). On the small docker-host runner
-          # pool a prior cancelled/timeout run can leave a zombie platform-server
-          # on :8080 (a cancelled run never reaches "Stop platform"), and — until
-          # lifecycle-real was serialised behind this job via needs: — the two
-          # jobs could also co-schedule on one runner and contend for :8080. A
-          # second bind on :8080 is FATAL (the server exits), so "Wait for
-          # /health" times out at 300s and this REQUIRED gate reds. Free the port
-          # before binding — mirrors the e2e-api.yml #1046 fix for the identical
-          # fixed-port-on-shared-runner class.
-          #
-          # /proc scan — works on any Linux without pkill/lsof/ss. comm is
-          # truncated to 15 chars: "platform-serve" matches "platform-server".
-          # Verify via cmdline to avoid false positives.
+          # Dynamic port allocation (see #2450) eliminates the fixed-port race
+          # that caused this gate to red when a prior run left a zombie process.
+          # We still sweep by process name to avoid leaking platform-server
+          # processes on the shared runner.
          killed=0
          for pid in $(grep -l "platform-serve" /proc/[0-9]*/comm 2>/dev/null); do
            kpid="${pid%/comm}"; kpid="${kpid##*/}"
@@ -169,35 +166,28 @@ jobs:
            fi
          done
          if [ "$killed" -gt 0 ]; then echo "Killed $killed stale platform-server process(es)."; else echo "No platform-server-named process found."; fi
-          # Belt-and-braces: also free :8080 from ANY holder regardless of process
-          # name. A differently-named squatter (e.g. a leftover Fastify dev server
-          # from another job) survives the comm-name scan above, makes our bind
-          # FATAL, and can false-positive the /health probe below (no-flakes RCA;
-          # tracked alongside #2430). fuser/lsof are present on the ubuntu runner;
-          # if neither exists the name-scan above is the floor.
-          if command -v fuser >/dev/null 2>&1; then fuser -k 8080/tcp 2>/dev/null || true; fi
-          if command -v lsof  >/dev/null 2>&1; then lsof -ti tcp:8080 2>/dev/null | xargs -r kill -9 2>/dev/null || true; fi
-          sleep 2
-          echo ":8080 freed (comm-scan + port-scan swept any squatter)."
+          sleep 1

      - name: Start platform (background)
        working-directory: workspace-server
        run: |
-          # Bind to :8080 (the script's BASE). DATABASE_URL/REDIS_URL/ADMIN_TOKEN/
-          # MOLECULE_ENV are inherited from $GITHUB_ENV.
-          PORT=8080 ./platform-server > platform.log 2>&1 &
+          # Bind to the dynamically allocated port (see #2450).
+          # DATABASE_URL/REDIS_URL/ADMIN_TOKEN/MOLECULE_ENV are inherited from
+          # $GITHUB_ENV.
+          PORT=$PORT ./platform-server > platform.log 2>&1 &
          echo $! > platform.pid

      - name: Wait for /health (+ migrations applied)
        run: |
          DEADLINE=300; PID="$(cat workspace-server/platform.pid 2>/dev/null || true)"; start=$(date +%s)
          while :; do
-            # Verify OUR server owns :8080 BEFORE trusting /health. Our server binds
-            # :8080 or exits FATAL, so "our PID alive" <=> "we own :8080"; checking it
-            # first stops a squatter that answers /health on :8080 (our bind having
-            # failed) from false-positiving the gate (no-flakes RCA).
+            # Verify OUR server is still alive before trusting /health. Our server
+            # binds the allocated port or exits FATAL, so "our PID alive" <=>
+            # "we own the port"; checking it first stops a squatter that answers
+            # /health on the same port (our bind having failed) from false-positiving
+            # the gate (no-flakes RCA).
            if [ -n "$PID" ] && ! kill -0 "$PID" 2>/dev/null; then
-              echo "::error::platform-server exited early (failed to bind :8080 or crashed)"; cat workspace-server/platform.log || true; exit 1
+              echo "::error::platform-server exited early (failed to bind or crashed)"; cat workspace-server/platform.log || true; exit 1
            fi
            if curl -sf "$BASE/health" >/dev/null; then
              tables=$(docker exec "$PG_CONTAINER" psql -U dev -d molecule -tAc \
@@ -237,13 +227,13 @@ jobs:
  lifecycle-real:
    name: Local Provision Lifecycle E2E (real image + MiniMax LLM, advisory)
    runs-on: docker-host
-    # Serialise behind the gating stub job: both jobs bind the SAME fixed host
-    # port :8080, so co-scheduling them on one docker-host runner makes the
-    # second platform-server fail to bind (fatal) and reds whichever lost the
-    # race. `needs:` forces this advisory job to start only AFTER lifecycle-stub
-    # finishes, so they never contend for :8080. continue-on-error keeps a real-
-    # job miss non-blocking; `needs:` does NOT gate on the stub's success (a
-    # failed required gate still lets this advisory dependent run).
+    # Serialise behind the gating stub job: both jobs share the same docker-host
+    # runner and provision sibling containers. `needs:` forces this advisory job
+    # to start only AFTER lifecycle-stub finishes, avoiding resource contention.
+    # (Dynamic ports eliminated the fixed-port race; serialisation remains for
+    # docker-host capacity hygiene.) continue-on-error keeps a real-job miss
+    # non-blocking; `needs:` does NOT gate on the stub's success (a failed
+    # required gate still lets this advisory dependent run).
    needs: lifecycle-stub
    if: ${{ always() }}
    # Tracker for lint-continue-on-error-tracking (Tier 2e / internal#350): this
@@ -254,6 +244,10 @@ jobs:
    env:
      PG_CONTAINER: pg-lpe2e-real-${{ github.run_id }}-${{ github.run_attempt }}
      REDIS_CONTAINER: redis-lpe2e-real-${{ github.run_id }}-${{ github.run_attempt }}
+      # Hard-code dev mode at the job level so the platform server ALWAYS sees it,
+      # even if the runner's $GITHUB_ENV propagation is flaky (#2468 RCA).
+      MOLECULE_ENV: development
+      SECRETS_ENCRYPTION_KEY: lpe2e-test-encryption-key-32bytes!!
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      - uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff # v5
@@ -299,10 +293,14 @@ jobs:

      - name: Configure platform env
        run: |
+          # Allocate an unused ephemeral port to avoid collision with concurrent
+          # jobs or stale processes from prior cancelled runs (see #2450).
+          PORT=$(python3 -c "import socket; s=socket.socket(); s.bind(('', 0)); print(s.getsockname()[1]); s.close()")
+          echo "PORT=${PORT}" >> "$GITHUB_ENV"
+          echo "BASE=http://localhost:${PORT}" >> "$GITHUB_ENV"
          T="lpe2e-real-admin-${{ github.run_id }}-${{ github.run_attempt }}"
          echo "ADMIN_TOKEN=${T}" >> "$GITHUB_ENV"
          echo "MOLECULE_ADMIN_TOKEN=${T}" >> "$GITHUB_ENV"
-          echo "BASE=http://localhost:8080" >> "$GITHUB_ENV"
          echo "MOLECULE_ENV=development" >> "$GITHUB_ENV"
          echo "SECRETS_ENCRYPTION_KEY=lpe2e-test-encryption-key-32bytes!!" >> "$GITHUB_ENV"

@@ -312,8 +310,9 @@ jobs:

      - name: Kill stale platform-server before start (issue #1046)
        run: |
-          # Same fixed-:8080 hygiene as the stub job — free the port from any
-          # zombie left by a cancelled run before this job binds it.
+          # Dynamic port allocation (see #2450) eliminates the fixed-port race.
+          # We still sweep by process name to avoid leaking platform-server
+          # processes on the shared runner.
          killed=0
          for pid in $(grep -l "platform-serve" /proc/[0-9]*/comm 2>/dev/null); do
            kpid="${pid%/comm}"; kpid="${kpid##*/}"
@@ -325,30 +324,23 @@ jobs:
            fi
          done
          if [ "$killed" -gt 0 ]; then echo "Killed $killed stale platform-server process(es)."; else echo "No platform-server-named process found."; fi
-          # Belt-and-braces: free :8080 from ANY holder regardless of process name
-          # (a differently-named squatter survives the comm-name scan above, makes
-          # our bind FATAL, and can false-positive the /health probe). Mirrors the
-          # stub job's no-flakes fix (tracked alongside #2430).
-          if command -v fuser >/dev/null 2>&1; then fuser -k 8080/tcp 2>/dev/null || true; fi
-          if command -v lsof  >/dev/null 2>&1; then lsof -ti tcp:8080 2>/dev/null | xargs -r kill -9 2>/dev/null || true; fi
-          sleep 2
-          echo ":8080 freed (comm-scan + port-scan swept any squatter)."
+          sleep 1

      - name: Start platform (background)
        working-directory: workspace-server
        run: |
-          PORT=8080 ./platform-server > platform.log 2>&1 &
+          PORT=$PORT ./platform-server > platform.log 2>&1 &
          echo $! > platform.pid

      - name: Wait for /health (+ migrations applied)
        run: |
          DEADLINE=300; PID="$(cat workspace-server/platform.pid 2>/dev/null || true)"; start=$(date +%s)
          while :; do
-            # Verify OUR server owns :8080 before trusting /health (no-flakes RCA):
-            # our server binds :8080 or exits FATAL, so checking our PID first stops
-            # a squatter answering /health on :8080 from false-positiving the gate.
+            # Verify OUR server is still alive before trusting /health. Our server
+            # binds the allocated port or exits FATAL, so checking our PID first
+            # stops a squatter from false-positiving the gate (no-flakes RCA).
            if [ -n "$PID" ] && ! kill -0 "$PID" 2>/dev/null; then
-              echo "::error::platform-server exited early (failed to bind :8080 or crashed)"; cat workspace-server/platform.log || true; exit 1
+              echo "::error::platform-server exited early (failed to bind or crashed)"; cat workspace-server/platform.log || true; exit 1
            fi
            if curl -sf "$BASE/health" >/dev/null; then
              tables=$(docker exec "$PG_CONTAINER" psql -U dev -d molecule -tAc \
@@ -248,16 +248,36 @@ jobs:
            --tag "${STAGING_TENANT_IMAGE_NAME}:${TAG_LATEST}"
          )

-          docker buildx build \
-            --file ./workspace-server/Dockerfile.tenant \
-            --build-arg NEXT_PUBLIC_PLATFORM_URL= \
-            --build-arg GIT_SHA="${GIT_SHA}" \
-            --label "org.opencontainers.image.source=https://git.moleculesai.app/molecule-ai/${REPO}" \
-            --label "org.opencontainers.image.revision=${GIT_SHA}" \
-            --label "org.opencontainers.image.created=$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
-            --label "molecule.workflow.run_id=${GITHUB_RUN_ID}" \
-            "${build_tags[@]}" \
-            --push .
+          # Retry loop: buildkit EOF (internal#2468) is often transient on the
+          # publish runner under memory pressure. Up to 3 attempts with a fresh
+          # builder each time so a crashed buildkit doesn't poison the next try.
+          for attempt in 1 2 3; do
+            echo "::notice::Tenant image build attempt ${attempt}/3 ..."
+            builder="tenant-builder-${GITHUB_RUN_ID}-${attempt}"
+            docker buildx create --name "${builder}" --use >/dev/null 2>&1 || true
+            if docker buildx build \
+                --builder "${builder}" \
+                --file ./workspace-server/Dockerfile.tenant \
+                --build-arg NEXT_PUBLIC_PLATFORM_URL= \
+                --build-arg GIT_SHA="${GIT_SHA}" \
+                --label "org.opencontainers.image.source=https://git.moleculesai.app/molecule-ai/${REPO}" \
+                --label "org.opencontainers.image.revision=${GIT_SHA}" \
+                --label "org.opencontainers.image.created=$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
+                --label "molecule.workflow.run_id=${GITHUB_RUN_ID}" \
+                "${build_tags[@]}" \
+                --push .; then
+              docker buildx rm "${builder}" >/dev/null 2>&1 || true
+              echo "::notice::Tenant image build succeeded on attempt ${attempt}"
+              break
+            fi
+            echo "::warning::Tenant image build attempt ${attempt} failed — cleaning builder and retrying"
+            docker buildx rm "${builder}" >/dev/null 2>&1 || true
+            if [ "$attempt" -eq 3 ]; then
+              echo "::error::Tenant image build failed after 3 attempts"
+              exit 1
+            fi
+            sleep 10
+          done

  # bp-exempt: production deploy side-effect; merge is gated by CI / all-required and this job waits for push CI before acting.
  deploy-production:
@@ -21,15 +21,21 @@ on:
    branches: [main, staging]
    paths:
      - '.gitea/scripts/review-check.sh'
+      - '.gitea/scripts/_approval_validator.py'
+      - '.gitea/scripts/_review_check_filter.py'
      - '.gitea/scripts/tests/test_review_check.sh'
      - '.gitea/scripts/tests/_review_check_fixture.py'
+      - '.gitea/scripts/tests/test_approval_validator.py'
      - '.gitea/workflows/review-check-tests.yml'
  pull_request:
    branches: [main, staging]
    paths:
      - '.gitea/scripts/review-check.sh'
+      - '.gitea/scripts/_approval_validator.py'
+      - '.gitea/scripts/_review_check_filter.py'
      - '.gitea/scripts/tests/test_review_check.sh'
      - '.gitea/scripts/tests/_review_check_fixture.py'
+      - '.gitea/scripts/tests/test_approval_validator.py'
      - '.gitea/workflows/review-check-tests.yml'
  workflow_dispatch:

@@ -70,3 +76,16 @@ jobs:

      - name: Run review-check.sh regression suite
        run: bash .gitea/scripts/tests/test_review_check.sh
+
+      - name: SSOT approval-validator unit tests (SEV-1 internal#812)
+        # The Python unit tests for _approval_validator.py are
+        # mutation-verified — every fail-closed branch has an explicit
+        # REJECT assertion. A reviewer who weakens the predicate trips
+        # these in CI.
+        run: |
+          # The test file lives in .gitea/scripts/tests/ with no __init__.py,
+          # so `unittest discover -s .gitea/scripts` finds 0 tests (the SEV-1
+          # suite silently never ran — a CI gap fixed alongside internal#812).
+          # Run the file directly; it self-inserts its sys.path and calls
+          # unittest.main(), so a failing assertion exits non-zero and fails CI.
+          python3 .gitea/scripts/tests/test_approval_validator.py -v
@@ -58,22 +58,51 @@ jobs:
          python-version: '3.11'
      - name: Install .gitea script test dependencies
        run: python -m pip install --quiet 'pytest==9.0.2' 'PyYAML==6.0.2'
-      - name: Run scripts/ unittests, if any
+      - name: Run scripts/ unittests (fail-closed on 0 collected)
        # Top-level scripts/ tests live alongside their target file. The
        # runtime packaging tests moved to molecule-ai-workspace-runtime, so
-        # this pass may legitimately find no tests.
+        # this pass may legitimately find NO test files today.
+        #
+        # Gate-integrity fix: the previous guard keyed off `rc==5` to detect
+        # "no tests collected", but unittest only exits 5 ("NO TESTS RAN") on
+        # Python 3.12+; the 3.11 pinned above exits 0 on 0 tests. The guard therefore
+        # never fired, so any test_*.py added here would silently run 0 tests
+        # while this step stayed GREEN. A green step that runs 0 tests is
+        # worse than a red one. We now fail-closed:
+        #   - genuinely NO test_*.py present  -> loud SKIP (legitimate no-op)
+        #   - test_*.py present but 0 collected -> FAIL (empty file / no TestCase; a broken import is collected as a failing test and reds the run below)
        working-directory: scripts
        run: |
-          set +e
-          python -m unittest discover -t . -p 'test_*.py' -v
-          rc=$?
-          if [ "$rc" -eq 5 ]; then
-            echo "No top-level scripts/ unittest files found; skipping."
+          set -euo pipefail
+          # Non-recursive count: scripts/ has no __init__.py, so unittest
+          # discover does not recurse into subdirs (ops/ is run separately
+          # below) — top-level files are the entire discovery scope here.
+          nfiles=$(find . -maxdepth 1 -name 'test_*.py' | wc -l | tr -d ' ')
+          if [ "$nfiles" -eq 0 ]; then
+            echo "SKIP: no top-level scripts/ test_*.py files present (genuine no-op)."
            exit 0
          fi
-          exit "$rc"
+          echo "Found $nfiles top-level scripts/ test_*.py file(s); asserting they collect >0 tests."
+          ncollected=$(python -c "import unittest; print(unittest.TestLoader().discover('.', pattern='test_*.py', top_level_dir='.').countTestCases())")
+          echo "Collected $ncollected test case(s)."
+          if [ "$ncollected" -eq 0 ]; then
+            echo "FAIL: test_*.py file(s) present but 0 tests collected (broken import / empty file / discovery error)."
+            exit 1
+          fi
+          python -m unittest discover -t . -p 'test_*.py' -v
      - name: Run scripts/ops/ unittests (sweep_cf_decide, ...)
+        # Real gate: scripts/ops/ must always run tests. Assert >0 collected so
+        # deleting all test files (or breaking an import) can't pass GREEN by
+        # running 0 tests — same gate-integrity class as the scripts/ step.
        working-directory: scripts/ops
-        run: python -m unittest discover -p 'test_*.py' -v
+        run: |
+          set -euo pipefail
+          ncollected=$(python -c "import unittest; print(unittest.TestLoader().discover('.', pattern='test_*.py').countTestCases())")
+          echo "scripts/ops/ collected $ncollected test case(s)."
+          if [ "$ncollected" -eq 0 ]; then
+            echo "FAIL: scripts/ops/ collected 0 tests — this gate must run real tests (deleted/broken import?)."
+            exit 1
+          fi
+          python -m unittest discover -p 'test_*.py' -v
      - name: Run .gitea/scripts pytest suite
        run: python -m pytest .gitea/scripts/tests -q
@@ -1,7 +1,14 @@
 import { test, expect } from "@playwright/test";
+import type { Page } from "@playwright/test";
 import { startEchoRuntime } from "./fixtures/echo-runtime";
 import { seedWorkspace, startHeartbeat, cleanupWorkspace } from "./fixtures/chat-seed";

+/** Enter the Org-map view so the Canvas (React Flow graph) mounts. */
+async function enterMapView(page: Page): Promise<void> {
+  const btn = page.getByTestId("nav-map");
+  await expect(btn, "rail button nav-map missing").toBeVisible({ timeout: 10_000 });
+  await btn.click();
+}

 test.describe("Desktop ChatTab", () => {
  let cleanup: () => Promise<void> = async () => {};
@@ -29,6 +36,7 @@ test.describe("Desktop ChatTab", () => {
  test.beforeEach(async ({ page }) => {
    await page.setViewportSize({ width: 1280, height: 800 });
    await page.goto("/");
+    await enterMapView(page);
    await page.waitForSelector(".react-flow__node", { timeout: 10_000 });
    // Dismiss onboarding guide if present.
    const skipGuide = page.getByText("Skip guide");
@@ -67,6 +75,7 @@ test.describe("Desktop ChatTab", () => {
    await expect(page.getByText("Echo: Persistence test")).toBeVisible({ timeout: 15_000 });

    await page.reload();
+    await enterMapView(page);
    await page.waitForSelector(".react-flow__node", { timeout: 10_000 });
    await page.getByText(workspaceName, { exact: true }).first().click();
    await page.locator('#tab-chat').click();
@@ -143,6 +152,7 @@ test.describe("Desktop ChatTab — Markdown rendering", () => {
  test.beforeEach(async ({ page }) => {
    await page.setViewportSize({ width: 1280, height: 800 });
    await page.goto("/");
+    await enterMapView(page);
    await page.waitForSelector(".react-flow__node", { timeout: 10_000 });
    const skipGuide2 = page.getByText("Skip guide");
    if (await skipGuide2.isVisible().catch(() => false)) {
@@ -17,6 +17,7 @@ import { WORKSPACE_KIND } from "@/lib/workspace-kind";
 import { stripPlatformRootForMap } from "@/store/canvas-topology";
 import { useTheme } from "@/lib/theme-provider";
 import { A2ATopologyOverlay } from "./A2ATopologyOverlay";
+import { MessageFlightLayer } from "./MessageFlightLayer";
 import { WorkspaceNode } from "./WorkspaceNode";
 import { SidePanel } from "./SidePanel";
 import { CreateWorkspaceButton } from "./CreateWorkspaceDialog";
@@ -371,6 +372,10 @@ function CanvasInner() {
            nodeBorderRadius={4}
          />
          <DropTargetBadge />
+          {/* Flies an envelope between agents on each delegate/message event.
+              Inside <ReactFlow> so its ViewportPortal renders in flow coords
+              and tracks pan/zoom. */}
+          <MessageFlightLayer />
        </ReactFlow>

        {/* Screen-reader live region — announces workspace count on initial load and
@@ -0,0 +1,84 @@
+/** FlightEnvelope — a single envelope that animates from `from` to `to` and
+ *  fades out, used by both the canvas (flow coords inside a ViewportPortal) and
+ *  the concierge home (screen coords inside a fixed overlay). The parent owns
+ *  the coordinate space; this component only animates the translate delta.
+ *
+ *  Uses the Web Animations API so the from/to delta can be dynamic per flight
+ *  (a static CSS @keyframes can't translate to a runtime-computed point). */
+import { useEffect, useRef } from "react";
+import { FLIGHT_DURATION_MS, type A2AFlightKind } from "@/hooks/useA2AFlights";
+
+/** Stroke colour by activity kind — mirrors CommunicationOverlay's palette
+ *  (send = cyan, receive = violet/accent, task = warm) so the two surfaces
+ *  read as the same event. */
+const KIND_COLOR: Record<A2AFlightKind, string> = {
+  send: "#22d3ee",
+  receive: "#8b5cf6",
+  task: "#f5a623",
+};
+
+export interface Point {
+  x: number;
+  y: number;
+}
+
+export function FlightEnvelope({
+  from,
+  to,
+  kind,
+}: {
+  from: Point;
+  to: Point;
+  kind: A2AFlightKind;
+}) {
+  const ref = useRef<HTMLDivElement>(null);
+
+  useEffect(() => {
+    const el = ref.current;
+    // Element.animate is unavailable in some test/SSR environments — degrade to
+    // a static (instantly-finished) envelope rather than throw.
+    if (!el || typeof el.animate !== "function") return;
+    const dx = to.x - from.x;
+    const dy = to.y - from.y;
+    const anim = el.animate(
+      [
+        { transform: "translate(-50%,-50%) translate(0px,0px) scale(0.45)", opacity: 0 },
+        { opacity: 1, offset: 0.16 },
+        { opacity: 1, offset: 0.8 },
+        { transform: `translate(-50%,-50%) translate(${dx}px,${dy}px) scale(1)`, opacity: 0 },
+      ],
+      { duration: FLIGHT_DURATION_MS, easing: "cubic-bezier(0.45, 0, 0.25, 1)", fill: "forwards" },
+    );
+    return () => anim.cancel();
+  }, [from.x, from.y, to.x, to.y]);
+
+  const color = KIND_COLOR[kind];
+  return (
+    <div
+      ref={ref}
+      data-testid="flight-envelope"
+      aria-hidden="true"
+      style={{
+        position: "absolute",
+        left: from.x,
+        top: from.y,
+        pointerEvents: "none",
+        willChange: "transform, opacity",
+        filter: "drop-shadow(0 1px 3px rgba(0,0,0,0.45))",
+        zIndex: 6,
+      }}
+    >
+      <svg width="22" height="22" viewBox="0 0 24 24" fill="none" aria-hidden="true">
+        <rect x="2.5" y="5.5" width="19" height="13" rx="2.5" fill="#0b0b0f" stroke={color} strokeWidth="1.6" />
+        <path
+          d="M3.5 7.5l8.5 6 8.5-6"
+          stroke={color}
+          strokeWidth="1.6"
+          fill="none"
+          strokeLinecap="round"
+          strokeLinejoin="round"
+        />
+      </svg>
+    </div>
+  );
+}
@@ -0,0 +1,46 @@
+/** MessageFlightLayer — flies an envelope from the source agent to the target
+ *  agent on the spatial canvas whenever a delegate / message event fires.
+ *
+ *  Mounted INSIDE <ReactFlow> so its ViewportPortal places the envelope in flow
+ *  coordinates; it therefore pans and zooms with the canvas for free. The
+ *  flight lifecycle (which events become envelopes, reduced-motion opt-out,
+ *  expiry) lives in useA2AFlights — this component only resolves node centres
+ *  and renders. */
+import { ViewportPortal, type Node } from "@xyflow/react";
+import { useCanvasStore } from "@/store/canvas";
+import { useA2AFlights } from "@/hooks/useA2AFlights";
+import { FlightEnvelope, type Point } from "./FlightEnvelope";
+import type { WorkspaceNodeData } from "@/store/canvas";
+
+// Fallback node footprint when React Flow has not measured a node yet. Matches
+// WorkspaceNode's leaf size (w-[300px] min-h-[176px]); a slightly-off centre
+// for the first frame after mount is invisible at flight scale.
+const DEFAULT_W = 300;
+const DEFAULT_H = 176;
+
+function nodeCenter(n: Node<WorkspaceNodeData>): Point {
+  const w = n.measured?.width ?? DEFAULT_W;
+  const h = n.measured?.height ?? DEFAULT_H;
+  return { x: n.position.x + w / 2, y: n.position.y + h / 2 };
+}
+
+export function MessageFlightLayer() {
+  const flights = useA2AFlights();
+  const nodes = useCanvasStore((s) => s.nodes);
+
+  if (flights.length === 0) return null;
+
+  return (
+    <ViewportPortal>
+      {flights.map((f) => {
+        const src = nodes.find((n) => n.id === f.sourceId);
+        const dst = nodes.find((n) => n.id === f.targetId);
+        // Both endpoints must be on-canvas to draw a path between them.
+        if (!src || !dst) return null;
+        return (
+          <FlightEnvelope key={f.key} from={nodeCenter(src)} to={nodeCenter(dst)} kind={f.kind} />
+        );
+      })}
+    </ViewportPortal>
+  );
+}
@@ -9,6 +9,7 @@ import { showToast } from "@/components/Toaster";
 import type { ActivityEntry } from "@/types/activity";
 import { Canvas } from "@/components/Canvas";
 import { CommunicationOverlay } from "@/components/CommunicationOverlay";
+import { MessageFlightHome } from "./MessageFlightHome";
 import { ChatTab } from "@/components/tabs/ChatTab";
 import { WorkspacePanelTabs } from "@/components/WorkspacePanelTabs";
 import { SettingsTabs } from "@/components/settings";
@@ -237,6 +238,7 @@ export function ConciergeShell() {
        tabIndex={0}
        data-testid="agent-tree-node"
        data-node-name={n.data.name}
+        data-ws-id={n.id}
        data-platform={isPlatform ? "true" : "false"}
        data-depth={depth}
        className={`${s.ws} ${selectedNodeId === n.id ? s.active : ""}`}
@@ -299,6 +301,8 @@ export function ConciergeShell() {

  return (
    <div className={s.root}>
+      {/* Envelope flies between agent rows on each delegate/message event. */}
+      <MessageFlightHome />
      <div className={`${s.app} ${railOpen ? s.railOpen : ""}`}>
        {/* ICON RAIL */}
        <nav className={s.rail}>
@@ -0,0 +1,50 @@
+/** MessageFlightHome — the concierge-home counterpart of MessageFlightLayer.
+ *  The home view is a vertical agent tree (not a spatial canvas), so an envelope
+ *  flies between the source and target agent ROWS. It shares the exact same
+ *  flight stream (useA2AFlights) as the canvas, and resolves endpoints from each
+ *  row's DOM rect (rows carry data-ws-id). Reduced-motion is honoured by the
+ *  shared hook (it emits no flights). */
+import { useRef } from "react";
+import { useA2AFlights, type A2AFlight } from "@/hooks/useA2AFlights";
+import { FlightEnvelope, type Point } from "../FlightEnvelope";
+
+function rowCenter(wsId: string): Point | null {
+  if (typeof document === "undefined") return null;
+  const sel =
+    typeof CSS !== "undefined" && typeof CSS.escape === "function"
+      ? CSS.escape(wsId)
+      : wsId;
+  const el = document.querySelector<HTMLElement>(`[data-ws-id="${sel}"]`);
+  if (!el) return null;
+  const r = el.getBoundingClientRect();
+  return { x: r.left + r.width / 2, y: r.top + r.height / 2 };
+}
+
+/** One flight. Captures the source/target row rects ONCE on mount (a ref, not
+ *  per-render) so a later re-render or scroll mid-flight does not restart the
+ *  animation. */
+function HomeFlight({ flight }: { flight: A2AFlight }) {
+  const pos = useRef<{ from: Point; to: Point } | null>(null);
+  if (pos.current === null) {
+    const from = rowCenter(flight.sourceId);
+    const to = rowCenter(flight.targetId);
+    if (from && to) pos.current = { from, to };
+  }
+  if (!pos.current) return null; // one or both agents not visible in the tree
+  return <FlightEnvelope from={pos.current.from} to={pos.current.to} kind={flight.kind} />;
+}
+
+export function MessageFlightHome() {
+  const flights = useA2AFlights();
+  if (flights.length === 0) return null;
+  return (
+    <div
+      aria-hidden="true"
+      style={{ position: "fixed", inset: 0, pointerEvents: "none", zIndex: 50 }}
+    >
+      {flights.map((f) => (
+        <HomeFlight key={f.key} flight={f} />
+      ))}
+    </div>
+  );
+}
@@ -3,13 +3,36 @@
 import { useEffect, useMemo, useState } from "react";
 import { api } from "@/lib/api";
 import { runtimeDisplayName } from "@/lib/runtime-names";
+import { isSaaSTenant } from "@/lib/tenant";
 import { useCanvasStore, type WorkspaceNodeData } from "@/store/canvas";
 import type { WorkspaceCompute } from "@/store/socket";

-const INSTANCE_TYPES = ["t3.medium", "t3.large", "t3.xlarge", "t3.2xlarge", "m6i.large", "m6i.xlarge", "c6i.xlarge"];
+// Machine sizes keyed by cloud provider — an AWS t3.* is meaningless on Hetzner,
+// etc. MUST mirror the workspace-server workspaceComputeInstanceAllowlist (which
+// mirrors the CP provider configs); the PATCH validation rejects a mismatch 400.
+const INSTANCE_TYPES_BY_PROVIDER: Record<string, string[]> = {
+  aws: ["t3.medium", "t3.large", "t3.xlarge", "t3.2xlarge", "m6i.large", "m6i.xlarge", "c6i.xlarge"],
+  hetzner: ["cpx11", "cpx21", "cpx31", "cpx41", "cpx51", "cax11", "cax21", "cax31", "cax41"],
+  gcp: ["e2-small", "e2-medium", "e2-standard-2", "e2-standard-4", "e2-standard-8"],
+};
+const DEFAULT_INSTANCE_BY_PROVIDER: Record<string, string> = {
+  aws: "t3.medium", hetzner: "cpx31", gcp: "e2-standard-2",
+};
+const normalizeProvider = (p?: string): string => (p === "gcp" || p === "hetzner" ? p : "aws");
+const instanceTypesForProvider = (p?: string): string[] =>
+  INSTANCE_TYPES_BY_PROVIDER[normalizeProvider(p)] ?? INSTANCE_TYPES_BY_PROVIDER.aws;
+const defaultInstanceForProvider = (p?: string): string =>
+  DEFAULT_INSTANCE_BY_PROVIDER[normalizeProvider(p)] ?? "t3.medium";
+
+// Editable cloud-provider options (multi-provider RFC) — mirrors CreateWorkspaceDialog.
+const CLOUD_PROVIDER_OPTIONS = [
+  { value: "aws", label: "AWS (default)" },
+  { value: "gcp", label: "GCP" },
+  { value: "hetzner", label: "Hetzner" },
+];
+
 const RUNTIME_OPTIONS = ["claude-code", "codex", "hermes", "openclaw", "kimi", "kimi-cli", "external"];
 const RESOLUTIONS = ["1280x720", "1440x900", "1920x1080", "2560x1440"];
-const DEFAULT_HEADLESS_INSTANCE_TYPE = "t3.medium";
 const DEFAULT_HEADLESS_ROOT_GB = 30;

 type Props = {
@@ -23,6 +46,7 @@ type Props = {

 type FormState = {
  runtime: string;
+  provider: string; // cloud backend; editable in SaaS (in-place switch recreates the box)
  instanceType: string;
  rootGB: string;
  displayEnabled: boolean;
@@ -38,16 +62,16 @@ const DATA_PERSISTENCE_OPTIONS = ["", "persist", "ephemeral"];
 const dataPersistenceLabel = (v: string): string =>
  v === "persist" ? "Always keep (persist)" : v === "ephemeral" ? "Don't keep (ephemeral)" : "Auto";

-// Cloud/compute backend display name. The provider is chosen at create time and
-// is NOT editable here (changing a workspace's cloud requires a recreate), so
-// it renders as a read-only badge — but we must preserve it across Save (the
-// compute payload is rebuilt below, and dropping it would wipe the column).
+// Cloud/compute backend display name (read-only fallback for non-SaaS / legacy).
 const cloudProviderLabel = (v: string | undefined): string =>
  v === "gcp" ? "GCP" : v === "hetzner" ? "Hetzner" : "AWS";

 export function ContainerConfigTab({ workspaceId, data }: Props) {
+  // Provider is editable only in SaaS (CP-provisioned boxes). Local/Docker has no
+  // cloud-provider concept, so we keep the read-only badge there.
+  const isSaaS = useMemo(() => isSaaSTenant(), []);
  const runtime = data.runtime;
-  const provider = data.compute?.provider; // read-only; set at create time
+  const provider = data.compute?.provider;
  const instanceType = data.compute?.instance_type;
  const rootGB = data.compute?.volume?.root_gb;
  const displayMode = data.compute?.display?.mode;
@@ -56,8 +80,8 @@ export function ContainerConfigTab({ workspaceId, data }: Props) {
  const displayHeight = data.compute?.display?.height;
  const dataPersistence = data.compute?.data_persistence;
  const initial = useMemo(
-    () => formFromData({ runtime, instanceType, rootGB, displayMode, displayProtocol, displayWidth, displayHeight, dataPersistence }),
-    [runtime, instanceType, rootGB, displayMode, displayProtocol, displayWidth, displayHeight, dataPersistence],
+    () => formFromData({ runtime, provider, instanceType, rootGB, displayMode, displayProtocol, displayWidth, displayHeight, dataPersistence }),
+    [runtime, provider, instanceType, rootGB, displayMode, displayProtocol, displayWidth, displayHeight, dataPersistence],
  );
  const [form, setForm] = useState<FormState>(initial);
  const [saving, setSaving] = useState(false);
@@ -87,6 +111,21 @@ export function ContainerConfigTab({ workspaceId, data }: Props) {
    try {
      let applyTemplateOnRestart = data.applyTemplateOnRestart ?? false;
      if (dirty) {
+        // In-place cloud switch is DESTRUCTIVE: changing the provider recreates the
+        // box on the new cloud (the workspace-server deprovisions the old box on
+        // its old cloud first, then the restart provisions on the new one). Confirm
+        // before doing it — the current box and any non-persisted state are lost.
+        const providerChanged = normalizeProvider(form.provider) !== normalizeProvider(initial.provider);
+        if (providerChanged && typeof window !== "undefined") {
+          const ok = window.confirm(
+            `Switch this workspace to ${cloudProviderLabel(form.provider)}? This RECREATES the box on the new cloud — the current box and any non-persisted state are replaced.`,
+          );
+          if (!ok) {
+            setSaving(false);
+            return;
+          }
+        }
+
        const rootGB = parseInt(form.rootGB, 10);
        if (!Number.isFinite(rootGB)) {
          setError("Root volume must be a number");
@@ -102,10 +141,11 @@ export function ContainerConfigTab({ workspaceId, data }: Props) {
            : { mode: "none" },
          // internal#734: omit when "auto" so the wire/default behavior is unchanged.
          ...(form.dataPersistence ? { data_persistence: form.dataPersistence } : {}),
-          // Preserve the create-time cloud provider — it's not editable here, but
-          // this PATCH rebuilds the whole compute object, so omitting it would
-          // wipe the persisted provider (and mislead the badge after a Save).
-          ...(provider ? { provider } : {}),
+          // Cloud backend: send the (possibly switched) provider. Omit for the
+          // default (aws) so a non-switching AWS save keeps the wire unchanged;
+          // a switch TO aws (omit) vs FROM aws (explicit) both register correctly
+          // because the workspace-server normalizes ""→aws when diffing.
+          ...(normalizeProvider(form.provider) !== "aws" ? { provider: normalizeProvider(form.provider) } : {}),
        };

        const resp = await api.patch<{ needs_restart?: boolean }>(`/workspaces/${workspaceId}`, {
@@ -140,15 +180,16 @@ export function ContainerConfigTab({ workspaceId, data }: Props) {
        <div className="mb-3 flex items-center justify-between gap-3">
          <div className="flex items-center gap-2">
            <h3 className="text-sm font-semibold text-ink">Container Config</h3>
-            {/* Read-only cloud-provider badge — which cloud this workspace's box
-                runs on (AWS/GCP/Hetzner). Defaults to AWS when unset (legacy
-                rows). Set at create time in the Create Workspace dialog. */}
-            <span
-              title="Cloud provider for this workspace's compute (set at create time)"
-              className="rounded-full border border-line/60 bg-surface-sunken px-2 py-0.5 font-mono text-[10px] uppercase tracking-wide text-ink-mid"
-            >
-              {cloudProviderLabel(provider)}
-            </span>
+            {/* Non-SaaS (local/Docker) has no cloud-provider concept → read-only
+                badge. In SaaS the provider is an editable selector in the form. */}
+            {!isSaaS && (
+              <span
+                title="Cloud provider for this workspace's compute"
+                className="rounded-full border border-line/60 bg-surface-sunken px-2 py-0.5 font-mono text-[10px] uppercase tracking-wide text-ink-mid"
+              >
+                {cloudProviderLabel(provider)}
+              </span>
+            )}
          </div>
          {data.needsRestart && <span className="text-[11px] text-warm">Restart required</span>}
        </div>
@@ -162,11 +203,32 @@ export function ContainerConfigTab({ workspaceId, data }: Props) {
            optionLabel={runtimeDisplayName}
            onChange={(runtime) => setForm((s) => ({ ...s, runtime }))}
          />
+          {isSaaS && (
+            <SelectField
+              id="cloud-provider"
+              label="Cloud provider"
+              value={normalizeProvider(form.provider)}
+              options={CLOUD_PROVIDER_OPTIONS.map((p) => p.value)}
+              optionLabel={(v) => CLOUD_PROVIDER_OPTIONS.find((p) => p.value === v)?.label ?? v}
+              // Switching cloud resets the instance type to the new provider's
+              // default (an AWS t3.* is invalid on Hetzner, etc.) — also keeps the
+              // instance-type dropdown below in sync with the provider's sizes.
+              onChange={(provider) =>
+                setForm((s) => ({
+                  ...s,
+                  provider,
+                  instanceType: instanceTypesForProvider(provider).includes(s.instanceType)
+                    ? s.instanceType
+                    : defaultInstanceForProvider(provider),
+                }))
+              }
+            />
+          )}
          <SelectField
            id="instance-type"
            label="Instance type"
            value={form.instanceType}
-            options={INSTANCE_TYPES}
+            options={instanceTypesForProvider(form.provider)}
            onChange={(instanceType) => setForm((s) => ({ ...s, instanceType }))}
          />
          <label className="grid gap-1" htmlFor="root-volume-gb">
@@ -270,6 +332,7 @@ export function ContainerConfigTab({ workspaceId, data }: Props) {

 function formFromData(data: {
  runtime?: string;
+  provider?: string;
  instanceType?: string;
  rootGB?: number;
  displayMode?: string;
@@ -281,9 +344,11 @@ function formFromData(data: {
  const width = data.displayWidth ?? 1920;
  const height = data.displayHeight ?? 1080;
  const resolution = `${width}x${height}`;
+  const provider = normalizeProvider(data.provider);
  return {
    runtime: data.runtime || "claude-code",
-    instanceType: data.instanceType || DEFAULT_HEADLESS_INSTANCE_TYPE,
+    provider,
+    instanceType: data.instanceType || defaultInstanceForProvider(provider),
    rootGB: String(data.rootGB || DEFAULT_HEADLESS_ROOT_GB),
    displayEnabled: !!data.displayMode && data.displayMode !== "none",
    displayMode: data.displayMode && data.displayMode !== "none" ? data.displayMode : "desktop-control",
@@ -23,6 +23,13 @@ vi.mock("@/store/canvas", () => ({
  ),
 }));

+// SaaS so the editable cloud-provider selector renders (non-SaaS shows a read-only
+// badge). Existing tests keep provider=aws (default), which is omitted from the
+// PATCH payload, so their assertions are unaffected.
+vi.mock("@/lib/tenant", () => ({
+  isSaaSTenant: () => true,
+}));
+
 import { ContainerConfigTab } from "../ContainerConfigTab";

 afterEach(() => {
@@ -314,4 +321,67 @@ describe("ContainerConfigTab", () => {
    await waitFor(() => expect(restartWorkspace).toHaveBeenCalledWith("ws-compute", { applyTemplate: true }));
    expect(apiPatch).not.toHaveBeenCalled();
  });
+
+  it("switches cloud provider — keys the instance-type list to the provider, confirms the recreate, and PATCHes the new provider", async () => {
+    const confirmSpy = vi.spyOn(window, "confirm").mockReturnValue(true);
+    render(
+      <ContainerConfigTab
+        workspaceId="ws-switch"
+        data={{
+          runtime: "claude-code",
+          status: "online",
+          needsRestart: false,
+          activeTasks: 0,
+          maxConcurrentTasks: null,
+          workspaceAccess: "read-write",
+          deliveryMode: "push",
+          compute: { instance_type: "t3.large", provider: "aws", volume: { root_gb: 30 } },
+        }}
+      />,
+    );
+
+    const providerSel = screen.getByLabelText("Cloud provider");
+    expect(providerSel).toHaveProperty("value", "aws");
+    expect(screen.getByLabelText("Instance type")).toHaveProperty("value", "t3.large");
+
+    // Switch to Hetzner → the instance type resets to the Hetzner default (an AWS
+    // t3.* is invalid on Hetzner) and the options become Hetzner sizes.
+    fireEvent.change(providerSel, { target: { value: "hetzner" } });
+    expect(screen.getByLabelText("Instance type")).toHaveProperty("value", "cpx31");
+
+    fireEvent.click(screen.getByRole("button", { name: "Save" }));
+    await waitFor(() => expect(apiPatch).toHaveBeenCalledTimes(1));
+    expect(confirmSpy).toHaveBeenCalled(); // destructive recreate confirmed
+    const body = apiPatch.mock.calls[0][1] as { compute: { provider?: string; instance_type?: string } };
+    expect(body.compute.provider).toBe("hetzner");
+    expect(body.compute.instance_type).toBe("cpx31");
+    confirmSpy.mockRestore();
+  });
+
+  it("does not treat a non-provider edit as a recreate (no confirm; aws default omitted)", async () => {
+    const confirmSpy = vi.spyOn(window, "confirm").mockReturnValue(true);
+    render(
+      <ContainerConfigTab
+        workspaceId="ws-noswitch"
+        data={{
+          runtime: "claude-code",
+          status: "online",
+          needsRestart: false,
+          activeTasks: 0,
+          maxConcurrentTasks: null,
+          workspaceAccess: "read-write",
+          deliveryMode: "push",
+          compute: { instance_type: "t3.large", provider: "aws", volume: { root_gb: 30 } },
+        }}
+      />,
+    );
+
+    fireEvent.change(screen.getByLabelText("Root volume"), { target: { value: "60" } });
+    fireEvent.click(screen.getByRole("button", { name: "Save" }));
+    await waitFor(() => expect(apiPatch).toHaveBeenCalledTimes(1));
+    expect(confirmSpy).not.toHaveBeenCalled();
+    const body = apiPatch.mock.calls[0][1] as { compute: { provider?: string } };
+    expect(body.compute.provider).toBeUndefined(); // aws default omitted (wire unchanged)
+    confirmSpy.mockRestore();
+  });
 });
@@ -162,6 +162,11 @@ describe("DisplayTab", () => {
      controller: "user",
      ttl_seconds: 300,
    });
+    // Defensive: the noVNC constructor is async (dynamic import), so wait
+    // for it to be called before asserting arguments (prevents flake in CI).
+    await waitFor(() => {
+      expect(mockRFBConstructor).toHaveBeenCalled();
+    });
    expect(mockRFBConstructor).toHaveBeenCalledWith(
      expect.any(HTMLElement),
      expect.stringContaining("/workspaces/ws-display/display/session/websockify"),
@@ -0,0 +1,105 @@
+// @vitest-environment jsdom
+/** Unit tests for useA2AFlights — the event→flight lifecycle that drives the
+ *  envelope animations on the canvas (MessageFlightLayer) and the concierge
+ *  home (MessageFlightHome). useSocketEvent is mocked so we can drive the
+ *  ACTIVITY_LOGGED handler directly. */
+import { renderHook, act } from "@testing-library/react";
+import { describe, it, expect, vi, beforeEach } from "vitest";
+
+// Capture the handler the hook registers with the socket bus. vi.hoisted is
+// required because vi.mock factories are hoisted above normal declarations and
+// may only close over hoisted state.
+const h = vi.hoisted(() => ({ captured: null as ((msg: unknown) => void) | null }));
+vi.mock("@/hooks/useSocketEvent", () => ({
+  // Stand-in for the real subscription: remember the most recent callback so a
+  // test can deliver a message to the hook by calling h.captured directly.
+  useSocketEvent: (cb: (msg: unknown) => void) => {
+    h.captured = cb;
+  },
+}));
+
+import { useA2AFlights, FLIGHT_DURATION_MS } from "@/hooks/useA2AFlights";
+
+/** Test helper: swap window.matchMedia for a stub whose result matches only
+ *  queries containing "reduce", and only when `reduce` is true. */
+function setReducedMotion(reduce: boolean) {
+  const fakeMatchMedia = (query: string) => {
+    const matches = reduce && query.includes("reduce");
+    return {
+      matches,
+      media: query,
+      onchange: null,
+      addEventListener: vi.fn(),
+      removeEventListener: vi.fn(),
+      addListener: vi.fn(),
+      removeListener: vi.fn(),
+      dispatchEvent: vi.fn(),
+    };
+  };
+  window.matchMedia = vi.fn().mockImplementation(fakeMatchMedia);
+}
+
+/** Wrap `payload` in a socket-bus message from workspace "a" with a fixed timestamp. */
+function msg(payload: Record<string, unknown>, event = "ACTIVITY_LOGGED") {
+  return { event, workspace_id: "a", timestamp: "2026-06-08T00:00:00Z", payload };
+}
+
+/** An a2a_send activity from "a" to "b"; `over` overrides individual payload fields. */
+function a2aSend(over: Record<string, unknown> = {}) {
+  const base = { activity_type: "a2a_send", source_id: "a", target_id: "b" };
+  return msg({ ...base, ...over });
+}
+
+describe("useA2AFlights", () => {
+  // Deliver one message to the handler the hook registered, wrapped in act()
+  // so the resulting state update is flushed before the next assertion.
+  const emit = (message: unknown) => {
+    act(() => h.captured?.(message));
+  };
+
+  beforeEach(() => {
+    h.captured = null;
+    vi.useRealTimers();
+    setReducedMotion(false);
+  });
+
+  it("emits a flight for an a2a_send between two distinct agents", () => {
+    const { result } = renderHook(() => useA2AFlights());
+    emit(a2aSend());
+    const flights = result.current;
+    expect(flights).toHaveLength(1);
+    expect(flights[0]).toMatchObject({ sourceId: "a", targetId: "b", kind: "send" });
+  });
+
+  it("maps a2a_receive / task_update to their kinds", () => {
+    const { result } = renderHook(() => useA2AFlights());
+    emit(a2aSend({ activity_type: "a2a_receive" }));
+    emit(a2aSend({ activity_type: "task_update" }));
+    const kinds = result.current.map((flight) => flight.kind);
+    expect(kinds).toContain("receive");
+    expect(kinds).toContain("task");
+  });
+
+  it("ignores non-A2A activity and non-ACTIVITY_LOGGED events", () => {
+    const { result } = renderHook(() => useA2AFlights());
+    // Not an A2A activity type.
+    emit(msg({ activity_type: "status_change", source_id: "a", target_id: "b" }));
+    // Control: the single valid event in this test.
+    emit(a2aSend({}));
+    // Wrong event name altogether.
+    emit({ event: "WORKSPACE_UPDATED", workspace_id: "a", payload: {} });
+    expect(result.current.every((flight) => flight.kind === "send")).toBe(true);
+    expect(result.current).toHaveLength(1); // only the one valid a2aSend
+  });
+
+  it("skips self-loops and flights with no target", () => {
+    const { result } = renderHook(() => useA2AFlights());
+    emit(a2aSend({ target_id: "a" })); // self-loop
+    emit(a2aSend({ target_id: "" })); // missing target
+    expect(result.current).toHaveLength(0);
+  });
+
+  it("emits nothing when prefers-reduced-motion is set", () => {
+    setReducedMotion(true);
+    const { result } = renderHook(() => useA2AFlights());
+    emit(a2aSend());
+    expect(result.current).toHaveLength(0);
+  });
+
+  it("emits nothing when disabled", () => {
+    const { result } = renderHook(() => useA2AFlights(false));
+    emit(a2aSend());
+    expect(result.current).toHaveLength(0);
+  });
+
+  it("expires a flight after the TTL", () => {
+    vi.useFakeTimers();
+    const { result } = renderHook(() => useA2AFlights());
+    emit(a2aSend());
+    expect(result.current).toHaveLength(1);
+    // Step past the animation window plus slack so the expiry timer has fired.
+    act(() => {
+      vi.advanceTimersByTime(FLIGHT_DURATION_MS + 300);
+    });
+    expect(result.current).toHaveLength(0);
+  });
+});
@@ -0,0 +1,103 @@
+/** useA2AFlights — turns the org's live A2A activity stream into transient
+ *  "flights" (one per delegate / message event, source → target) that an
+ *  overlay can animate as an envelope travelling between two agents.
+ *
+ *  This hook owns ONLY the event→flight lifecycle: it subscribes to the same
+ *  ACTIVITY_LOGGED WS bus the CommunicationOverlay uses, keeps a small bounded
+ *  list of in-flight envelopes, and expires each after the animation window.
+ *  The caller resolves positions and renders the envelope, so the exact same
+ *  flight data drives both the spatial canvas (flow coords) and the concierge
+ *  home (DOM row rects).
+ *
+ *  Honours `prefers-reduced-motion`: when the user opts out of motion the hook
+ *  emits no flights at all, so no envelope ever animates. */
+import { useEffect, useRef, useState } from "react";
+import { useSocketEvent } from "@/hooks/useSocketEvent";
+
+/** Which activity produced a flight: "send" (a2a_send), "receive"
+ *  (a2a_receive) or "task" (task_update). */
+export type A2AFlightKind = "send" | "receive" | "task";
+
+/** One transient envelope travelling from `sourceId` to `targetId`. */
+export interface A2AFlight {
+  /** unique per flight instance (not per pair) so a burst renders distinct envelopes */
+  key: string;
+  /** payload.source_id, falling back to the message's workspace_id */
+  sourceId: string;
+  /** payload.target_id; always non-empty and different from sourceId */
+  targetId: string;
+  kind: A2AFlightKind;
+}
+
+/** Envelope animation length (ms). Kept in sync with the overlay's
+ *  Web-Animations duration. */
+export const FLIGHT_DURATION_MS = 1200;
+/** Total time a flight stays in the list (ms): the animation plus a short tail
+ *  that gives the fade-out room to finish before the element unmounts. */
+const FLIGHT_TTL_MS = FLIGHT_DURATION_MS + 120;
+
+/** Cap concurrent envelopes so a delegation storm can't spawn unbounded DOM. */
+const MAX_CONCURRENT = 12;
+
+/** Snapshot of the current motion preference. Reports false when there is no
+ *  `window` or no `matchMedia` to ask. */
+function reducedMotionNow(): boolean {
+  if (typeof window === "undefined") return false;
+  if (typeof window.matchMedia !== "function") return false;
+  const query = window.matchMedia("(prefers-reduced-motion: reduce)");
+  return query.matches;
+}
+
+/** Subscribes to the socket bus and returns the A2A envelopes currently in flight.
+ *
+ *  @param enabled when false, incoming events are ignored; flights already in
+ *    the list still expire on their own timers.
+ *  @returns the live flights, oldest first, never more than MAX_CONCURRENT. */
+export function useA2AFlights(enabled = true): A2AFlight[] {
+  const [flights, setFlights] = useState<A2AFlight[]>([]);
+  const reduced = useRef<boolean>(reducedMotionNow());
+  // Ids of pending expiry timers. The Set is only ever mutated in place (never
+  // reassigned), so the reference captured by the unmount cleanup below still
+  // sees every timer that is live at unmount time.
+  const timers = useRef<Set<number>>(new Set<number>());
+
+  // Track reduced-motion preference changes live (a user can toggle it mid-session).
+  useEffect(() => {
+    if (typeof window === "undefined" || typeof window.matchMedia !== "function") return;
+    const mq = window.matchMedia("(prefers-reduced-motion: reduce)");
+    const onChange = () => {
+      reduced.current = mq.matches;
+      if (mq.matches) setFlights([]); // drop any in-flight envelopes immediately
+    };
+    mq.addEventListener?.("change", onChange);
+    return () => mq.removeEventListener?.("change", onChange);
+  }, []);
+
+  // Clear pending expiry timers on unmount.
+  useEffect(() => {
+    const pending = timers.current;
+    return () => {
+      pending.forEach((id) => window.clearTimeout(id));
+      pending.clear();
+    };
+  }, []);
+
+  useSocketEvent((msg) => {
+    if (!enabled || reduced.current) return;
+    if (msg.event !== "ACTIVITY_LOGGED") return;
+
+    const p = (msg.payload || {}) as {
+      activity_type?: string;
+      source_id?: string | null;
+      target_id?: string | null;
+    };
+    const t = p.activity_type;
+    if (t !== "a2a_send" && t !== "a2a_receive" && t !== "task_update") return;
+
+    const sourceId = p.source_id || msg.workspace_id;
+    const targetId = p.target_id || "";
+    // A flight needs two distinct endpoints; a self-loop or missing peer has
+    // nowhere to fly, so skip it.
+    if (!sourceId || !targetId || sourceId === targetId) return;
+
+    const kind: A2AFlightKind =
+      t === "a2a_receive" ? "receive" : t === "task_update" ? "task" : "send";
+    // The random suffix keeps keys distinct for same-pair events that share a timestamp.
+    const key = `${msg.timestamp || Date.now()}:${sourceId}:${targetId}:${Math.random()
+      .toString(36)
+      .slice(2, 8)}`;
+
+    // Keep at most MAX_CONCURRENT entries: drop the oldest before appending.
+    setFlights((prev) => [...prev.slice(-(MAX_CONCURRENT - 1)), { key, sourceId, targetId, kind }]);
+
+    const id = window.setTimeout(() => {
+      setFlights((prev) => prev.filter((f) => f.key !== key));
+      timers.current.delete(id);
+    }, FLIGHT_TTL_MS);
+    timers.current.add(id);
+  });
+
+  return flights;
+}
@@ -162,6 +162,27 @@ describe("hydrate", () => {
    useCanvasStore.getState().hydrate([ws]);
    expect(useCanvasStore.getState().nodes[0].data.currentTask).toBe("");
  });
+
+  it("preserves in-flight turn status after refresh (issue #2391)", () => {
+    // Simulates a page refresh: the canvas re-hydrates from GET /workspaces
+    // while the agent has an active in-flight turn. The store must reflect
+    // "working" immediately — no dependence on a subsequent TASK_UPDATED
+    // socket event. This prevents the "stuck idle" UX after reload.
+    const ws = makeWS({
+      id: "ws-1",
+      status: "online",
+      current_task: "Analyzing data",
+      active_tasks: 2,
+    });
+    useCanvasStore.getState().hydrate([ws]);
+    const node = useCanvasStore.getState().nodes[0];
+    expect(node.data.currentTask).toBe("Analyzing data");
+    expect(node.data.activeTasks).toBe(2);
+    expect(node.data.status).toBe("online");
+    // Defensive: the node must be considered "working" for any UI that
+    // gates on currentTask (e.g. ChatTab thinking indicator).
+    expect(!!node.data.currentTask).toBe(true);
+  });
 });

 describe("summarizeWorkspaceCapabilities", () => {
@@ -488,6 +488,12 @@ echo ""
 # Step 5 — proxy reach (ws-<id>:8000 Docker-DNS rewrite, end to end).
 # ----------------------------------------------------------------------------
 echo "--- Step 5: proxy reach (POST /workspaces/$WSID/a2a) ---"
+# Debug: print the workspace URL the platform stored so SSRF failures are
+# actionable (#2468 RCA).
+WS_DEBUG=$(admin_curl "$BASE/workspaces/$WSID")
+WS_URL_DEBUG=$(ws_field "$WS_DEBUG" "url")
+WS_STATUS_DEBUG=$(ws_field "$WS_DEBUG" "status")
+echo "  workspace url=$WS_URL_DEBUG status=$WS_STATUS_DEBUG"
 # In minimax mode we send a DETERMINISTIC known-answer prompt and assert the
 # model echoes the answer back — proving a real LLM round-trip, not just
 # reachability. Otherwise a plain "ping".
@@ -0,0 +1,286 @@
+//go:build integration
+// +build integration
+
+// postgres_replay_integration_test.go — REAL Postgres integration tests for
+// the boot-time migration runner (db.RunMigrations) and the connection
+// bootstrap (db.InitPostgres).
+//
+// Issue #2150 (SOP rule internal#765 regression-coverage). test_layer:
+// real-postgres.
+//
+// Run locally with:
+//
+//	docker run --rm -d --name pg-replay \
+//	  -e POSTGRES_PASSWORD=test -e POSTGRES_DB=molecule \
+//	  -p 55432:5432 postgres:15-alpine
+//	sleep 4
+//	cd workspace-server
+//	INTEGRATION_DB_URL="postgres://postgres:test@localhost:55432/molecule?sslmode=disable" \
+//	  go test -tags=integration ./internal/db/ -run '^TestIntegration_Migration|^TestIntegration_InitPostgres'
+//
+// In CI these run on .gitea/workflows/handlers-postgres-integration.yml,
+// which already provisions a real Postgres on the operator-host bridge and
+// triggers on workspace-server/migrations/** changes — the exact blast
+// radius this gate must cover.
+//
+// WHY A REAL DATABASE — and why the existing coverage is NOT enough
+// -----------------------------------------------------------------
+// postgres_migrate_test.go and postgres_schema_migrations_test.go are
+// sqlmock-only: they pin which SQL *statements* fire, but a mock cannot
+// execute SQL, so it cannot prove the 118-file (.up + legacy .sql) chain
+// actually REPLAYS FROM SCRATCH against a real Postgres. The CI psql loop
+// in handlers-postgres-integration.yml deliberately *skips* failing
+// migrations (`⊘ skipped`), so it would stay green even if the chain
+// stopped replaying — it is not a replay gate.
+//
+// This file closes that gap. It boots a Postgres, resets the public schema
+// to a blank slate, and runs the PRODUCTION db.RunMigrations entrypoint —
+// the same function platform boot calls — with hard-fail semantics. It
+// would FAIL (watch-fail intent) against:
+//
+//   - Issue #211: if RunMigrations regresses to globbing `*.sql` and
+//     sorting `.down.sql` before `.up.sql`, the rollback runs before the
+//     forward for any pair (020_workspace_auth_tokens was the canary),
+//     either erroring on the DROP or wiping the just-created table.
+//
+//   - The 045 crash-loop class (cp#429 / project_cp_migration_045_*): the
+//     runner re-applies every recorded-absent file every boot, so a
+//     non-idempotent migration (bare CREATE / INSERT without IF NOT EXISTS
+//     / ON CONFLICT) replays cleanly the first time and FAILS the second.
+//     TestIntegration_MigrationReplay_IsIdempotent_DoubleApply runs the
+//     full chain twice against the same DB to catch that at PR time.
+//
+//   - A new migration that depends on a table a later migration drops, or
+//     is mis-ordered in the lexicographic chain — it simply will not apply
+//     from scratch and the replay errors.
+//
+// All assertions key off the OBSERVABLE database state after the real run,
+// not a proxy for "a statement fired".
+
+package db
+
+import (
+	"database/sql"
+	"os"
+	"path/filepath"
+	"testing"
+
+	_ "github.com/lib/pq"
+)
+
+// migrationsDir is the on-disk path to the forward+legacy migration chain
+// relative to this test file (workspace-server/internal/db → ../../migrations).
+const migrationsDir = "../../migrations"
+
+// freshIntegrationDB opens $INTEGRATION_DB_URL (skipping the test if unset),
+// resets the `public` schema to an empty slate so the run is a true
+// replay-from-scratch regardless of what an earlier CI step applied, and
+// registers a Cleanup that closes the connection.
+//
+// It does NOT assign the package-global db.DB. RunMigrations operates on
+// db.DB, so callers must point DB at the returned handle themselves.
+// NOT SAFE for t.Parallel() — it owns the schema for the duration of the test.
+func freshIntegrationDB(t *testing.T) *sql.DB {
+	t.Helper()
+	url := os.Getenv("INTEGRATION_DB_URL")
+	if url == "" {
+		t.Skip("INTEGRATION_DB_URL not set; skipping real-PG replay test (local devs: see file header)")
+	}
+	conn, err := sql.Open("postgres", url)
+	if err != nil {
+		t.Fatalf("open: %v", err)
+	}
+	// Register the close before any step below can t.Fatalf, so a failed
+	// ping or schema reset does not leak the pool. The Close error is
+	// deliberately ignored: this is best-effort test teardown.
+	t.Cleanup(func() { _ = conn.Close() })
+	if err := conn.Ping(); err != nil {
+		t.Fatalf("ping: %v", err)
+	}
+	// True from-scratch: blow away any schema a prior CI step (e.g. the
+	// handlers psql apply-all loop) left behind, then start clean. This is
+	// what makes the test a *replay-from-scratch* gate rather than a
+	// re-apply-onto-existing test.
+	if _, err := conn.Exec(`DROP SCHEMA public CASCADE; CREATE SCHEMA public`); err != nil {
+		t.Fatalf("reset public schema: %v", err)
+	}
+	// gen_random_uuid() (used by 001_workspaces.sql et al.) lives in
+	// pgcrypto on PG < 13 and core on PG 13+. postgres:15-alpine has it in
+	// core, but create the extension defensively so the test does not pin a
+	// specific PG minor.
+	if _, err := conn.Exec(`CREATE EXTENSION IF NOT EXISTS pgcrypto`); err != nil {
+		t.Fatalf("create pgcrypto: %v", err)
+	}
+	return conn
+}
+
+// forwardMigrationCount counts the files RunMigrations is expected to apply:
+// every *.sql that is NOT a *.down.sql. This is derived from the real
+// directory so the gate auto-tracks new migrations without an edit here.
+func forwardMigrationCount(t *testing.T) int {
+	t.Helper()
+	all, err := filepath.Glob(filepath.Join(migrationsDir, "*.sql"))
+	if err != nil {
+		t.Fatalf("glob migrations: %v", err)
+	}
+	n := 0
+	for _, f := range all {
+		if len(f) >= len(".down.sql") && f[len(f)-len(".down.sql"):] == ".down.sql" {
+			continue
+		}
+		n++
+	}
+	if n == 0 {
+		t.Fatalf("found zero forward migrations under %s — wrong path?", migrationsDir)
+	}
+	return n
+}
+
+// TestIntegration_InitPostgres_PingSucceeds runs the production connection
+// bootstrap against a real server and checks the resulting pool is usable:
+// InitPostgres must succeed, db.DB must be set, and both a Ping and a
+// trivial SELECT must round-trip. A sqlmock test cannot exercise the real
+// ping path, which is why this lives in the integration suite.
+func TestIntegration_InitPostgres_PingSucceeds(t *testing.T) {
+	dsn := os.Getenv("INTEGRATION_DB_URL")
+	if len(dsn) == 0 {
+		t.Skip("INTEGRATION_DB_URL not set; skipping")
+	}
+
+	err := InitPostgres(dsn)
+	switch {
+	case err != nil:
+		t.Fatalf("InitPostgres against real PG failed: %v", err)
+	case DB == nil:
+		t.Fatal("InitPostgres returned nil error but db.DB is nil")
+	}
+
+	// Opened is not enough — the pool has to answer.
+	if pingErr := DB.Ping(); pingErr != nil {
+		t.Fatalf("db.DB.Ping after InitPostgres: %v", pingErr)
+	}
+
+	// A trivial query proves the connection actually serves requests.
+	var got int
+	row := DB.QueryRow("SELECT 1")
+	if scanErr := row.Scan(&got); scanErr != nil {
+		t.Fatalf("SELECT 1 round-trip: %v", scanErr)
+	}
+	if got != 1 {
+		t.Fatalf("SELECT 1 returned %d", got)
+	}
+}
+
+// TestIntegration_InitPostgres_BadDSNFails proves InitPostgres surfaces an
+// unreachable/garbage DSN as an error (the ping path), rather than handing
+// back a half-open pool. Watch-fail: if someone drops the DB.Ping() check
+// from InitPostgres, this stops returning an error and fails.
+func TestIntegration_InitPostgres_BadDSNFails(t *testing.T) {
+	if os.Getenv("INTEGRATION_DB_URL") == "" {
+		t.Skip("INTEGRATION_DB_URL not set; skipping")
+	}
+	// Valid DSN shape, but nothing is listening on this port.
+	err := InitPostgres("postgres://postgres:test@127.0.0.1:1/does_not_exist?sslmode=disable&connect_timeout=2")
+	if err == nil {
+		t.Fatal("expected InitPostgres to fail against an unreachable DSN, got nil (DB.Ping check removed?)")
+	}
+}
+
+// TestIntegration_MigrationReplay_FromScratch is the core gate. It runs the
+// PRODUCTION RunMigrations against a blank public schema and then inspects
+// the resulting database: every forward migration recorded, no rollback
+// file recorded, the load-bearing tables present, and agent_memories gone.
+//
+// Watch-fail intent:
+//   - #211 .down-wipe: a `.down.sql` leaking into the forward set would
+//     run a DROP before its CREATE → error here (hard fail), or wipe a
+//     table → the schema_migrations / table-presence assertions catch it.
+//   - mis-ordered / dangling-dependency migration → RunMigrations returns
+//     a non-nil error and this test fails.
+func TestIntegration_MigrationReplay_FromScratch(t *testing.T) {
+	// RunMigrations operates on the package-global DB.
+	DB = freshIntegrationDB(t)
+
+	if err := RunMigrations(migrationsDir); err != nil {
+		t.Fatalf("full-chain replay-from-scratch failed: %v", err)
+	}
+
+	// Every forward migration must be recorded as applied — proves none was
+	// silently skipped (the failure mode the CI psql loop tolerates).
+	expected := forwardMigrationCount(t)
+	var recorded int
+	countErr := DB.QueryRow("SELECT COUNT(*) FROM schema_migrations").Scan(&recorded)
+	if countErr != nil {
+		t.Fatalf("count schema_migrations: %v", countErr)
+	}
+	if recorded != expected {
+		t.Errorf("schema_migrations recorded %d migrations, expected %d (the full forward chain)", recorded, expected)
+	}
+
+	// No `.down.sql` may ever be recorded — that is the #211 signature.
+	var rollbacks int
+	rollbackRow := DB.QueryRow(
+		"SELECT COUNT(*) FROM schema_migrations WHERE filename LIKE '%.down.sql'",
+	)
+	if err := rollbackRow.Scan(&rollbacks); err != nil {
+		t.Fatalf("count down migrations: %v", err)
+	}
+	if rollbacks != 0 {
+		t.Errorf("a .down.sql migration was applied (#211 regression): %d recorded", rollbacks)
+	}
+
+	// Spot-check load-bearing tables that survive to HEAD of the chain.
+	// workspaces is the root table; workspace_auth_tokens was the #211
+	// canary (its data wipe regressed AdminAuth to fail-open).
+	required := []string{"workspaces", "workspace_auth_tokens", "delegations", "activity_logs"}
+	for i := range required {
+		name := required[i]
+		var present bool
+		row := DB.QueryRow(
+			"SELECT EXISTS(SELECT 1 FROM information_schema.tables WHERE table_schema='public' AND table_name=$1)",
+			name,
+		)
+		if err := row.Scan(&present); err != nil {
+			t.Fatalf("check table %s: %v", name, err)
+		}
+		if !present {
+			t.Errorf("table %q missing after full replay — chain did not land it", name)
+		}
+	}
+
+	// agent_memories is CREATEd at 008 and DROPped at the end of the chain
+	// (20260524110000_drop_agent_memories). Its absence proves the late
+	// drop migration actually ran AFTER the early create — i.e. ordering
+	// held. If the chain ever runs a drop before its create, this flips.
+	var stillThere bool
+	legacyRow := DB.QueryRow(
+		"SELECT EXISTS(SELECT 1 FROM information_schema.tables WHERE table_schema='public' AND table_name='agent_memories')",
+	)
+	if err := legacyRow.Scan(&stillThere); err != nil {
+		t.Fatalf("check agent_memories: %v", err)
+	}
+	if stillThere {
+		t.Error("agent_memories still present at HEAD — the late drop migration did not replay in order")
+	}
+}
+
+// TestIntegration_MigrationReplay_IsIdempotent_DoubleApply guards the 045
+// crash-loop class (cp#429 / project_cp_migration_045_crashloop_idempotency_guard):
+// the runner re-checks every file on every boot, so a non-idempotent
+// migration replays fine once and FAILS on the second pass. The test runs
+// the full chain twice on one database; the second run must return no error
+// and must leave the schema_migrations row count unchanged.
+//
+// NOTE: the second run executes against the already-populated schema, so it
+// also exercises the "skip already-applied" tracking path end-to-end against
+// real PG, which the sqlmock tests only simulate.
+func TestIntegration_MigrationReplay_IsIdempotent_DoubleApply(t *testing.T) {
+	const countSQL = "SELECT COUNT(*) FROM schema_migrations"
+	DB = freshIntegrationDB(t)
+
+	// First boot: applies the whole chain onto the blank schema.
+	if err := RunMigrations(migrationsDir); err != nil {
+		t.Fatalf("first replay failed: %v", err)
+	}
+	var firstCount int
+	if err := DB.QueryRow(countSQL).Scan(&firstCount); err != nil {
+		t.Fatalf("count after first: %v", err)
+	}
+
+	// Second boot: nothing new should apply, and it must not error even
+	// though the runner re-evaluates every file (the 045 failure mode).
+	if err := RunMigrations(migrationsDir); err != nil {
+		t.Fatalf("second replay failed (non-idempotent migration / 045 crash-loop class): %v", err)
+	}
+	var secondCount int
+	if err := DB.QueryRow(countSQL).Scan(&secondCount); err != nil {
+		t.Fatalf("count after second: %v", err)
+	}
+
+	if firstCount != secondCount {
+		t.Errorf("second boot changed schema_migrations from %d to %d — re-application is not a clean no-op", firstCount, secondCount)
+	}
+}
@@ -0,0 +1,291 @@
+package db
+
+// redis_test.go — regression coverage for the workspace online-status and
+// URL-resolution Redis layer (redis.go), which previously had NO test.
+//
+// Issue #2150 (SOP rule internal#765). redis.go drives two fleet-wide
+// behaviours that break silently if a key name or TTL drifts:
+//
+//   - online detection: SetOnline / RefreshTTL / IsOnline on `ws:<id>`.
+//     A wrong key prefix or a TTL shorter than the heartbeat interval makes
+//     live workspaces flap to "unreachable — restart" (the exact failure
+//     LivenessTTL=180s was tuned to avoid). A TTL too long hides real
+//     crashes.
+//   - proxy URL resolution: CacheURL / GetCachedURL / CacheInternalURL /
+//     GetCachedInternalURL on `ws:<id>:url` and `ws:<id>:internal_url`.
+//     A2A forwarding resolves the target workspace through these keys; a
+//     prefix collision (e.g. the liveness key overlapping the URL key)
+//     would serve the wrong URL or a literal "online" string as a URL.
+//
+// These tests run against miniredis — an in-process Redis that speaks the
+// real RESP protocol and enforces real TTL/expiry semantics — so they
+// exercise the actual go-redis client calls and key/TTL behaviour, not a
+// mock that rubber-stamps them. miniredis is already a module dependency.
+//
+// Watch-fail intent: change any `ws:%s...` format string in redis.go, or
+// regress LivenessTTL below the heartbeat window, and a test here fails.
+
+import (
+	"context"
+	"testing"
+	"time"
+
+	"github.com/alicebob/miniredis/v2"
+	"github.com/redis/go-redis/v9"
+)
+
+// withMiniRedis starts an in-process Redis server, points the package-global
+// RDB at a client connected to it, and returns the server handle so tests can
+// drive the clock (FastForward) to exercise TTL expiry deterministically.
+//
+// The registered cleanup closes the client and server created by this call
+// and puts back whatever RDB held beforehand, so a later test never inherits
+// a client that has already been closed.
+func withMiniRedis(t *testing.T) *miniredis.Miniredis {
+	t.Helper()
+	mr, err := miniredis.Run()
+	if err != nil {
+		t.Fatalf("miniredis.Run: %v", err)
+	}
+	// Keep the client in a local: the cleanup must close THIS client even if
+	// RDB has been reassigned by the time it runs.
+	client := redis.NewClient(&redis.Options{Addr: mr.Addr()})
+	previous := RDB
+	RDB = client
+	t.Cleanup(func() {
+		RDB = previous
+		if err := client.Close(); err != nil {
+			// Best-effort teardown: report, but do not fail the test over it.
+			t.Logf("closing miniredis client: %v", err)
+		}
+		mr.Close()
+	})
+	return mr
+}
+
+// TestLivenessTTL_ExceedsHeartbeatWindow guards the tuned liveness TTL against
+// being shortened. The heartbeat loop fires every 30s and LivenessTTL has to
+// cover several missed beats (the comment in redis.go targets ~5) so that a
+// busy leader starved for 60-120s is not falsely declared dead. 180s = 6×30s;
+// sliding back toward the old 60s value reintroduces the false-positive
+// restart cycle.
+func TestLivenessTTL_ExceedsHeartbeatWindow(t *testing.T) {
+	const (
+		heartbeatInterval = 30 * time.Second
+		minMissedBeats    = 5
+	)
+	// Shortest TTL that still tolerates minMissedBeats missed heartbeats.
+	floor := heartbeatInterval * minMissedBeats
+	if LivenessTTL >= floor {
+		return
+	}
+	t.Errorf("LivenessTTL=%s is too short: must tolerate >=%d missed %s heartbeats (>= %s) to avoid false-positive restarts",
+		LivenessTTL, minMissedBeats, heartbeatInterval, floor)
+}
+
+// TestSetOnline_KeyAndTTL verifies SetOnline writes the canonical `ws:<id>`
+// key with the value "online" and the LivenessTTL — the exact contract
+// IsOnline and the a2a_proxy reactive check rely on.
+//
+// The assertions read the miniredis server directly (Exists / Get / TTL)
+// rather than going back through the package helpers, so the key name itself
+// is pinned and not merely round-tripped.
+func TestSetOnline_KeyAndTTL(t *testing.T) {
+	mr := withMiniRedis(t)
+	ctx := context.Background()
+	const ws = "ws-abc-123"
+
+	if err := SetOnline(ctx, ws); err != nil {
+		t.Fatalf("SetOnline: %v", err)
+	}
+
+	// Key name must be exactly ws:<id> — not, say, ws:<id>:online.
+	if !mr.Exists("ws:" + ws) {
+		t.Fatalf("expected key %q to exist; keys present: %v", "ws:"+ws, mr.Keys())
+	}
+	got, err := mr.Get("ws:" + ws)
+	if err != nil {
+		t.Fatalf("mr.Get: %v", err)
+	}
+	if got != "online" {
+		t.Errorf("liveness value = %q, want %q", got, "online")
+	}
+
+	// TTL must equal the tuned LivenessTTL. The comparison is exact (no
+	// tolerance window is applied), so it relies on miniredis reporting the
+	// TTL exactly as it was set.
+	ttl := mr.TTL("ws:" + ws)
+	if ttl != LivenessTTL {
+		t.Errorf("TTL = %s, want %s", ttl, LivenessTTL)
+	}
+}
+
+// TestIsOnline_TrueThenExpires walks a workspace through its liveness
+// lifecycle on the real TTL clock: online right after SetOnline, offline once
+// the TTL has elapsed. Online detection depends on exactly this behaviour, and
+// it is proven here against real expiry rather than asserted from a mock.
+func TestIsOnline_TrueThenExpires(t *testing.T) {
+	const ws = "ws-expiry"
+	server := withMiniRedis(t)
+	ctx := context.Background()
+
+	if err := SetOnline(ctx, ws); err != nil {
+		t.Fatalf("SetOnline: %v", err)
+	}
+
+	switch alive, err := IsOnline(ctx, ws); {
+	case err != nil:
+		t.Fatalf("IsOnline: %v", err)
+	case !alive:
+		t.Fatal("expected workspace online immediately after SetOnline")
+	}
+
+	// Step one second beyond the TTL so the liveness key has to be gone.
+	server.FastForward(LivenessTTL + time.Second)
+
+	switch alive, err := IsOnline(ctx, ws); {
+	case err != nil:
+		t.Fatalf("IsOnline after expiry: %v", err)
+	case alive:
+		t.Error("expected workspace offline after TTL elapsed")
+	}
+}
+
+// TestRefreshTTL_ExtendsLiveness shows that a heartbeat (RefreshTTL) carries a
+// workspace across what would otherwise be its expiry: the clock is moved past
+// the original deadline, yet IsOnline still reports true. Watch-fail: if
+// RefreshTTL targets the wrong key the refresh is a no-op and this fails.
+func TestRefreshTTL_ExtendsLiveness(t *testing.T) {
+	const ws = "ws-refresh"
+	server := withMiniRedis(t)
+	ctx := context.Background()
+
+	if err := SetOnline(ctx, ws); err != nil {
+		t.Fatalf("SetOnline: %v", err)
+	}
+
+	// Heartbeat with only 5s of the original TTL left.
+	server.FastForward(LivenessTTL - 5*time.Second)
+	if err := RefreshTTL(ctx, ws); err != nil {
+		t.Fatalf("RefreshTTL: %v", err)
+	}
+
+	// Another 10s puts the clock 5s beyond the ORIGINAL deadline.
+	server.FastForward(10 * time.Second)
+
+	alive, err := IsOnline(ctx, ws)
+	switch {
+	case err != nil:
+		t.Fatalf("IsOnline: %v", err)
+	case !alive:
+		t.Error("expected workspace still online after RefreshTTL heartbeat")
+	}
+}
+
+// TestIsOnline_UnknownWorkspace checks the default answer for a workspace that
+// was never marked online (never registered, or long dead): IsOnline must
+// report false and must not return an error.
+func TestIsOnline_UnknownWorkspace(t *testing.T) {
+	withMiniRedis(t)
+
+	alive, err := IsOnline(context.Background(), "never-seen")
+	switch {
+	case err != nil:
+		t.Fatalf("IsOnline: %v", err)
+	case alive:
+		t.Error("expected unknown workspace to be offline")
+	}
+}
+
+// TestURLCache_RoundTrip fixes the public-URL cache contract: the value is
+// stored under `ws:<id>:url`, expires after 5 minutes, and comes back
+// unchanged through GetCachedURL. A2A push resolves its target via this key.
+func TestURLCache_RoundTrip(t *testing.T) {
+	const (
+		ws  = "ws-url"
+		url = "https://ws-url.workspaces.moleculesai.app"
+	)
+	server := withMiniRedis(t)
+	ctx := context.Background()
+
+	if err := CacheURL(ctx, ws, url); err != nil {
+		t.Fatalf("CacheURL: %v", err)
+	}
+
+	cached, err := GetCachedURL(ctx, ws)
+	if err != nil {
+		t.Fatalf("GetCachedURL: %v", err)
+	}
+	if cached != url {
+		t.Errorf("GetCachedURL = %q, want %q", cached, url)
+	}
+
+	// Inspect the server directly so the key name and TTL are pinned.
+	key := "ws:" + ws + ":url"
+	if !server.Exists(key) {
+		t.Errorf("expected key %q; present: %v", key, server.Keys())
+	}
+	if ttl := server.TTL(key); ttl != 5*time.Minute {
+		t.Errorf("url cache TTL = %s, want 5m", ttl)
+	}
+}
+
+// TestInternalURLCache_RoundTrip fixes the internal-URL cache contract: the
+// Docker-internal address used for workspace-to-workspace discovery is stored
+// under `ws:<id>:internal_url` with a 5-minute TTL and round-trips unchanged
+// through GetCachedInternalURL.
+func TestInternalURLCache_RoundTrip(t *testing.T) {
+	const (
+		ws  = "ws-int"
+		url = "http://ws-int:8080"
+	)
+	server := withMiniRedis(t)
+	ctx := context.Background()
+
+	if err := CacheInternalURL(ctx, ws, url); err != nil {
+		t.Fatalf("CacheInternalURL: %v", err)
+	}
+
+	cached, err := GetCachedInternalURL(ctx, ws)
+	if err != nil {
+		t.Fatalf("GetCachedInternalURL: %v", err)
+	}
+	if cached != url {
+		t.Errorf("GetCachedInternalURL = %q, want %q", cached, url)
+	}
+
+	// The TTL is read straight from the server under the expected key name.
+	key := "ws:" + ws + ":internal_url"
+	if ttl := server.TTL(key); ttl != 5*time.Minute {
+		t.Errorf("internal url cache TTL = %s, want 5m", ttl)
+	}
+}
+
+// TestKeyNamespacesDoNotCollide is the prefix-collision regression. For one
+// workspace, the liveness key (ws:<id>), the URL key (ws:<id>:url) and the
+// internal-URL key (ws:<id>:internal_url) must be three DISTINCT keys. Should a
+// future edit collapse the format strings, IsOnline would read a URL as
+// liveness (or vice versa) and online detection / proxy resolution would
+// corrupt each other fleet-wide.
+func TestKeyNamespacesDoNotCollide(t *testing.T) {
+	const ws = "ws-collide"
+	server := withMiniRedis(t)
+	ctx := context.Background()
+
+	// Write all three values for the same workspace.
+	if err := SetOnline(ctx, ws); err != nil {
+		t.Fatalf("SetOnline: %v", err)
+	}
+	if err := CacheURL(ctx, ws, "https://public"); err != nil {
+		t.Fatalf("CacheURL: %v", err)
+	}
+	if err := CacheInternalURL(ctx, ws, "http://internal:8080"); err != nil {
+		t.Fatalf("CacheInternalURL: %v", err)
+	}
+
+	// A missing key reads back as "" here (the Get error is dropped on
+	// purpose), which still trips the comparison below.
+	liveness, _ := server.Get("ws:" + ws)
+	publicURL, _ := server.Get("ws:" + ws + ":url")
+	internalURL, _ := server.Get("ws:" + ws + ":internal_url")
+
+	// Liveness value must still be "online", NOT a URL.
+	if liveness != "online" {
+		t.Errorf("liveness key clobbered by a URL write: got %q", liveness)
+	}
+	if publicURL != "https://public" {
+		t.Errorf("url key = %q, want https://public", publicURL)
+	}
+	if internalURL != "http://internal:8080" {
+		t.Errorf("internal_url key = %q, want http://internal:8080", internalURL)
+	}
+}
+
+// TestClearWorkspaceKeys_RemovesAllThree checks that teardown deletes the
+// liveness, URL and internal-URL keys together. A liveness key leaked past
+// deletion would keep a dead workspace looking online; a leaked URL key would
+// let the proxy forward to a recycled address.
+func TestClearWorkspaceKeys_RemovesAllThree(t *testing.T) {
+	const ws = "ws-clear"
+	server := withMiniRedis(t)
+	ctx := context.Background()
+
+	if err := SetOnline(ctx, ws); err != nil {
+		t.Fatalf("SetOnline: %v", err)
+	}
+	if err := CacheURL(ctx, ws, "https://x"); err != nil {
+		t.Fatalf("CacheURL: %v", err)
+	}
+	if err := CacheInternalURL(ctx, ws, "http://x:8080"); err != nil {
+		t.Fatalf("CacheInternalURL: %v", err)
+	}
+
+	ClearWorkspaceKeys(ctx, ws)
+
+	// None of the three keys may remain on the server.
+	leftovers := []string{"ws:" + ws, "ws:" + ws + ":url", "ws:" + ws + ":internal_url"}
+	for i := range leftovers {
+		if server.Exists(leftovers[i]) {
+			t.Errorf("key %q survived ClearWorkspaceKeys", leftovers[i])
+		}
+	}
+
+	// And the public helper must agree that the workspace is gone.
+	switch alive, err := IsOnline(ctx, ws); {
+	case err != nil:
+		t.Fatalf("IsOnline: %v", err)
+	case alive:
+		t.Error("workspace still online after ClearWorkspaceKeys")
+	}
+}
@@ -225,6 +225,16 @@ func (e *proxyA2AError) Error() string {
 	return "proxy a2a error"
 }

+// EnqueueA2A is a method wrapper around the package-level EnqueueA2A function so
+// that *WorkspaceHandler satisfies the scheduler's A2AProxy interface. The
+// scheduler cannot call the package function directly (it would have to import
+// internal/handlers, but handlers already imports internal/scheduler → import
+// cycle), so it goes through this method on the proxy it already holds. Used by
+// the cron scheduler to durably buffer a tick when the target workspace is busy.
+//
+// Every argument is forwarded unchanged and the three results are returned
+// as-is; the receiver h is not consulted.
+func (h *WorkspaceHandler) EnqueueA2A(ctx context.Context, workspaceID, callerID string, priority int, body []byte, method, idempotencyKey string, expiresAt *time.Time) (string, int, error) {
+	return EnqueueA2A(ctx, workspaceID, callerID, priority, body, method, idempotencyKey, expiresAt)
+}
+
 // ProxyA2ARequest is the public wrapper for proxyA2ARequest, used by the
 // cron scheduler and other internal callers that need to send A2A messages
 // to workspaces programmatically (not from an HTTP handler).
@@ -97,10 +97,10 @@ type QueuedItem struct {
 // returns the new row ID + current queue depth. Caller MUST have already
 // determined the target is busy — this function does not check.
 //
-// Idempotency: when idempotencyKey is non-empty, the partial unique index
-// `idx_a2a_queue_idempotency` prevents duplicate active rows for the same
-// (workspace_id, idempotency_key). On conflict this returns the existing
-// row's ID so the caller's log still points at the live queue entry.
+// Idempotency: when idempotencyKey is non-empty, a duplicate active enqueue
+// for the same (workspace, key) is collapsed rather than double-buffered. On
+// a duplicate this returns the existing row's ID so the caller's log still
+// points at the live queue entry.
 func EnqueueA2A(
 	ctx context.Context,
 	workspaceID, callerID string,
@@ -129,6 +129,32 @@ func EnqueueA2A(
 		expiresAtArg = *expiresAt
 	}

+	// Supersede any already-expired pending row for this same key before we
+	// insert. The drain path skips expired pending rows, so such a row never
+	// completes on its own — it lingers in the active set and would block the
+	// conflict check below, silently swallowing this fresh enqueue. Retiring
+	// it here (a) frees the active set so the insert below proceeds and (b)
+	// cleans the stale row up so expired rows don't accumulate. Scoped to the
+	// idempotency key so unrelated traffic is untouched.
+	if idempotencyKey != "" {
+		if _, supErr := db.DB.ExecContext(ctx, `
+			UPDATE a2a_queue
+			SET status = 'dropped',
+			    last_error = 'superseded: expired before drain; replaced by a fresh enqueue'
+			WHERE workspace_id = $1
+			  AND idempotency_key = $2
+			  AND status = 'queued'
+			  AND expires_at IS NOT NULL
+			  AND expires_at <= now()
+		`, workspaceID, idempotencyKey); supErr != nil {
+			// Non-fatal: if the cleanup fails we still attempt the insert. Worst
+			// case the conflict path returns the (stale) existing row's id, which
+			// is the pre-fix behaviour — no new breakage introduced here.
+			log.Printf("A2AQueue: supersede-expired cleanup failed for workspace %s key %s: %v",
+				workspaceID, idempotencyKey, supErr)
+		}
+	}
+
 	// INSERT ... ON CONFLICT DO NOTHING RETURNING id. The conflict target
 	// must reference the partial unique INDEX columns + WHERE clause directly
 	// (Postgres can't reference partial unique indexes by name in
@@ -246,20 +272,6 @@ func MarkQueueItemFailed(ctx context.Context, id, errMsg string) {
 	}
 }

-// QueueDepth returns the number of currently-queued (not dispatched/completed)
-// items for a workspace. Used by the busy-return response body so callers
-// can see how many ahead of them.
-func QueueDepth(ctx context.Context, workspaceID string) int {
-	var n int
-	if err := db.DB.QueryRowContext(ctx,
-		`SELECT COUNT(*) FROM a2a_queue WHERE workspace_id = $1 AND status = 'queued'`,
-		workspaceID,
-	).Scan(&n); err != nil {
-		log.Printf("A2AQueue: QueueDepth query failed for workspace %s: %v", workspaceID, err)
-	}
-	return n
-}
-
 // DropStaleQueueItems marks queued items older than maxAge as 'dropped' with a
 // system-generated reason so PM agents stop processing stale post-incident noise.
 // Called with a workspaceID to scope cleanup to one workspace, or empty to sweep
@@ -0,0 +1,160 @@
+package handlers
+
+// a2a_queue_enqueue_expired_test.go — regression for CR3 RC 9853.
+//
+// Bug: a pending buffered tick that expires before the drain reaches it is
+// skipped by the drain (it filters out expired pending rows) yet still occupies
+// the active set the idempotency check guards. A later tick for the SAME key
+// would then collapse onto that dead row and be silently swallowed — the exact
+// drop the busy-buffer path was built to prevent.
+//
+// Fix: EnqueueA2A retires any already-expired pending row for the key BEFORE the
+// insert, so the fresh tick buffers (and the stale row is cleaned up) instead of
+// being dropped.
+//
+// These tests use the QueryMatcherEqual mock (setupTestDBForQueueTests) so the
+// SQL strings below must match the handler's queries verbatim.
+
+import (
+	"context"
+	"testing"
+	"time"
+
+	"github.com/DATA-DOG/go-sqlmock"
+)
+
+const (
+	enqWorkspaceID = "ws-enq-expired"
+	enqKey         = "sched-aaaa-bbbb" // schedule_id used as idempotency key
+	enqBody        = `{"method":"message/send"}`
+	enqMethod      = "message/send"
+)
+
+// expectSupersedeExpired registers the cleanup UPDATE EnqueueA2A issues before
+// the insert when an idempotency key is present. rowsRetired is how many expired
+// pending rows the UPDATE claims to have dropped.
+//
+// The statement text has to track the query in EnqueueA2A (this file's tests
+// use the Equal query matcher), and the expectation also pins the two bound
+// arguments: workspaceID as $1 and key as $2.
+func expectSupersedeExpired(mock sqlmock.Sqlmock, workspaceID, key string, rowsRetired int64) {
+	mock.ExpectExec(`
+			UPDATE a2a_queue
+			SET status = 'dropped',
+			    last_error = 'superseded: expired before drain; replaced by a fresh enqueue'
+			WHERE workspace_id = $1
+			  AND idempotency_key = $2
+			  AND status = 'queued'
+			  AND expires_at IS NOT NULL
+			  AND expires_at <= now()
+		`).
+		WithArgs(workspaceID, key).
+		// NewResult(lastInsertID, rowsAffected): only rows-affected is meaningful here.
+		WillReturnResult(sqlmock.NewResult(0, rowsRetired))
+}
+
+// expectInsert registers the INSERT ... ON CONFLICT DO NOTHING RETURNING id.
+// newID is the id the insert returns (non-conflict / fresh enqueue path).
+//
+// No WithArgs is attached, so the seven bound parameters are not checked by
+// this expectation — only the statement text and the returned row are.
+func expectInsert(mock sqlmock.Sqlmock, newID string) {
+	mock.ExpectQuery(`
+		INSERT INTO a2a_queue (workspace_id, caller_id, priority, body, method, idempotency_key, expires_at)
+		VALUES ($1, $2, $3, $4::jsonb, $5, $6, $7)
+		ON CONFLICT (workspace_id, idempotency_key)
+			WHERE idempotency_key IS NOT NULL AND status IN ('queued','dispatched')
+			DO NOTHING
+		RETURNING id
+	`).WillReturnRows(sqlmock.NewRows([]string{"id"}).AddRow(newID))
+}
+
+// expectDepth registers the trailing queue-depth count query: a COUNT(*) over
+// the workspace's rows with status 'queued'. The mock answers with a single
+// row holding depth, and requires workspaceID as the only bound argument.
+func expectDepth(mock sqlmock.Sqlmock, workspaceID string, depth int) {
+	mock.ExpectQuery(`
+		SELECT COUNT(*) FROM a2a_queue
+		WHERE workspace_id = $1 AND status = 'queued'
+	`).WithArgs(workspaceID).
+		WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(depth))
+}
+
+// TestEnqueueA2A_ExpiredRowDoesNotBlockFreshTick is the core CR3 regression:
+// an existing expired pending row for a schedule's key must NOT cause the next
+// tick's enqueue to be dropped. The expired row is retired first, then the
+// fresh tick inserts and returns a NEW id.
+//
+// The mock stands in for the database, so "an expired row exists" is modelled
+// purely by the supersede UPDATE reporting one affected row; what the test
+// actually pins is the statement sequence (supersede → insert → depth) and
+// that the id and depth from the mock are passed back to the caller.
+func TestEnqueueA2A_ExpiredRowDoesNotBlockFreshTick(t *testing.T) {
+	mock := setupTestDBForQueueTests(t)
+
+	// One expired pending row exists for this key and gets retired.
+	expectSupersedeExpired(mock, enqWorkspaceID, enqKey, 1)
+	// With the active set cleared, the insert proceeds (no conflict) → new id.
+	const freshID = "fresh-tick-id"
+	expectInsert(mock, freshID)
+	expectDepth(mock, enqWorkspaceID, 1)
+
+	// Expiry for the NEW row; it is in the future, so the row is not itself stale.
+	nextRun := time.Now().Add(30 * time.Second)
+	id, depth, err := EnqueueA2A(
+		context.Background(), enqWorkspaceID, "", PriorityTask,
+		[]byte(enqBody), enqMethod, enqKey, &nextRun,
+	)
+	if err != nil {
+		t.Fatalf("EnqueueA2A returned error: %v", err)
+	}
+	if id != freshID {
+		t.Errorf("expected the fresh tick to enqueue with a new id %q, got %q "+
+			"(an expired row must not swallow the new tick)", freshID, id)
+	}
+	if depth != 1 {
+		t.Errorf("expected depth 1, got %d", depth)
+	}
+	// Fails if any of the three registered statements was never issued.
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Errorf("unmet sqlmock expectations: %v", err)
+	}
+}
+
+// TestEnqueueA2A_NoExpiredRow_NormalEnqueue covers the common case where the
+// key has no expired row: the supersede UPDATE affects zero rows and the
+// enqueue carries on as usual, handing back the inserted id and the depth.
+func TestEnqueueA2A_NoExpiredRow_NormalEnqueue(t *testing.T) {
+	const newID = "new-id"
+
+	mock := setupTestDBForQueueTests(t)
+	expectSupersedeExpired(mock, enqWorkspaceID, enqKey, 0) // nothing to retire
+	expectInsert(mock, newID)
+	expectDepth(mock, enqWorkspaceID, 2)
+
+	expiry := time.Now().Add(30 * time.Second)
+	gotID, gotDepth, err := EnqueueA2A(
+		context.Background(), enqWorkspaceID, "", PriorityTask,
+		[]byte(enqBody), enqMethod, enqKey, &expiry,
+	)
+	if err != nil {
+		t.Fatalf("EnqueueA2A returned error: %v", err)
+	}
+
+	if gotID != newID {
+		t.Errorf("expected id %q, got %q", newID, gotID)
+	}
+	if gotDepth != 2 {
+		t.Errorf("expected depth 2, got %d", gotDepth)
+	}
+	if unmet := mock.ExpectationsWereMet(); unmet != nil {
+		t.Errorf("unmet sqlmock expectations: %v", unmet)
+	}
+}
+
+// TestEnqueueA2A_NoKey_SkipsSupersede covers the keyless path. Without an
+// idempotency key there is no active-set conflict to guard, so the supersede
+// cleanup must not run at all: only the insert and the depth query are
+// registered, and any extra statement is unexpected by the mock.
+func TestEnqueueA2A_NoKey_SkipsSupersede(t *testing.T) {
+	const newID = "no-key-id"
+
+	mock := setupTestDBForQueueTests(t)
+	// Deliberately no expectSupersedeExpired — it must NOT be issued when key is empty.
+	expectInsert(mock, newID)
+	expectDepth(mock, enqWorkspaceID, 1)
+
+	gotID, _, err := EnqueueA2A(
+		context.Background(), enqWorkspaceID, "", PriorityTask,
+		[]byte(enqBody), enqMethod, "", nil,
+	)
+	if err != nil {
+		t.Fatalf("EnqueueA2A returned error: %v", err)
+	}
+
+	if gotID != newID {
+		t.Errorf("expected id %q, got %q", newID, gotID)
+	}
+	if unmet := mock.ExpectationsWereMet(); unmet != nil {
+		t.Errorf("unmet sqlmock expectations: %v", unmet)
+	}
+}
@@ -309,6 +309,8 @@ func validateAgentURL(rawURL string) error {
 // (covers prod `*.moleculesai.app` and staging `*.staging.moleculesai.app`) and
 // is overridable via MOLECULE_APP_DOMAIN for other deployments.
 func isPlatformTunnelHostname(h string) bool {
+	// DNS is case-insensitive and FQDN-form hostnames may carry a trailing dot.
+	h = strings.ToLower(strings.TrimSuffix(h, "."))
 	if !strings.HasPrefix(h, "ws-") {
 		return false
 	}
@@ -316,6 +318,7 @@ func isPlatformTunnelHostname(h string) bool {
 	if domain == "" {
 		domain = "moleculesai.app"
 	}
+	domain = strings.ToLower(strings.TrimSuffix(domain, "."))
 	return strings.HasSuffix(h, "."+domain)
 }

@@ -724,6 +727,25 @@ func (h *RegistryHandler) Heartbeat(c *gin.Context) {
 		return
 	}

+	// #2421: backfill agent_card when the initial register failed and the
+	// heartbeat carries it. Only writes when NULL — never overwrites a
+	// reconciled or updated card. This is the recovery path for fast-cloud
+	// workspaces whose DNS wasn't ready at first register.
+	if len(payload.AgentCard) > 0 {
+		res, err := db.DB.ExecContext(ctx, `
+			UPDATE workspaces
+			SET agent_card = $2
+			WHERE id = $1 AND agent_card IS NULL
+		`, payload.WorkspaceID, payload.AgentCard)
+		if err != nil {
+			log.Printf("Registry heartbeat: agent_card backfill failed for %s: %v", payload.WorkspaceID, err)
+		} else {
+			if rows, _ := res.RowsAffected(); rows > 0 {
+				log.Printf("Registry heartbeat: backfilled agent_card for %s (initial register had failed)", payload.WorkspaceID)
+			}
+		}
+	}
+
 	// Refresh Redis TTL
 	if err := db.RefreshTTL(ctx, payload.WorkspaceID); err != nil {
 		log.Printf("Heartbeat redis error: %v", err)
@@ -755,6 +755,88 @@ func TestHeartbeat_SkipsRemovedRows(t *testing.T) {
 	}
 }

+// ==================== Heartbeat — agent_card backfill (#2421) ====================
+
+// TestHeartbeatHandler_BackfillsAgentCard_WhenNull drives Heartbeat with a
+// payload that carries an agent_card and expects the handler to issue the
+// backfill UPDATE, which the mock answers with one affected row (the "row was
+// NULL" case). The handler must still respond 200.
+//
+// Expectations are listed in the sequence the handler is expected to issue
+// them — presumably matched in order (sqlmock's default); confirm setupTestDB
+// does not relax ordering.
+func TestHeartbeatHandler_BackfillsAgentCard_WhenNull(t *testing.T) {
+	mock := setupTestDB(t)
+	setupTestRedis(t)
+	broadcaster := newTestBroadcaster()
+	handler := NewRegistryHandler(broadcaster)
+
+	mock.ExpectQuery("SELECT COALESCE\\(current_task").
+		WithArgs("ws-nocard").
+		WillReturnRows(sqlmock.NewRows([]string{"current_task", "monthly_spend"}).AddRow("", 0))
+
+	mock.ExpectExec("UPDATE workspaces SET").
+		WithArgs("ws-nocard", 0.0, "", 0, 0, "").
+		WillReturnResult(sqlmock.NewResult(0, 1))
+
+	// #2421: backfill agent_card when heartbeat carries it and DB row is NULL.
+	// AnyArg: the encoded card bytes are not pinned, only that one is bound.
+	mock.ExpectExec("UPDATE workspaces SET agent_card =").
+		WithArgs("ws-nocard", sqlmock.AnyArg()).
+		WillReturnResult(sqlmock.NewResult(0, 1))
+
+	mock.ExpectQuery("SELECT status FROM workspaces WHERE id =").
+		WithArgs("ws-nocard").
+		WillReturnRows(sqlmock.NewRows([]string{"status"}).AddRow(models.StatusOnline))
+
+	w := httptest.NewRecorder()
+	c, _ := gin.CreateTestContext(w)
+
+	body := `{"workspace_id":"ws-nocard","agent_card":{"name":"backfilled"}}`
+	c.Request = httptest.NewRequest("POST", "/registry/heartbeat", bytes.NewBufferString(body))
+	c.Request.Header.Set("Content-Type", "application/json")
+
+	handler.Heartbeat(c)
+
+	if w.Code != http.StatusOK {
+		t.Errorf("expected status 200, got %d: %s", w.Code, w.Body.String())
+	}
+	// Fails if the backfill UPDATE (or any other expected statement) was not issued.
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Errorf("unmet sqlmock expectations: %v", err)
+	}
+}
+
+// TestHeartbeatHandler_SkipsAgentCardBackfill_WhenAlreadySet covers a heartbeat
+// that carries an agent_card for a workspace whose card is already stored.
+//
+// The handler still issues the backfill UPDATE; the "skip" is the statement's
+// `agent_card IS NULL` guard, which this test can only simulate by having the
+// mock report 0 affected rows. The expectation pattern matches the start of
+// the statement, so the guard clause itself is not verified here. What is
+// pinned: a zero-row backfill is not treated as an error and the handler
+// still responds 200.
+func TestHeartbeatHandler_SkipsAgentCardBackfill_WhenAlreadySet(t *testing.T) {
+	mock := setupTestDB(t)
+	setupTestRedis(t)
+	broadcaster := newTestBroadcaster()
+	handler := NewRegistryHandler(broadcaster)
+
+	mock.ExpectQuery("SELECT COALESCE\\(current_task").
+		WithArgs("ws-hascard").
+		WillReturnRows(sqlmock.NewRows([]string{"current_task", "monthly_spend"}).AddRow("", 0))
+
+	mock.ExpectExec("UPDATE workspaces SET").
+		WithArgs("ws-hascard", 0.0, "", 0, 0, "").
+		WillReturnResult(sqlmock.NewResult(0, 1))
+
+	// #2421: backfill must be a no-op when agent_card already exists (0 rows affected)
+	mock.ExpectExec("UPDATE workspaces SET agent_card =").
+		WithArgs("ws-hascard", sqlmock.AnyArg()).
+		WillReturnResult(sqlmock.NewResult(0, 0))
+
+	mock.ExpectQuery("SELECT status FROM workspaces WHERE id =").
+		WithArgs("ws-hascard").
+		WillReturnRows(sqlmock.NewRows([]string{"status"}).AddRow(models.StatusOnline))
+
+	w := httptest.NewRecorder()
+	c, _ := gin.CreateTestContext(w)
+
+	body := `{"workspace_id":"ws-hascard","agent_card":{"name":"ignored"}}`
+	c.Request = httptest.NewRequest("POST", "/registry/heartbeat", bytes.NewBufferString(body))
+	c.Request.Header.Set("Content-Type", "application/json")
+
+	handler.Heartbeat(c)
+
+	if w.Code != http.StatusOK {
+		t.Errorf("expected status 200, got %d: %s", w.Code, w.Body.String())
+	}
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Errorf("unmet sqlmock expectations: %v", err)
+	}
+}
+
 // ------------------------------------------------------------
 // validateAgentURL (C6 SSRF fix)
 // ------------------------------------------------------------
@@ -895,9 +977,13 @@ func TestValidateAgentURL_PendingPlatformTunnel(t *testing.T) {
 	}{
 		{"ws-abc123.moleculesai.app", true},
 		{"ws-abc123.staging.moleculesai.app", true},
-		{"ws-abc123.evil.com", false},       // not under the platform domain
-		{"api.moleculesai.app", false},      // no ws- prefix
-		{"ws-x.fakemoleculesai.app", false}, // lookalike domain, not a subdomain
+		{"WS-ABC123.MOLECULESAI.APP", true},          // case-insensitive DNS
+		{"ws-abc123.moleculesai.app.", true},         // FQDN trailing dot
+		{"WS-ABC123.STAGING.MOLECULESAI.APP.", true}, // both case + trailing dot
+		{"ws-abc123.evil.com", false},                // not under the platform domain
+		{"api.moleculesai.app", false},               // no ws- prefix
+		{"ws-x.fakemoleculesai.app", false},          // lookalike domain, not a subdomain
+		{"ws-abc123moleculesai.app", false},          // missing dot before platform domain
 	} {
 		if got := isPlatformTunnelHostname(tc.h); got != tc.want {
 			t.Errorf("isPlatformTunnelHostname(%q)=%v want %v", tc.h, got, tc.want)
@@ -31,22 +31,88 @@ type workspaceDisplayResponse struct {
 	Status    string `json:"status,omitempty"`
 }

-var workspaceComputeInstanceAllowlist = map[string]struct{}{
-	"t3.medium":  {},
-	"t3.large":   {},
-	"t3.xlarge":  {},
-	"t3.2xlarge": {},
-	"m6i.large":  {},
-	"m6i.xlarge": {},
-	"c6i.xlarge": {},
+// workspaceComputeInstanceAllowlist maps a cloud provider name to the set of
+// instance types accepted for it (multi-provider / in-place switch). Each
+// provider accepts only its own machine sizes: an AWS t3.* is meaningless on
+// Hetzner, and vice versa.
+//
+// Mirrors the CP provider SSOT — keep in lock-step with the controlplane
+// provider configs (Hetzner ServerType cpx*/cax*, GCP MachineType e2-*, AWS
+// EC2 t3*/m6i*/c6i*). TestValidateWorkspaceCompute_Provider and
+// TestValidateWorkspaceCompute_InstanceTypePerProvider pin the sets.
+//
+// There is no "" key: an empty provider means the AWS default and must be
+// mapped to "aws" before indexing this table.
+var workspaceComputeInstanceAllowlist = map[string]map[string]struct{}{
+	"aws": {
+		"t3.medium": {}, "t3.large": {}, "t3.xlarge": {}, "t3.2xlarge": {},
+		"m6i.large": {}, "m6i.xlarge": {}, "c6i.xlarge": {},
+	},
+	"hetzner": {
+		"cpx11": {}, "cpx21": {}, "cpx31": {}, "cpx41": {}, "cpx51": {},
+		"cax11": {}, "cax21": {}, "cax31": {}, "cax41": {},
+	},
+	"gcp": {
+		"e2-small": {}, "e2-medium": {},
+		"e2-standard-2": {}, "e2-standard-4": {}, "e2-standard-8": {},
+	},
+}
+
+// normalizeCloudProvider returns p unchanged unless it is empty, in which case
+// it returns "aws". The in-place switch comparison uses this so that the
+// default ("") and an explicit "aws" count as the same cloud and do not
+// trigger a spurious switch.
+func normalizeCloudProvider(p string) string {
+	if p != "" {
+		return p
+	}
+	return "aws"
+}
+
+// instanceTypeAllowedForProvider reports whether instanceType is in the
+// allowlist for provider.
+//
+// An empty provider is treated as "aws" (via normalizeCloudProvider, so the
+// defaulting rule lives in one place). An empty instanceType is always allowed
+// (CP defaults). A provider with no allowlist entry rejects every non-empty
+// instance type.
+func instanceTypeAllowedForProvider(provider, instanceType string) bool {
+	if instanceType == "" {
+		return true
+	}
+	// An unknown provider yields a nil inner map; indexing a nil map is a safe
+	// miss, so the lookup falls through to false without a separate check.
+	_, ok := workspaceComputeInstanceAllowlist[normalizeCloudProvider(provider)][instanceType]
+	return ok
+}
+
+// workspaceComputeProviderAllowlist is the set of explicit provider names
+// accepted in compute.provider. It mirrors the controlplane cloud-provider SSOT
+// (controlplane internal/cloudprovider.Supported = {aws, hetzner, gcp}).
+//
+// ws-server lives in a different repo and cannot import that package, so this is
+// a DELIBERATE mirror; TestValidateWorkspaceCompute_Provider pins the exact set
+// and this doc-comment names the SSOT, so a CP-side change forces a matching
+// change here (and the CP itself fail-closes an unwired provider with a 422).
+//
+// "" = default (AWS) and is always accepted; it is intentionally not a key of
+// this map. This is the gate the switch-provider flow reuses to reject a bad
+// provider with a clean 400 before any CP round-trip.
+var workspaceComputeProviderAllowlist = map[string]struct{}{
+	"aws":     {},
+	"gcp":     {},
+	"hetzner": {},
 }

 func validateWorkspaceCompute(compute models.WorkspaceCompute) error {
-	if compute.InstanceType != "" {
-		if _, ok := workspaceComputeInstanceAllowlist[compute.InstanceType]; !ok {
-			return fmt.Errorf("unsupported compute.instance_type")
+	// Provider first (so the instance-type check below can be provider-scoped).
+	// "" = default (AWS). CP fail-closes an unwired provider with a 422; validating
+	// here gives a clean 400 before the round-trip and is the gate reused by the
+	// switch-provider flow. Mirrors the controlplane cloudprovider SSOT.
+	if compute.Provider != "" {
+		if _, ok := workspaceComputeProviderAllowlist[compute.Provider]; !ok {
+			return fmt.Errorf("unsupported compute.provider (want aws|gcp|hetzner)")
 		}
 	}
+	// Instance type must belong to the chosen provider (an AWS t3.* is invalid on
+	// Hetzner, etc.). Empty = CP default for the provider.
+	if !instanceTypeAllowedForProvider(compute.Provider, compute.InstanceType) {
+		prov := compute.Provider
+		if prov == "" {
+			prov = "aws"
+		}
+		return fmt.Errorf("unsupported compute.instance_type %q for provider %q", compute.InstanceType, prov)
+	}
 	if compute.Volume.RootGB != 0 {
 		if compute.Volume.RootGB < workspaceComputeDiskFloorGB || compute.Volume.RootGB > workspaceComputeDiskCeilingGB {
 			return fmt.Errorf("compute.volume.root_gb must be between %d and %d", workspaceComputeDiskFloorGB, workspaceComputeDiskCeilingGB)
@@ -36,6 +36,68 @@ func TestValidateWorkspaceCompute_RejectsUnknownInstanceType(t *testing.T) {
 	}
 }

+// TestValidateWorkspaceCompute_Provider: compute.provider must be "" (default
+// AWS) or one of the wired cloud backends. Pins the allowlist to the controlplane
+// cloudprovider SSOT (Supported = {aws, hetzner, gcp}); if it changes, update both.
+func TestValidateWorkspaceCompute_Provider(t *testing.T) {
+	for _, ok := range []string{"", "aws", "gcp", "hetzner"} {
+		c := models.WorkspaceCompute{Provider: ok}
+		if err := validateWorkspaceCompute(c); err != nil {
+			t.Errorf("provider=%q must be accepted: %v", ok, err)
+		}
+	}
+	for _, bad := range []string{"AWS", "azure", "digitalocean", "ec2", "google", "hetzner-cloud"} { // wrong case, other clouds, near-miss names
+		c := models.WorkspaceCompute{Provider: bad}
+		if err := validateWorkspaceCompute(c); err == nil {
+			t.Errorf("provider=%q must be rejected", bad)
+		}
+	}
+	// Equal length plus every wanted key present pins the exact SSOT-mirrored set.
+	want := map[string]struct{}{"aws": {}, "gcp": {}, "hetzner": {}}
+	if len(workspaceComputeProviderAllowlist) != len(want) {
+		t.Fatalf("provider allowlist drifted from SSOT {aws,gcp,hetzner}: %v", workspaceComputeProviderAllowlist)
+	}
+	for p := range want {
+		if _, ok := workspaceComputeProviderAllowlist[p]; !ok {
+			t.Fatalf("provider allowlist missing %q (SSOT drift)", p)
+		}
+	}
+}
+
+// TestValidateWorkspaceCompute_InstanceTypePerProvider: an instance type must
+// belong to the chosen provider — an AWS t3.* is meaningless on Hetzner, a cpx*
+// on AWS, etc. Pins the provider-keyed allowlist (mirrors the CP provider configs).
+func TestValidateWorkspaceCompute_InstanceTypePerProvider(t *testing.T) {
+	good := []struct{ provider, instance string }{
+		{"", "t3.medium"}, {"aws", "t3.2xlarge"}, {"aws", "c6i.xlarge"},
+		{"hetzner", "cpx31"}, {"hetzner", "cax41"},
+		{"gcp", "e2-standard-2"}, {"gcp", "e2-small"},
+		{"hetzner", ""}, {"gcp", ""}, // empty instance = CP default, always ok
+	}
+	for _, g := range good {
+		c := models.WorkspaceCompute{Provider: g.provider, InstanceType: g.instance}
+		if err := validateWorkspaceCompute(c); err != nil {
+			t.Errorf("provider=%q instance=%q must be accepted: %v", g.provider, g.instance, err)
+		}
+	}
+	bad := []struct{ provider, instance string }{
+		{"hetzner", "t3.medium"}, // AWS type on Hetzner
+		{"aws", "cpx31"},         // Hetzner type on AWS
+		{"gcp", "t3.large"},      // AWS type on GCP
+		{"hetzner", "e2-small"},  // GCP type on Hetzner
+		{"", "cpx31"},            // default(aws) + Hetzner type
+	}
+	for _, b := range bad {
+		c := models.WorkspaceCompute{Provider: b.provider, InstanceType: b.instance}
+		if err := validateWorkspaceCompute(c); err == nil {
+			t.Errorf("provider=%q instance=%q must be rejected (cross-provider instance type)", b.provider, b.instance)
+		}
+	}
+	if normalizeCloudProvider("") != "aws" || normalizeCloudProvider("hetzner") != "hetzner" { // also pins the ""→aws normalisation
+		t.Fatal("normalizeCloudProvider: \"\" must map to aws; explicit providers unchanged")
+	}
+}
+
 // internal#734: data_persistence enum. "" (auto), "persist", "ephemeral" are
 // the only accepted values; anything else is a clear 400 before the CP call.
 func TestValidateWorkspaceCompute_DataPersistence(t *testing.T) {
@@ -164,6 +164,7 @@ func (h *WorkspaceHandler) Update(c *gin.Context) {
 		}
 	}
 	var computeJSON string
+	var newComputeProvider string // hoisted: drives the cloud-provider switch detection below
 	computePatch := false
 	if rawCompute, ok := body["compute"]; ok {
 		computePatch = true
@@ -184,6 +185,7 @@ func (h *WorkspaceHandler) Update(c *gin.Context) {
 				c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
 				return
 			}
+			newComputeProvider = compute.Provider
 			encoded, err := workspaceComputeJSON(compute)
 			if err != nil {
 				log.Printf("Update compute encode error for %s: %v", id, err)
@@ -262,6 +264,55 @@ func (h *WorkspaceHandler) Update(c *gin.Context) {
 		needsRestart = true
 	}
 	if computePatch {
+		// Cloud-provider SWITCH (in-place): if the incoming provider differs from
+		// the one currently stored, the existing box lives on the OLD cloud. We
+		// MUST deprovision it on the OLD provider BEFORE overwriting compute —
+		// otherwise the subsequent "Save & Restart" restart's provider-aware
+		// deprovision (cpProv.Stop → resolveProvider reads compute->>'provider')
+		// would target the NEW cloud and ORPHAN the old box (a silently-billing
+		// leak). Cloud mode only (the local Docker provisioner has no cross-cloud
+		// concept; provider stays "" there so this never fires). After this, the
+		// canvas's restart provisions the box on the new cloud; its own Stop is a
+		// safe no-op (the box is already gone).
+		if h.cpProv != nil {
+			var oldProvider sql.NullString
+			err := db.DB.QueryRowContext(ctx, `SELECT compute->>'provider' FROM workspaces WHERE id = $1`, id).Scan(&oldProvider)
+			// FAIL-CLOSED on the read. The earlier `err == nil` gate was fail-OPEN:
+			// a transient/unexpected DB error here skipped the whole switch block and
+			// fell through to the compute UPDATE — so during a real switch the later
+			// provider-aware restart deprovision would target the NEW cloud and ORPHAN
+			// the old box (silent billing, unrecoverable). We cannot tell whether this
+			// is a cross-cloud switch without the old provider, so on any error other
+			// than "no such row" we abort exactly like a failed deprovision: compute
+			// untouched, old box still recoverable, user retries. (sql.ErrNoRows means
+			// there is genuinely no prior box — nothing to orphan — so it's safe to
+			// skip the switch and let the UPDATE proceed.)
+			if err != nil && !errors.Is(err, sql.ErrNoRows) {
+				log.Printf("Update: provider-switch precheck for %s ABORTED — could not read current cloud provider (provider left unchanged): %v", id, err)
+				c.JSON(http.StatusBadGateway, gin.H{"error": "could not read the current cloud provider; provider unchanged — please retry"})
+				return
+			}
+			if err == nil && normalizeCloudProvider(oldProvider.String) != normalizeCloudProvider(newComputeProvider) {
+				log.Printf("Update: cloud-provider switch for %s: %q -> %q; deprovisioning old box on old provider before overwriting compute",
+					id, normalizeCloudProvider(oldProvider.String), normalizeCloudProvider(newComputeProvider))
+				// Use the ERROR-returning variant and ABORT before overwriting
+				// compute if the old-box deprovision fails. If we proceeded, the
+				// old box would keep running on the OLD cloud while the row now
+				// records the NEW provider+instance — stranding it with no DB
+				// pointer (an UNRECOVERABLE cross-cloud orphan that no reconciler
+				// can map back). Aborting leaves the row pointing at the
+				// still-recoverable old box; the user can retry the switch. (The
+				// restart paths' void cpStopWithRetry is fine there because the
+				// box stays on the SAME cloud, so the provider record is unchanged
+				// and a provider-scoped sweep can still find it.)
+				if err := h.cpStopWithRetryErr(ctx, id, "provider-switch", false); err != nil {
+					log.Printf("Update: provider-switch for %s ABORTED — could not deprovision old box on %q (provider left unchanged, old box recoverable): %v",
+						id, normalizeCloudProvider(oldProvider.String), err)
+					c.JSON(http.StatusBadGateway, gin.H{"error": "could not deprovision the current cloud box; provider unchanged — please retry"})
+					return
+				}
+			}
+		}
 		if _, err := db.DB.ExecContext(ctx, `UPDATE workspaces SET compute = $2::jsonb, updated_at = now() WHERE id = $1`, id, computeJSON); err != nil {
 			log.Printf("Update compute error for %s: %v", id, err)
 			c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to save compute config"})
@@ -0,0 +1,180 @@
+package handlers
+
+// workspace_provider_switch_test.go — deterministic coverage for the in-place
+// cloud-provider switch in the Update (PATCH /workspaces/:id) handler.
+//
+// The switch is DESTRUCTIVE (it recreates the box on a new cloud) and its
+// safety hinges on ORDER + ABORT, which these tests pin without touching a real
+// cloud (sqlmock DB + the scriptedCPStop fake from workspace_restart_stop_retry_test):
+//
+//   1. On a provider change, the OLD box is deprovisioned (cpProv.Stop) BEFORE
+//      the compute row is overwritten — otherwise the later restart's
+//      provider-aware deprovision would target the NEW cloud and ORPHAN the old
+//      (still-billing) box. The sqlmock query ORDER pins "read old provider →
+//      [Stop] → UPDATE compute".
+//   2. If the old-box deprovision FAILS, the handler ABORTS (502) and does NOT
+//      overwrite compute — leaving the row pointed at the recoverable old box
+//      (an unexpected UPDATE would fail sqlmock's expectations).
+//   3. A non-switch compute edit (same provider) does NOT deprovision anything.
+//   4. If the old-provider READ errors (transient DB fault, not sql.ErrNoRows),
+//      the handler FAILS CLOSED: aborts (502), deprovisions nothing, and does NOT
+//      overwrite compute — closing the fail-open read path that would otherwise
+//      orphan the old box on a real switch (security review RC 9895).
+
+import (
+	"bytes"
+	"fmt"
+	"net/http"
+	"net/http/httptest"
+	"testing"
+
+	"github.com/DATA-DOG/go-sqlmock"
+	"github.com/gin-gonic/gin"
+)
+
+func newPatchContext(t *testing.T, id, body string) (*gin.Context, *httptest.ResponseRecorder) { // builds a gin test context carrying a PATCH /workspaces/<id> request
+	t.Helper()
+	w := httptest.NewRecorder()
+	c, _ := gin.CreateTestContext(w)
+	c.Params = gin.Params{{Key: "id", Value: id}} // set by hand: no router runs here to parse the path
+	req := httptest.NewRequest("PATCH", "/workspaces/"+id, bytes.NewBufferString(body))
+	req.Header.Set("Content-Type", "application/json")
+	c.Request = req
+	return c, w // recorder is returned so the caller can inspect status and body
+}
+
+const switchTestWSID = "cccccccc-0001-0000-0000-000000000000" // fixed workspace id shared by the provider-switch tests
+
+func newSwitchTestHandler(t *testing.T, cp *scriptedCPStop) *WorkspaceHandler {
+	t.Helper()
+	h := NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir())
+	h.cpProv = cp // Update only runs its provider-switch check when cpProv is non-nil
+	return h
+}
+
+// Case 1 — aws → hetzner switch: the OLD box is deprovisioned, THEN compute is overwritten (200).
+func TestWorkspaceUpdate_ProviderSwitch_DeprovisionsOldBeforeUpdate(t *testing.T) {
+	mock := setupTestDB(t)
+	cp := &scriptedCPStop{} // zero value: Stop succeeds
+	h := newSwitchTestHandler(t, cp)
+
+	// Ordered expectations: EXISTS → read OLD provider (aws) → UPDATE compute.
+	// sqlmock pins only the DB order; the cpProv.Stop deprovision sits (in code)
+	// between the provider read and the UPDATE and is counted via cp.calls below.
+	mock.ExpectQuery("SELECT EXISTS").WithArgs(switchTestWSID).
+		WillReturnRows(sqlmock.NewRows([]string{"exists"}).AddRow(true))
+	mock.ExpectQuery("compute->>'provider'").WithArgs(switchTestWSID).
+		WillReturnRows(sqlmock.NewRows([]string{"provider"}).AddRow("aws"))
+	mock.ExpectExec("UPDATE workspaces SET compute").
+		WillReturnResult(sqlmock.NewResult(0, 1))
+
+	c, w := newPatchContext(t, switchTestWSID,
+		`{"compute":{"instance_type":"cpx31","provider":"hetzner","volume":{"root_gb":30}}}`)
+	h.Update(c)
+
+	if w.Code != http.StatusOK {
+		t.Fatalf("expected 200 on a successful switch, got %d: %s", w.Code, w.Body.String())
+	}
+	if cp.calls != 1 {
+		t.Fatalf("expected the OLD box to be deprovisioned exactly once on a provider switch; got %d Stop calls", cp.calls)
+	}
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Fatalf("unmet/unexpected DB queries (ordering broken?): %v", err)
+	}
+}
+
+// Case 2 — deprovision FAILS → abort (502) + compute NOT overwritten (no UPDATE).
+func TestWorkspaceUpdate_ProviderSwitch_AbortsWhenDeprovisionFails(t *testing.T) {
+	shrinkRetryBackoff(t) // keep the test fast: don't burn the 1s/2s/4s retry backoff
+	mock := setupTestDB(t)
+	// Every scripted Stop attempt fails → cpStopWithRetryErr returns an error → abort.
+	cp := &scriptedCPStop{errs: []error{
+		fmt.Errorf("cp 503"), fmt.Errorf("cp 503"), fmt.Errorf("cp 503"),
+	}}
+	h := newSwitchTestHandler(t, cp)
+
+	mock.ExpectQuery("SELECT EXISTS").WithArgs(switchTestWSID).
+		WillReturnRows(sqlmock.NewRows([]string{"exists"}).AddRow(true))
+	mock.ExpectQuery("compute->>'provider'").WithArgs(switchTestWSID).
+		WillReturnRows(sqlmock.NewRows([]string{"provider"}).AddRow("aws"))
+	// NO UPDATE expectation: if the handler overwrote compute after a failed
+	// deprovision (the orphan bug), sqlmock would reject the unexpected Exec.
+
+	c, w := newPatchContext(t, switchTestWSID,
+		`{"compute":{"instance_type":"cpx31","provider":"hetzner","volume":{"root_gb":30}}}`)
+	h.Update(c)
+
+	if w.Code != http.StatusBadGateway {
+		t.Fatalf("expected 502 when the old-box deprovision fails, got %d: %s", w.Code, w.Body.String())
+	}
+	if cp.calls == 0 {
+		t.Fatal("expected at least one Stop attempt before aborting")
+	}
+	if err := mock.ExpectationsWereMet(); err != nil {
+		// Guards the two read expectations. NOTE(review): sqlmock appears to reject an
+		// unexpected UPDATE at Exec time, so the 502 check above would catch it — confirm.
+		t.Fatalf("compute must NOT be overwritten when deprovision fails (orphan-prevention): %v", err)
+	}
+}
+
+// Case 3 — same provider (no switch): no deprovision; compute is updated normally (200).
+func TestWorkspaceUpdate_NoProviderSwitch_DoesNotDeprovision(t *testing.T) {
+	mock := setupTestDB(t)
+	cp := &scriptedCPStop{}
+	h := newSwitchTestHandler(t, cp)
+
+	mock.ExpectQuery("SELECT EXISTS").WithArgs(switchTestWSID).
+		WillReturnRows(sqlmock.NewRows([]string{"exists"}).AddRow(true))
+	mock.ExpectQuery("compute->>'provider'").WithArgs(switchTestWSID).
+		WillReturnRows(sqlmock.NewRows([]string{"provider"}).AddRow("aws"))
+	mock.ExpectExec("UPDATE workspaces SET compute").
+		WillReturnResult(sqlmock.NewResult(0, 1))
+
+	// Stored and requested provider are both aws (only instance type and disk change) → no Stop.
+	c, w := newPatchContext(t, switchTestWSID,
+		`{"compute":{"instance_type":"t3.large","provider":"aws","volume":{"root_gb":60}}}`)
+	h.Update(c)
+
+	if w.Code != http.StatusOK {
+		t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String())
+	}
+	if cp.calls != 0 {
+		t.Fatalf("a non-switching compute edit must NOT deprovision the box; got %d Stop calls", cp.calls)
+	}
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Fatalf("unmet/unexpected DB queries: %v", err)
+	}
+}
+
+// 4. Provider READ errors (transient DB fault) → fail-CLOSED: abort 502, no
+//    deprovision, no compute overwrite. A fail-open read (the old `err == nil`
+//    gate) would skip switch detection and overwrite compute → orphan the old
+//    cloud box. sqlmock has NO UPDATE expectation (an overwrite trips it) and
+//    the cp.calls check below catches a stray deprovision.
+func TestWorkspaceUpdate_ProviderSwitch_AbortsOnProviderReadError(t *testing.T) {
+	mock := setupTestDB(t)
+	cp := &scriptedCPStop{}
+	h := newSwitchTestHandler(t, cp)
+
+	mock.ExpectQuery("SELECT EXISTS").WithArgs(switchTestWSID).
+		WillReturnRows(sqlmock.NewRows([]string{"exists"}).AddRow(true))
+	// The old-provider read fails with a generic error (NOT sql.ErrNoRows).
+	mock.ExpectQuery("compute->>'provider'").WithArgs(switchTestWSID).
+		WillReturnError(fmt.Errorf("connection reset by peer"))
+
+	c, w := newPatchContext(t, switchTestWSID,
+		`{"compute":{"instance_type":"cpx31","provider":"hetzner","volume":{"root_gb":30}}}`)
+	h.Update(c)
+
+	if w.Code != http.StatusBadGateway {
+		t.Fatalf("expected 502 when the provider read fails (fail-closed), got %d: %s", w.Code, w.Body.String())
+	}
+	if cp.calls != 0 {
+		t.Fatalf("must NOT deprovision when the current provider can't be read; got %d Stop calls", cp.calls)
+	}
+	if err := mock.ExpectationsWereMet(); err != nil {
+		// Guards the two read expectations. NOTE(review): sqlmock appears to reject an
+		// unexpected UPDATE at Exec time, so the 502 check above would catch it — confirm.
+		t.Fatalf("compute must NOT be overwritten on a provider read error (fail-closed): %v", err)
+	}
+}
@@ -143,6 +143,12 @@ type HeartbeatPayload struct {
 	// false declared explicitly". Lets the platform distinguish "adapter
 	// said no native ownership" from "old runtime version, didn't say".
 	RuntimeMetadata *RuntimeMetadata `json:"runtime_metadata,omitempty"`
+
+	// AgentCard is sent by the runtime on heartbeat when the initial
+	// /registry/register failed and the workspace has no persisted agent_card.
+	// The heartbeat handler backfills NULL agent_card rows so the workspace
+	// can come online without requiring a full re-register. (#2421)
+	AgentCard json.RawMessage `json:"agent_card,omitempty"`
 }

 // RuntimeMetadata is the adapter-declared capability + override block
@@ -34,6 +34,11 @@ const (
 	// fireSchedule goroutine indefinitely, which blocked wg.Wait() in
 	// tick(), which stalled the entire scheduler until operator restart.
 	dbQueryTimeout = 10 * time.Second
+	// priorityTask mirrors handlers.PriorityTask (50) — the default FIFO A2A
+	// queue priority. Duplicated as a local const because the scheduler cannot
+	// import internal/handlers (handlers imports scheduler → cycle). Buffered
+	// cron ticks enqueue at the same priority as normal busy-retry A2A work.
+	priorityTask = 50
 )

 // sanitizeUTF8 replaces invalid UTF-8 byte sequences with the Unicode
@@ -48,9 +53,14 @@ func sanitizeUTF8(s string) string {
 }

 // A2AProxy is the interface the scheduler needs to send messages to workspaces.
-// WorkspaceHandler.ProxyA2ARequest satisfies this.
+// WorkspaceHandler.ProxyA2ARequest + WorkspaceHandler.EnqueueA2A satisfy this.
 type A2AProxy interface {
 	ProxyA2ARequest(ctx context.Context, workspaceID string, body []byte, callerID string, logActivity bool) (int, []byte, error)
+	// EnqueueA2A durably buffers body for a busy workspace; the drain
+	// dispatches it serially once the agent frees. idempotencyKey collapses
+	// duplicate pending entries per (workspace,key); nil expiresAt = no TTL.
+	// Returns the buffered entry id, the resulting pending depth, and any error.
+	EnqueueA2A(ctx context.Context, workspaceID, callerID string, priority int, body []byte, method, idempotencyKey string, expiresAt *time.Time) (string, int, error)
 }

 // Broadcaster records events and pushes them to WebSocket clients.
@@ -367,33 +377,6 @@ func (s *Scheduler) fireSchedule(ctx context.Context, sched scheduleRow) {
 		sched.WorkspaceID,
 	).Scan(&activeTasks, &maxConcurrent)
 	capCancel()
-	if capErr == nil && activeTasks >= maxConcurrent {
-		log.Printf("Scheduler: '%s' workspace %s at capacity (active_tasks=%d, max=%d), deferring up to 2 min",
-			sched.Name, short(sched.WorkspaceID, 12), activeTasks, maxConcurrent)
-		// Poll every 10s for up to 2 minutes
-		waited := false
-		for i := 0; i < 12; i++ {
-			time.Sleep(10 * time.Second)
-			pollCtx, pollCancel := context.WithTimeout(ctx, dbQueryTimeout)
-			err := db.DB.QueryRowContext(pollCtx,
-				`SELECT COALESCE(active_tasks, 0), COALESCE(max_concurrent_tasks, 1) FROM workspaces WHERE id = $1`,
-				sched.WorkspaceID,
-			).Scan(&activeTasks, &maxConcurrent)
-			pollCancel()
-			if err != nil || activeTasks < maxConcurrent {
-				waited = true
-				break
-			}
-		}
-		if !waited && activeTasks >= maxConcurrent {
-			log.Printf("Scheduler: skipping '%s' on busy workspace %s after 2 min wait (active_tasks=%d, max=%d)",
-				sched.Name, short(sched.WorkspaceID, 12), activeTasks, maxConcurrent)
-			s.recordSkipped(ctx, sched, activeTasks)
-			return
-		}
-		log.Printf("Scheduler: '%s' workspace %s has capacity after deferral, firing",
-			sched.Name, short(sched.WorkspaceID, 12))
-	}

 	fireCtx, cancel := context.WithTimeout(ctx, fireTimeout)
 	defer cancel()
@@ -402,6 +385,9 @@ func (s *Scheduler) fireSchedule(ctx context.Context, sched scheduleRow) {
 	// The agent sees recent peer messages before acting, enabling cross-agent
 	// awareness without explicit A2A delegation. Best-effort — if the fetch
 	// fails or the workspace has no Slack channels, the prompt is unchanged.
+	//
+	// Built BEFORE the capacity check so the busy-enqueue path below buffers
+	// the exact same A2A message the fire path would have dispatched.
 	prompt := sched.Prompt
 	if s.channels != nil {
 		if channelCtx := s.channels.FetchWorkspaceChannelContext(fireCtx, sched.WorkspaceID); channelCtx != "" {
@@ -426,6 +412,49 @@ func (s *Scheduler) fireSchedule(ctx context.Context, sched scheduleRow) {
 		return
 	}

+	// #969 → durable buffering. When the target workspace is busy
+	// (active_tasks >= max_concurrent_tasks) we do NOT skip the tick and we do
+	// NOT block the scheduler goroutine waiting for capacity. Instead we durably
+	// buffer the cron message, mirroring how busy A2A dispatches already buffer.
+	// The drain then dispatches it serially the moment the agent frees —
+	// execution stays one-at-a-time; max_concurrent_tasks is unchanged.
+	//
+	// This supersedes the previous "poll then recordSkipped" behavior, which
+	// dropped scheduled ticks on workspaces that stayed busy across the whole
+	// poll window.
+	//
+	// Idempotency key = sched.ID (the SCHEDULE id), NOT msgID/a random uuid.
+	// Keying by schedule_id means a busy agent buffers AT MOST ONE pending tick
+	// per schedule — the latest one wins, the obsolete older tick is collapsed —
+	// so we hold the next tick instead of stacking a stale backlog.
+	if capErr == nil && activeTasks >= maxConcurrent {
+		// Buffered ticks expire at the next scheduled fire: a tick that's been
+		// sitting in the queue past when the cron would naturally tick again is
+		// stale, so let it expire rather than fire late. Best-effort — on a bad
+		// cron expr we enqueue with no TTL (NULL) rather than block the tick.
+		var expiresAt *time.Time
+		if nextRun, nrErr := ComputeNextRun(sched.CronExpr, sched.Timezone, time.Now()); nrErr == nil {
+			expiresAt = &nextRun
+		}
+		enqCtx, enqCancel := context.WithTimeout(ctx, dbQueryTimeout)
+		// Empty callerID = canvas-style (source_id NULL), matching the fire path.
+		qID, depth, enqErr := s.proxy.EnqueueA2A(enqCtx, sched.WorkspaceID, "", priorityTask, a2aBody, "message/send", sched.ID, expiresAt)
+		enqCancel()
+		if enqErr != nil {
+			// Enqueue failed — fall back to recording a skip so the liveness
+			// view still advances and the operator sees the error, rather than
+			// silently dropping the tick or firing into a busy agent.
+			log.Printf("Scheduler: '%s' enqueue on busy workspace %s failed, recording skip: %v",
+				sched.Name, short(sched.WorkspaceID, 12), enqErr)
+			s.recordSkipped(ctx, sched, activeTasks)
+			return
+		}
+		log.Printf("Scheduler: '%s' workspace %s busy (active_tasks=%d, max=%d) — enqueued tick %s (queue depth=%d), will drain when idle",
+			sched.Name, short(sched.WorkspaceID, 12), activeTasks, maxConcurrent, short(qID, 8), depth)
+		s.recordQueued(ctx, sched, activeTasks, qID, depth)
+		return
+	}
+
 	log.Printf("Scheduler: firing '%s' → workspace %s", sched.Name, short(sched.WorkspaceID, 12))

 	// Empty callerID = canvas-style request (bypasses access control, source_id=NULL in activity log).
@@ -727,6 +756,74 @@ func (s *Scheduler) recordSkipped(ctx context.Context, sched scheduleRow, active
 	}
 }

+// recordQueued persists a tick that was durably buffered instead of fired because
+// the target workspace was busy: it advances next_run_at, sets last_status='queued',
+// logs a cron_run activity entry with status='queued', and broadcasts if a
+// broadcaster is set. Mirrors recordSkipped (#115) but records a buffer, not a
+// drop: the drain will dispatch queueID when the agent frees. next_run_at still
+// advances so the liveness view keeps ticking and the NEXT cron slot enqueues (the
+// schedule_id idempotency key then holds at most one pending tick — the latest).
+func (s *Scheduler) recordQueued(ctx context.Context, sched scheduleRow, activeTasks int, queueID string, depth int) {
+	reason := fmt.Sprintf("queued: workspace busy (active_tasks=%d), buffered (id=%s, depth=%d)", activeTasks, short(queueID, 8), depth)
+
+	nextRun, nextErr := ComputeNextRun(sched.CronExpr, sched.Timezone, time.Now())
+	var nextRunPtr *time.Time
+	if nextErr == nil {
+		nextRunPtr = &nextRun
+	} else {
+		// Same guard as recordSkipped/fireSchedule — nextRunPtr stays nil so the
+		// COALESCE below keeps the existing next_run_at instead of writing NULL.
+		log.Printf("Scheduler: ComputeNextRun error in recordQueued for '%s' (%s) — preserving existing next_run_at: %v",
+			sched.Name, sched.ID, nextErr)
+	}
+
+	queuedUpdCtx, queuedUpdCancel := context.WithTimeout(context.Background(), dbQueryTimeout) // detached from ctx; bounded by dbQueryTimeout
+	if _, err := db.DB.ExecContext(queuedUpdCtx, `
+		UPDATE workspace_schedules
+		SET last_run_at = now(),
+		    next_run_at = COALESCE($2, next_run_at),
+		    run_count = run_count + 1,
+		    last_status = 'queued',
+		    last_error = $3,
+		    updated_at = now()
+		WHERE id = $1
+	`, sched.ID, nextRunPtr, sanitizeUTF8(reason)); err != nil {
+		log.Printf("Scheduler: '%s' queued update failed: %v", sched.Name, err)
+	}
+	queuedUpdCancel()
+
+	cronMeta, marshalErr := json.Marshal(map[string]interface{}{
+		"schedule_id":   sched.ID,
+		"schedule_name": sched.Name,
+		"cron_expr":     sched.CronExpr,
+		"queued":        true,
+		"active_tasks":  activeTasks,
+		"queue_id":      queueID,
+		"queue_depth":   depth,
+	})
+	if marshalErr != nil {
+		log.Printf("Scheduler '%s': json.Marshal cronMeta(queued) failed: %v", sched.Name, marshalErr)
+	} else {
+		queuedInsCtx, queuedInsCancel := context.WithTimeout(context.Background(), dbQueryTimeout)
+		if _, err := db.DB.ExecContext(queuedInsCtx, `
+			INSERT INTO activity_logs (workspace_id, activity_type, source_id, method, summary, request_body, status, error_detail, created_at)
+			VALUES ($1, 'cron_run', NULL, 'cron', $2, $3::jsonb, 'queued', $4, now())
+		`, sched.WorkspaceID, sanitizeUTF8("Cron queued (busy): "+sched.Name), string(cronMeta), sanitizeUTF8(reason)); err != nil {
+			log.Printf("Scheduler: '%s' queued activity log failed: %v", sched.Name, err)
+		}
+		queuedInsCancel()
+	}
+
+	if s.broadcaster != nil {
+		_ = s.broadcaster.RecordAndBroadcast(ctx, string(events.EventCronSkipped), sched.WorkspaceID, map[string]interface{}{ // NOTE(review): reuses the skipped event type; only "queued": true marks it as a buffer — confirm intended
+			"schedule_id":   sched.ID,
+			"schedule_name": sched.Name,
+			"reason":        reason,
+			"queued":        true,
+		})
+	}
+}
+
 // repairNullNextRunAt is called once during Start() to recompute next_run_at
 // for any enabled schedule where it is NULL — a state left by the pre-#722 bug
 // where a ComputeNextRun error caused an UPDATE that wrote NULL.
@@ -73,6 +73,14 @@ type recordingProxy struct {
 	lastCaller  string
 	lastLogFlag bool
 	lastWSID    string
+
+	// enqueue tracking — the busy path calls EnqueueA2A instead of firing.
+	enqueues    int
+	lastEnqBody []byte
+	lastEnqKey  string
+	enqQueueID  string
+	enqDepth    int
+	enqErr      error
 }

 func (p *recordingProxy) ProxyA2ARequest(
@@ -89,6 +97,25 @@ func (p *recordingProxy) ProxyA2ARequest(
 	return p.status, p.body, nil
 }

+// EnqueueA2A records each busy-path enqueue (count, workspace, caller, body, key)
+// and returns the scripted enqErr, or enqQueueID (default "q-rec-1") with enqDepth.
+func (p *recordingProxy) EnqueueA2A(
+	_ context.Context, workspaceID, callerID string, _ int, body []byte, _ string, idempotencyKey string, _ *time.Time,
+) (string, int, error) {
+	p.enqueues++
+	p.lastWSID = workspaceID
+	p.lastCaller = callerID
+	p.lastEnqBody = body
+	p.lastEnqKey = idempotencyKey
+	if p.enqErr != nil {
+		return "", 0, p.enqErr
+	}
+	if p.enqQueueID == "" { // test did not script an id: fall back to a fixed one
+		p.enqQueueID = "q-rec-1"
+	}
+	return p.enqQueueID, p.enqDepth, nil
+}
+
 // ── connection + fixture helpers ──────────────────────────────────────────

 // integrationDB returns the configured integration-test connection or skips
@@ -42,6 +42,13 @@ func (p *panicProxy) ProxyA2ARequest(
 	panic("simulated A2A proxy panic")
 }

+// EnqueueA2A satisfies the extended A2AProxy interface; it ignores every argument and always panics, like the fire path.
+func (p *panicProxy) EnqueueA2A(
+	_ context.Context, _ string, _ string, _ int, _ []byte, _ string, _ string, _ *time.Time,
+) (string, int, error) {
+	panic("simulated A2A enqueue panic")
+}
+
 // ── TestLastTickAt_zero ───────────────────────────────────────────────────────

 // TestLastTickAt_zero confirms that LastTickAt returns a zero time.Time on a
@@ -210,6 +217,90 @@ func TestShort_helper(t *testing.T) {
 }

 // ── TestRecordSkipped_writesSkippedStatus ────────────────────────────────────
+// ── busyEnqueueProxy + TestFireSchedule_BusyEnqueuesInsteadOfSkipping ──────────
+//
+// Replaces the old "busy → skip after 2 min" assertion. When the workspace is
+// at capacity, fireSchedule must ENQUEUE the tick into the durable a2a_queue
+// (keyed by schedule_id) and record last_status='queued' — NOT fire and NOT
+// recordSkipped. Proves the scheduled-tick-starvation fix.
+
+type busyEnqueueProxy struct {
+	fired       int    // number of ProxyA2ARequest calls
+	enqueued    int    // number of EnqueueA2A calls
+	enqKey      string // idempotencyKey of the most recent EnqueueA2A call
+	enqMethod   string // method of the most recent EnqueueA2A call
+	enqPriority int    // priority of the most recent EnqueueA2A call
+}
+
+func (p *busyEnqueueProxy) ProxyA2ARequest(
+	_ context.Context, _ string, _ []byte, _ string, _ bool,
+) (int, []byte, error) {
+	p.fired++ // counted so the test can assert the busy path never fires
+	return 200, []byte(`{"ok":true}`), nil
+}
+
+func (p *busyEnqueueProxy) EnqueueA2A(
+	_ context.Context, _ string, _ string, priority int, _ []byte, method, idempotencyKey string, _ *time.Time,
+) (string, int, error) {
+	p.enqueued++
+	p.enqKey = idempotencyKey
+	p.enqMethod = method
+	p.enqPriority = priority
+	return "q-busy-1", 1, nil // fixed queue id and depth; the enqueue always succeeds
+}
+
+func TestFireSchedule_BusyEnqueuesInsteadOfSkipping(t *testing.T) {
+	mock := setupTestDB(t)
+
+	sched := scheduleRow{
+		ID:          "77777777-dead-beef-0000-000000000007",
+		WorkspaceID: "88888888-dead-beef-0000-000000000008",
+		Name:        "busy-enqueue-job",
+		CronExpr:    "*/5 * * * *",
+		Timezone:    "UTC",
+		Prompt:      "tick while busy",
+	}
+
+	// Capacity check returns active_tasks=2, max_concurrent=1 → workspace is busy.
+	mock.ExpectQuery(`SELECT COALESCE`).
+		WillReturnRows(sqlmock.NewRows([]string{"active_tasks", "max"}).AddRow(2, 1))
+
+	// recordQueued UPDATE — binds ($1=sched.ID, $2=nextRunPtr, $3=reason);
+	// last_status='queued' is a SQL literal, so it cannot be matched as an arg.
+	mock.ExpectExec(`UPDATE workspace_schedules`).
+		WithArgs(sched.ID, sqlmock.AnyArg(), sqlmock.AnyArg()).
+		WillReturnResult(sqlmock.NewResult(0, 1))
+
+	// recordQueued activity_logs INSERT — binds 4 args (workspace_id, summary,
+	// request_body, error_detail); status='queued' is likewise a SQL literal.
+	mock.ExpectExec(`INSERT INTO activity_logs`).
+		WithArgs(sched.WorkspaceID, sqlmock.AnyArg(), sqlmock.AnyArg(), sqlmock.AnyArg()).
+		WillReturnResult(sqlmock.NewResult(0, 1))
+
+	proxy := &busyEnqueueProxy{}
+	s := New(proxy, nil)
+	s.fireSchedule(context.Background(), sched)
+
+	if proxy.fired != 0 {
+		t.Errorf("busy workspace: ProxyA2ARequest must NOT fire, got %d fires", proxy.fired)
+	}
+	if proxy.enqueued != 1 {
+		t.Fatalf("busy workspace: expected exactly 1 EnqueueA2A, got %d", proxy.enqueued)
+	}
+	if proxy.enqKey != sched.ID {
+		t.Errorf("idempotency key must be schedule_id %q (buffer-latest dedup), got %q", sched.ID, proxy.enqKey)
+	}
+	if proxy.enqMethod != "message/send" {
+		t.Errorf("enqueued method = %q, want \"message/send\"", proxy.enqMethod)
+	}
+	if proxy.enqPriority != priorityTask {
+		t.Errorf("enqueued priority = %d, want priorityTask(%d)", proxy.enqPriority, priorityTask)
+	}
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Errorf("unmet DB expectations — busy tick not recorded as queued: %v", err)
+	}
+}
+
 // #115 coverage gap: the recordSkipped path wasn't tested at all when it
 // first landed. Exercises the UPDATE workspace_schedules + INSERT into
 // activity_logs via sqlmock. Broadcaster is nil so we don't need to stub
@@ -257,6 +348,13 @@ func (p *successProxy) ProxyA2ARequest(
 	return 200, []byte(`{"ok":true}`), nil
 }

+// EnqueueA2A satisfies the extended A2AProxy interface; it ignores its arguments and returns a fixed id with depth 1.
+func (p *successProxy) EnqueueA2A(
+	_ context.Context, _ string, _ string, _ int, _ []byte, _ string, _ string, _ *time.Time,
+) (string, int, error) {
+	return "q-success", 1, nil
+}
+
 // ── adapterErrorProxy ─────────────────────────────────────────────────────────

 // adapterErrorProxy is a test double whose ProxyA2ARequest returns HTTP 200
@@ -270,6 +368,13 @@ func (p *adapterErrorProxy) ProxyA2ARequest(
 	return 200, []byte(`{"jsonrpc":"2.0","id":"cron-test-123","error":{"code":-32603,"message":"adapter SDK internal error"}}`), nil
 }

+// EnqueueA2A satisfies the extended A2AProxy interface; this stub ignores every argument and always returns ("q-adaptererr", 1, nil).
+func (p *adapterErrorProxy) EnqueueA2A(
+	_ context.Context, _ string, _ string, _ int, _ []byte, _ string, _ string, _ *time.Time,
+) (string, int, error) {
+	return "q-adaptererr", 1, nil
+}
+
 // ── TestFireSchedule_AdapterSDKError (#1696) ──────────────────────────────────
 //
 // When the adapter SDK throws internally and returns HTTP 200 with an error
@@ -667,6 +772,7 @@ func TestRecordSkipped_AdvancesNextRunAt(t *testing.T) {
 			"recordSkipped must advance next_run_at when workspace is busy (#1029)", err)
 	}
 }
+
 // trigger CI

 // ── TestDetectResultKind ───────────────────────────────────────────────────────
@@ -833,10 +939,10 @@ func TestDetectResultKind(t *testing.T) {
 //
 // When ProxyA2ARequest returns HTTP 200 but the response body contains a
 // non-ok result_kind, fireSchedule must:
-//   1. Set last_status to the result_kind (not 'ok').
-//   2. Set last_error to describe the SDK error.
-//   3. Increment consecutive_sdk_errors.
-//   4. NOT auto-disable on first occurrence (threshold is 3).
+//  1. Set last_status to the result_kind (not 'ok').
+//  2. Set last_error to describe the SDK error.
+//  3. Increment consecutive_sdk_errors.
+//  4. NOT auto-disable on first occurrence (threshold is 3).
 //
 // This test uses an sdkErrorProxy that returns a rate-limited body and asserts
 // the first run is recorded as 'rate_limited' with consecutive_sdk_errors=1
@@ -999,6 +1105,13 @@ func (p *sdkErrorProxy) ProxyA2ARequest(
 	return 200, body, nil
 }

+// EnqueueA2A satisfies the extended A2AProxy interface; this stub ignores every argument and always returns ("q-sdkerr", 1, nil).
+func (p *sdkErrorProxy) EnqueueA2A(
+	_ context.Context, _ string, _ string, _ int, _ []byte, _ string, _ string, _ *time.Time,
+) (string, int, error) {
+	return "q-sdkerr", 1, nil
+}
+
 // ── TestTruncate_utf8Safe_regression2026 ──────────────────────────────────────

 // TestTruncate_utf8Safe_regression2026 locks in the #2026 fix: truncate must