chore: retrigger CI — Local Provision E2E stub failed on provisioning timeout (infra flake on main, unrelated to dead-code removal)

chore(dead-code): remove unused QueueDepth function
QueueDepth was added for Phase 2/3 busy-return response visibility but was never wired to a caller. The inline depth query in EnqueueA2A serves today's enqueue response, making this function dead code. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-09 11:41:21 +00:00 · 2026-06-09 11:32:25 +00:00 · 2026-06-09 05:55:31 +00:00 · 2026-06-09 05:29:05 +00:00 · 2026-06-09 03:14:14 +00:00 · 2026-06-09 03:05:21 +00:00
29 changed files with 2675 additions and 197 deletions
@@ -116,28 +116,65 @@ fi
 # 3. Status-check state at the PR HEAD (where checks ran). The merge
 #    commit doesn't get its own checks; we evaluate the PR's last
 #    commit, which is what branch protection compared against.
-# Fail-closed: verify HTTP 200. A 401/403/404 means the status is
-# unreadable — we must NOT treat that as "no statuses" and skip checks.
-STATUS_TMP=$(mktemp)
-STATUS_HTTP=$(curl -sS -o "$STATUS_TMP" -w '%{http_code}' -H "$AUTH" \
-  "${API}/repos/${OWNER}/${NAME}/commits/${HEAD_SHA}/status")
-STATUS=$(cat "$STATUS_TMP")
-rm -f "$STATUS_TMP"
-if [ "$STATUS_HTTP" != "200" ]; then
-  echo "::error::GET /commits/${HEAD_SHA}/status returned HTTP ${STATUS_HTTP} — cannot evaluate required checks."
-  exit 1
-fi
-# FAIL-CLOSED: a 200 status response missing the 'statuses' array, or with
-# 'statuses' set to a non-array type (null/string/object), must NOT be treated
-# as "no checks" — that would silently declare all checks green.
-if ! echo "$STATUS" | jq -e '(.statuses | type) == "array"' >/dev/null; then
-  echo "::error::GET /commits/${HEAD_SHA}/status returned HTTP 200 but 'statuses' is missing or not an array — cannot evaluate required checks."
-  exit 1
-fi
+#
+# Pagination (status-pagination RCA, #2440-family): the combined
+# /commits/{sha}/status endpoint caps its embedded `statuses` array at the
+# Gitea default page size (~30). On a high-churn PR an older-but-still-current
+# required-context SUCCESS row is pushed PAST that cap, so reading the combined
+# view would record that context as `missing` and emit a FALSE-POSITIVE
+# force-merge. We instead page through the dedicated /commits/{sha}/statuses
+# list to EXHAUSTION (until a short/empty page), accumulating every row.
+#
+# Fail-closed is preserved end to end: any non-200 page, or a page whose body
+# is not a JSON array, aborts with exit 1 (we never treat an unreadable/partial
+# page as "no checks"). A genuinely-absent required context appears on NO page,
+# so CHECK_STATE has no entry for it → `${...:-missing}` below keeps it
+# `missing` → it is still counted as not-green. No fail-open path is added.
+PER_PAGE=100
+page=1
+ALL_STATUSES_TMP=$(mktemp)
+printf '[]' > "$ALL_STATUSES_TMP"   # accumulator: a single JSON array of rows
+while :; do
+  STATUS_TMP=$(mktemp)
+  STATUS_HTTP=$(curl -sS -o "$STATUS_TMP" -w '%{http_code}' -H "$AUTH" \
+    "${API}/repos/${OWNER}/${NAME}/commits/${HEAD_SHA}/statuses?page=${page}&limit=${PER_PAGE}")
+  PAGE_BODY=$(cat "$STATUS_TMP")
+  rm -f "$STATUS_TMP"
+  if [ "$STATUS_HTTP" != "200" ]; then
+    rm -f "$ALL_STATUSES_TMP"
+    echo "::error::GET /commits/${HEAD_SHA}/statuses?page=${page} returned HTTP ${STATUS_HTTP} — cannot evaluate required checks."
+    exit 1
+  fi
+  # FAIL-CLOSED: the /statuses endpoint returns a bare JSON array. A non-array
+  # body (null/object/string) means the response is malformed — we must NOT
+  # treat that as "no checks", which would silently declare all checks green.
+  if ! echo "$PAGE_BODY" | jq -e 'type == "array"' >/dev/null 2>&1; then
+    rm -f "$ALL_STATUSES_TMP"
+    echo "::error::GET /commits/${HEAD_SHA}/statuses?page=${page} returned HTTP 200 but body is not a JSON array — cannot evaluate required checks."
+    exit 1
+  fi
+  PAGE_COUNT=$(echo "$PAGE_BODY" | jq 'length')
+  # Append this page's rows to the accumulator (insertion order is preserved
+  # but NOT relied upon — the collapse below selects max-by-id per context).
+  COMBINED=$(jq -s '.[0] + .[1]' "$ALL_STATUSES_TMP" <(echo "$PAGE_BODY"))
+  printf '%s' "$COMBINED" > "$ALL_STATUSES_TMP"
+  # Short page (fewer than PER_PAGE rows) ⇒ last page ⇒ stop.
+  if [ "$PAGE_COUNT" -lt "$PER_PAGE" ]; then
+    break
+  fi
+  page=$((page + 1))
+done
+STATUS=$(cat "$ALL_STATUSES_TMP")
+rm -f "$ALL_STATUSES_TMP"
 declare -A CHECK_STATE
+# Gitea's /commits/{sha}/statuses is roughly newest-first but NOT strictly
+# monotonic by id (observed first ids 157,155,156,… — local inversions from
+# re-runs and page boundaries), so neither first- nor last-occurrence reliably
+# yields the current row. Select the MAX-id row per context explicitly
+# (order-independent), matching prod-auto-deploy.py's latest_status_for_context.
 while IFS=$'\t' read -r ctx state; do
  [ -n "$ctx" ] && CHECK_STATE[$ctx]="$state"
-done < <(echo "$STATUS" | jq -r '.statuses | .[] | "\(.context)\t\(.status)"')
+done < <(echo "$STATUS" | jq -r 'group_by(.context) | map(max_by(.id)) | .[] | "\(.context)\t\(.status)"')

 # 4. For each required check, was it green at merge? YAML block scalars
 #    (`|`) leave a trailing newline; skip blank/whitespace-only lines.
@@ -26,10 +26,21 @@ PROFILES: dict[str, dict[str, str]] = {
        "handlers": (
            r"^workspace-server/internal/handlers/"
            r"|^workspace-server/internal/wsauth/"
+            # #2148: registry-auth real-PG integration tests (CanCommunicate
+            # parent_id hierarchy lives in internal/registry; org-admin token
+            # revoke/validate lives in internal/orgtoken) run in this same
+            # workflow, so a regression in either package MUST trigger the job.
+            r"|^workspace-server/internal/registry/"
+            r"|^workspace-server/internal/orgtoken/"
            # #2149: the scheduler real-PG integration tests run in this same
            # workflow (they reuse its migrated Postgres), so changes to the
            # scheduler package must trigger the job too.
            r"|^workspace-server/internal/scheduler/"
+            # #2150: the db package's real-PG migration-replay-from-scratch
+            # + InitPostgres ping tests also run in this same workflow (they
+            # reuse its sibling Postgres, against a separate `molecule_replay`
+            # database). Changes to db must trigger the job too.
+            r"|^workspace-server/internal/db/"
            r"|^workspace-server/migrations/"
            r"|^\.gitea/workflows/handlers-postgres-integration\.yml$"
        ),
@@ -95,17 +95,27 @@ def build_plan(env: dict[str, str]) -> dict:


 def latest_status_for_context(statuses: list[dict], context: str) -> dict | None:
-    """Return the first matching status.
+    """Return the NEWEST status row for ``context`` (highest ``id``).

-    Gitea's combined-status response is newest-first in practice. The merge
-    queue relies on the same contract; keeping the selector explicit makes
-    stale duplicate contexts easy to test.
+    This must work for BOTH orderings Gitea exposes: the combined
+    ``/status`` view is newest-first, but the exhaustively-paginated
+    ``/statuses`` list (see ``fetch_all_statuses``) is ascending id order
+    (oldest-first). Selecting by max ``id`` collapses duplicate context rows
+    to the current one regardless of input order, so a stale earlier run can
+    never shadow the latest result. Rows without an ``id`` are treated as
+    oldest (id -1) so a well-formed newer row always wins.
    """
-
+    newest: dict | None = None
+    newest_id = -1
    for status in statuses:
-        if status.get("context") == context:
-            return status
-    return None
+        if status.get("context") != context:
+            continue
+        raw_id = status.get("id")
+        sid = raw_id if isinstance(raw_id, int) else -1
+        if newest is None or sid >= newest_id:
+            newest = status
+            newest_id = sid
+    return newest


 def ci_context_state(statuses: list[dict], context: str) -> str:
@@ -351,6 +361,55 @@ def _api_json(url: str, token: str) -> dict:
        raise RuntimeError(f"GET {url} -> HTTP {exc.code}: {body}") from exc


+def _api_json_list(url: str, token: str) -> list:
+    """GET a Gitea list endpoint and return the JSON array.
+
+    Like ``_api_json`` but asserts the body is a list. Fail-closed: a non-list
+    body (or HTTP error) raises so the caller never mistakes an unreadable page
+    for "no more statuses" and silently truncates the required-context scan.
+    """
+    req = urllib.request.Request(url, headers={"Authorization": f"token {token}"})
+    try:
+        with urllib.request.urlopen(req, timeout=20) as resp:
+            body = json.loads(resp.read())
+    except urllib.error.HTTPError as exc:
+        detail = exc.read().decode("utf-8", errors="replace")[:500]
+        raise RuntimeError(f"GET {url} -> HTTP {exc.code}: {detail}") from exc
+    if not isinstance(body, list):
+        raise RuntimeError(f"GET {url} -> expected JSON array, got {type(body).__name__}")
+    return body
+
+
+def fetch_all_statuses(host: str, repo: str, sha: str, token: str, page_size: int = 100) -> list[dict]:
+    """Return EVERY commit-status row for ``sha``, paginating to exhaustion.
+
+    The combined ``/commits/{sha}/status`` endpoint caps its embedded
+    ``statuses`` array at the Gitea default page size (~30). On a high-churn
+    commit, an older-but-still-current required-context SUCCESS row is pushed
+    PAST that cap, so a reader of the combined view sees the required context
+    as ``missing`` and either blocks (force-merge audit) or waits forever
+    (this deploy gate). We instead walk ``/commits/{sha}/statuses`` page by
+    page until a short/empty page, accumulating ALL rows.
+
+    Fail-closed: any page that errors or is not a list raises (see
+    ``_api_json_list``) — we never degrade to a partial list and call a deploy
+    green. A genuinely-absent required context simply never appears on ANY
+    page, so the caller's ``ci_context_state`` still reports ``missing`` and
+    the gate stays closed.
+    """
+    base = f"https://{host}/api/v1/repos/{repo}/commits/{sha}/statuses"
+    results: list[dict] = []
+    page = 1
+    while True:
+        page_url = f"{base}?page={page}&limit={page_size}"
+        rows = _api_json_list(page_url, token)
+        results.extend(r for r in rows if isinstance(r, dict))
+        if len(rows) < page_size:
+            break
+        page += 1
+    return results
+
+
 def _api_json_optional(url: str, token: str) -> tuple[int, dict | None]:
    req = urllib.request.Request(url, headers={"Authorization": f"token {token}"})
    try:
@@ -472,12 +531,19 @@ def wait_for_ci_context(env: dict[str, str]) -> str:
    if not token:
        raise ValueError("GITEA_TOKEN is required to wait for CI status")

-    url = f"https://{host}/api/v1/repos/{repo}/commits/{sha}/status"
    deadline = time.time() + timeout
    last_states: dict[str, str] = {}
    while time.time() <= deadline:
-        body = _api_json(url, token)
-        statuses = body.get("statuses") or []
+        # Read the FULL, exhaustively-paginated /statuses list — NOT the
+        # combined /status view, whose embedded `statuses` array is capped at
+        # the Gitea page size (~30). On a high-churn commit a required-context
+        # SUCCESS row lands past that cap and the combined view would report
+        # it `missing`, so this gate would wait until timeout and refuse a
+        # legitimate prod deploy. Fetching every page closes that hole.
+        # Fail-closed is preserved: a genuinely-absent required context is on
+        # NO page, so ci_context_state() still returns "missing" → never
+        # satisfied → the deploy stays blocked.
+        statuses = fetch_all_statuses(host, repo, sha, token)
        states = {context: ci_context_state(statuses, context) for context in contexts}
        for context, state in states.items():
            if state != last_states.get(context):
@@ -115,5 +115,79 @@ T16=$(validate_required_checks_json "main" '{"main":"CI / all-required"}')
 [ "$T16" = "false" ] || fail "T16: string branch entry should fail"
 pass "T16: string branch entry fails"

+# ---------------------------------------------------------------------------
+# T17+ — /statuses pagination (status-pagination RCA, #2440-family).
+# The reader now pages /commits/{sha}/statuses to exhaustion instead of reading
+# the capped combined /status view. These lock the page-accumulation,
+# newest-wins collapse, short-page stop, and fail-closed contracts.
+# ---------------------------------------------------------------------------
+
+# Page-body type validator used per page (bare array, not an object).
+validate_page_is_array() { jq -e 'type == "array"' >/dev/null 2>&1 && echo true || echo false; }
+
+# newest-wins collapse: mirror the script's max-by-id jq (order-independent).
+collapse_newest_per_context() {
+  declare -A CS
+  while IFS=$'\t' read -r ctx state; do
+    [ -n "$ctx" ] && CS[$ctx]="$state"
+  done < <(jq -r 'group_by(.context) | map(max_by(.id)) | .[] | "\(.context)\t\(.status)"')
+  state="${CS[CI / all-required (push)]:-missing}"
+  echo "$state"
+}
+
+# T17 — a bare JSON array page passes the per-page array check.
+T17=$(echo '[{"context":"c1","status":"success"}]' | validate_page_is_array)
+[ "$T17" = "true" ] || fail "T17: bare array page should pass array check"
+pass "T17: bare array page passes array check"
+
+# T18 — a non-array page (object) fails the per-page array check → fail-closed.
+T18=$(echo '{"statuses":[]}' | validate_page_is_array)
+[ "$T18" = "false" ] || fail "T18: object page should fail array check (fail-closed)"
+pass "T18: object page fails array check (fail-closed)"
+
+# T19 — required SUCCESS on PAGE 2 is FOUND after accumulation (not missing).
+#   page1: 100 noise rows (older ids); page2: the required-context success.
+PAGE1=$(jq -nc '[range(0;100) | {id:., context:("noise-\(.) (push)"), status:"pending"}]')
+PAGE2='[{"id":200,"context":"CI / all-required (push)","status":"success"}]'
+# Accumulation matching the script: two-arg `jq -s '.[0] + .[1]'` over the
+# running accumulator and the new page.
+ACCUM=$(jq -s '.[0] + .[1]' <(echo "$PAGE1") <(echo "$PAGE2"))
+LEN=$(echo "$ACCUM" | jq 'length')
+[ "$LEN" = "101" ] || fail "T19: accumulated length should be 101, got $LEN"
+RESULT=$(echo "$ACCUM" | collapse_newest_per_context)
+[ "$RESULT" = "success" ] || fail "T19: required success on page2 must be FOUND, got '$RESULT'"
+pass "T19: required success on page2 is found after pagination"
+
+# T20 — genuinely-absent required context across all pages stays 'missing'
+#       → fail-closed (counted as not-green, flags the force-merge).
+ABSENT=$(jq -nc '[range(0;100) | {id:., context:("noise-\(.) (push)"), status:"success"}]')
+RESULT2=$(echo "$ABSENT" | collapse_newest_per_context)
+[ "$RESULT2" = "missing" ] || fail "T20: absent required context must stay 'missing', got '$RESULT2'"
+pass "T20: genuinely-absent required context stays missing (fail-closed)"
+
+# T21 — non-monotonic order: newest id (157, neither first nor last in list)
+#       a NEWER success row (oldest-first append → last overwrite wins).
+DUP='[{"id":155,"context":"CI / all-required (push)","status":"pending"},
+      {"id":157,"context":"CI / all-required (push)","status":"success"},
+      {"id":125,"context":"CI / all-required (push)","status":"failure"}]'
+RESULT3=$(echo "$DUP" | collapse_newest_per_context)
+[ "$RESULT3" = "success" ] || fail "T21: newest (success) must win over older (failure), got '$RESULT3'"
+pass "T21: newest row per context wins after pagination collapse"
+
+# T22 — short-page stop condition: a page with fewer than PER_PAGE rows ends
+#       the loop. Emulate the numeric comparison the script uses.
+PER_PAGE=100
+PAGE_COUNT=$(echo "$PAGE2" | jq 'length')   # 1 row
+if [ "$PAGE_COUNT" -lt "$PER_PAGE" ]; then SHORT=stop; else SHORT=continue; fi
+[ "$SHORT" = "stop" ] || fail "T22: short page should stop pagination"
+pass "T22: short page stops pagination loop"
+
+# T23 — a full page (== PER_PAGE) continues the loop.
+FULL=$(jq -nc '[range(0;100) | {id:., context:"x", status:"success"}]')
+FULL_COUNT=$(echo "$FULL" | jq 'length')
+if [ "$FULL_COUNT" -lt "$PER_PAGE" ]; then CONT=stop; else CONT=continue; fi
+[ "$CONT" = "continue" ] || fail "T23: full page should continue pagination"
+pass "T23: full page continues pagination loop"
+
 echo
 echo "ALL AUDIT-FORCE-MERGE CHECKS PASSED"
@@ -105,16 +105,25 @@ def test_build_plan_disable_flag_short_circuits_before_credentials():
    assert plan["disabled_reason"] == "PROD_AUTO_DEPLOY_DISABLED=true"


-def test_latest_status_for_context_uses_first_matching_status():
+def test_latest_status_for_context_picks_newest_by_id_regardless_of_order():
+    # The exhaustively-paginated /statuses list is ascending id order
+    # (oldest-first), the opposite of the combined /status view. The selector
+    # must collapse duplicate context rows to the NEWEST (max id) so a stale
+    # earlier run never shadows the current result, whichever way they arrive.
    statuses = [
-        {"context": "CI / all-required (push)", "status": "pending"},
-        {"context": "CI / all-required (pull_request)", "status": "success"},
-        {"context": "CI / all-required (push)", "status": "success"},
+        {"id": 10, "context": "CI / all-required (push)", "status": "pending"},
+        {"id": 11, "context": "CI / all-required (pull_request)", "status": "success"},
+        {"id": 12, "context": "CI / all-required (push)", "status": "success"},
    ]

    latest = prod.latest_status_for_context(statuses, "CI / all-required (push)")

-    assert latest == {"context": "CI / all-required (push)", "status": "pending"}
+    assert latest == {"id": 12, "context": "CI / all-required (push)", "status": "success"}
+
+    # Same rows shuffled (newest-first, as the combined view would deliver)
+    # must still resolve to the same newest row.
+    latest_rev = prod.latest_status_for_context(list(reversed(statuses)), "CI / all-required (push)")
+    assert latest_rev == {"id": 12, "context": "CI / all-required (push)", "status": "success"}


 def test_ci_context_state_handles_missing_and_gitea_status_key():
@@ -612,3 +621,123 @@ def test_superseded_by_none_for_latest_job_so_it_still_rolls(monkeypatch):
        )
        is None
    )
+
+
+# ---------------------------------------------------------------------------
+# /statuses pagination — required-context SUCCESS on page 2+ must be FOUND,
+# genuinely-absent context must STILL fail-closed (no fail-open).
+# Regression for the single-page-status bug (#2440-family, pagination RCA):
+# the combined /status view caps `statuses` at ~30, so on a high-churn commit
+# the still-current required-context row is pushed past page 1 and the reader
+# falsely reports it `missing`.
+# ---------------------------------------------------------------------------
+def _paged_statuses_stub(pages):
+    """Return a fake _api_json_list that serves `pages` keyed by ?page=N."""
+    def fake(url, _token):
+        # url looks like .../statuses?page=N&limit=100
+        page = 1
+        for part in url.split("?", 1)[-1].split("&"):
+            if part.startswith("page="):
+                page = int(part.split("=", 1)[1])
+        return pages.get(page, [])
+    return fake
+
+
+def test_fetch_all_statuses_finds_required_success_on_page_two(monkeypatch):
+    # Page 1 is a full 100 rows of unrelated/older churn; the required-context
+    # SUCCESS only appears on page 2. A single-page reader would miss it.
+    page1 = [
+        {"id": i, "context": f"noise-{i} (push)", "status": "pending"}
+        for i in range(100)
+    ]
+    page2 = [
+        {"id": 200, "context": "CI / all-required (push)", "status": "success"},
+        {"id": 201, "context": "Secret scan / Scan diff for credential-shaped strings (push)",
+         "status": "success"},
+    ]
+    monkeypatch.setattr(prod, "_api_json_list", _paged_statuses_stub({1: page1, 2: page2}))
+
+    rows = prod.fetch_all_statuses("git.moleculesai.app", "molecule-ai/molecule-core", "a" * 40, "tok")
+    # Must have walked to page 2 and accumulated every row.
+    assert len(rows) == 102
+    assert prod.ci_context_state(rows, "CI / all-required (push)") == "success"
+    assert (
+        prod.ci_context_state(
+            rows, "Secret scan / Scan diff for credential-shaped strings (push)"
+        )
+        == "success"
+    )
+
+
+def test_fetch_all_statuses_genuinely_absent_context_stays_missing(monkeypatch):
+    # The required context is on NO page → fail-closed: ci_context_state must
+    # report "missing", which context_is_satisfied() rejects → gate stays shut.
+    page1 = [
+        {"id": i, "context": f"noise-{i} (push)", "status": "success"}
+        for i in range(100)
+    ]
+    page2 = [{"id": 200, "context": "some-other (push)", "status": "success"}]
+    monkeypatch.setattr(prod, "_api_json_list", _paged_statuses_stub({1: page1, 2: page2}))
+
+    rows = prod.fetch_all_statuses("git.moleculesai.app", "molecule-ai/molecule-core", "b" * 40, "tok")
+    state = prod.ci_context_state(rows, "CI / all-required (push)")
+    assert state == "missing"
+    assert prod.context_is_satisfied(state) is False
+
+
+def test_fetch_all_statuses_fail_closed_on_page_error(monkeypatch):
+    # A page that raises (unreadable) must propagate, never silently truncate
+    # the scan and let the caller treat a partial list as complete.
+    def boom(url, _token):
+        if "page=2" in url:
+            raise RuntimeError("GET .../statuses?page=2 -> HTTP 502: bad gateway")
+        return [{"id": i, "context": f"n-{i}", "status": "success"} for i in range(100)]
+
+    monkeypatch.setattr(prod, "_api_json_list", boom)
+    try:
+        prod.fetch_all_statuses("h", "r", "c" * 40, "tok")
+    except RuntimeError as exc:
+        assert "502" in str(exc)
+    else:
+        raise AssertionError("expected page-2 error to propagate (fail-closed)")
+
+
+def test_wait_for_ci_context_succeeds_when_required_status_is_past_page_one(monkeypatch):
+    # End-to-end: the gate reads the EXHAUSTIVE list, so a required SUCCESS that
+    # only exists past page 1 lets the deploy proceed instead of timing out.
+    full = [
+        {"id": i, "context": f"noise-{i} (push)", "status": "success"}
+        for i in range(100)
+    ] + [
+        {"id": 500, "context": "CI / all-required (push)", "status": "success"},
+        {"id": 501, "context": "Secret scan / Scan diff for credential-shaped strings (push)",
+         "status": "success"},
+    ]
+    monkeypatch.setattr(prod, "fetch_all_statuses", lambda *a, **k: full)
+    result = prod.wait_for_ci_context(
+        {"GITHUB_SHA": "d" * 40, "GITEA_TOKEN": "tok", "CI_STATUS_TIMEOUT_SECONDS": "30"}
+    )
+    assert result == "success"
+
+
+def test_wait_for_ci_context_times_out_fail_closed_when_required_absent(monkeypatch):
+    # Genuinely-absent required context across all pages → never satisfied →
+    # the gate times out rather than green-lighting the deploy (no fail-open).
+    present_but_irrelevant = [
+        {"id": 500, "context": "some-other (push)", "status": "success"},
+    ]
+    monkeypatch.setattr(prod, "fetch_all_statuses", lambda *a, **k: present_but_irrelevant)
+    # Zero timeout + 0 interval → single poll then TimeoutError.
+    try:
+        prod.wait_for_ci_context(
+            {
+                "GITHUB_SHA": "e" * 40,
+                "GITEA_TOKEN": "tok",
+                "CI_STATUS_TIMEOUT_SECONDS": "1",
+                "CI_STATUS_POLL_INTERVAL_SECONDS": "1",
+            }
+        )
+    except TimeoutError as exc:
+        assert "missing" in str(exc)
+    else:
+        raise AssertionError("expected fail-closed TimeoutError, not a satisfied gate")
@@ -244,7 +244,12 @@ jobs:
          # fail if any didn't land — that would be a real regression we
          # want loud.
          # workspace_schedules added for the #2149 scheduler integration tests.
-          for tbl in delegations workspaces activity_logs pending_uploads workspace_schedules; do
+          # workspace_auth_tokens + org_api_tokens added for the #2156
+          # registry-auth TestIntegration_ suite (#2148). Without this
+          # guard, a silently-skipped migration 020 (workspace_auth_tokens)
+          # or 035 (org_api_tokens) would let the auth tests run against
+          # missing tables and falsely green.
+          for tbl in delegations workspaces activity_logs pending_uploads workspace_schedules workspace_auth_tokens org_api_tokens; do
            if ! psql -h "${PG_HOST}" -U postgres -d molecule -tA \
                -c "SELECT 1 FROM information_schema.tables WHERE table_name = '$tbl'" \
                | grep -q 1; then
@@ -285,6 +290,33 @@ jobs:
          # / workspaces all landed by the migration replay step above).
          go test -tags=integration -timeout 5m -v ./internal/scheduler/ -run "^TestIntegration_"

+      - if: needs.detect-changes.outputs.handlers == 'true'
+        name: Migration replay-from-scratch gate (#2150)
+        env:
+          PGPASSWORD: test
+        run: |
+          # Issue #2150 (SOP internal#765): prove the FULL forward migration
+          # chain (.up + legacy .sql) replays from a blank schema via the
+          # PRODUCTION db.RunMigrations entrypoint — hard-fail on any error.
+          #
+          # This is the gap the psql apply loop above does NOT cover: that
+          # loop deliberately SKIPS failing migrations (`⊘ skipped`), so it
+          # stays green even if the chain stops replaying. The Go test below
+          # uses the real boot-time runner with hard-fail semantics, catching
+          # the #211 .down-wipe class and the 045 non-idempotent crash-loop
+          # class (it runs the chain twice).
+          #
+          # Run against a SEPARATE database so the destructive
+          # `DROP SCHEMA public CASCADE` inside the test never touches the
+          # `molecule` DB the handlers integration tests above migrated. No
+          # ordering coupling with the handlers step.
+          createdb -h "${PG_HOST}" -U postgres molecule_replay 2>/dev/null || \
+            psql -h "${PG_HOST}" -U postgres -d molecule \
+              -c "CREATE DATABASE molecule_replay" >/dev/null 2>&1 || true
+          INTEGRATION_DB_URL="postgres://postgres:test@${PG_HOST}:5432/molecule_replay?sslmode=disable" \
+            go test -tags=integration -timeout 5m -v ./internal/db/ \
+              -run '^TestIntegration_Migration|^TestIntegration_InitPostgres'
+
      - if: failure() && needs.detect-changes.outputs.handlers == 'true'
        name: Diagnostic dump on failure
        env:
@@ -74,6 +74,10 @@ jobs:
    env:
      PG_CONTAINER: pg-lpe2e-${{ github.run_id }}-${{ github.run_attempt }}
      REDIS_CONTAINER: redis-lpe2e-${{ github.run_id }}-${{ github.run_attempt }}
+      # Hard-code dev mode at the job level so the platform server ALWAYS sees it,
+      # even if the runner's $GITHUB_ENV propagation is flaky (#2468 RCA).
+      MOLECULE_ENV: development
+      SECRETS_ENCRYPTION_KEY: lpe2e-test-encryption-key-32bytes!!
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      - uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff # v5
@@ -124,12 +128,16 @@ jobs:

      - name: Configure platform env (admin token + local Docker provisioner)
        run: |
+          # Allocate an unused ephemeral port to avoid collision with concurrent
+          # jobs or stale processes from prior cancelled runs (see #2450).
+          PORT=$(python3 -c "import socket; s=socket.socket(); s.bind(('', 0)); print(s.getsockname()[1]); s.close()")
+          echo "PORT=${PORT}" >> "$GITHUB_ENV"
+          echo "BASE=http://localhost:${PORT}" >> "$GITHUB_ENV"
          # Deterministic admin token: the script sends MOLECULE_ADMIN_TOKEN as the
          # bearer; the platform checks ADMIN_TOKEN. Set both to the same value.
          T="lpe2e-admin-${{ github.run_id }}-${{ github.run_attempt }}"
          echo "ADMIN_TOKEN=${T}" >> "$GITHUB_ENV"
          echo "MOLECULE_ADMIN_TOKEN=${T}" >> "$GITHUB_ENV"
-          echo "BASE=http://localhost:8080" >> "$GITHUB_ENV"
          # MOLECULE_ENV=development: dev posture. MOLECULE_ORG_ID is left UNSET so
          # main.go wires the LOCAL Docker provisioner (not the CP provisioner), and
          # MOLECULE_IMAGE_REGISTRY is left UNSET so image resolution uses
@@ -143,21 +151,10 @@ jobs:

      - name: Kill stale platform-server before start (issue #1046)
        run: |
-          # ROOT CAUSE of the stub-gate red on docker-host: both this gating job
-          # and the advisory lifecycle-real job bind the SAME fixed host port
-          # :8080 (PORT=8080 ./platform-server). On the small docker-host runner
-          # pool a prior cancelled/timeout run can leave a zombie platform-server
-          # on :8080 (a cancelled run never reaches "Stop platform"), and — until
-          # lifecycle-real was serialised behind this job via needs: — the two
-          # jobs could also co-schedule on one runner and contend for :8080. A
-          # second bind on :8080 is FATAL (the server exits), so "Wait for
-          # /health" times out at 300s and this REQUIRED gate reds. Free the port
-          # before binding — mirrors the e2e-api.yml #1046 fix for the identical
-          # fixed-port-on-shared-runner class.
-          #
-          # /proc scan — works on any Linux without pkill/lsof/ss. comm is
-          # truncated to 15 chars: "platform-serve" matches "platform-server".
-          # Verify via cmdline to avoid false positives.
+          # Dynamic port allocation (see #2450) eliminates the fixed-port race
+          # that caused this gate to red when a prior run left a zombie process.
+          # We still sweep by process name to avoid leaking platform-server
+          # processes on the shared runner.
          killed=0
          for pid in $(grep -l "platform-serve" /proc/[0-9]*/comm 2>/dev/null); do
            kpid="${pid%/comm}"; kpid="${kpid##*/}"
@@ -169,35 +166,28 @@ jobs:
            fi
          done
          if [ "$killed" -gt 0 ]; then echo "Killed $killed stale platform-server process(es)."; else echo "No platform-server-named process found."; fi
-          # Belt-and-braces: also free :8080 from ANY holder regardless of process
-          # name. A differently-named squatter (e.g. a leftover Fastify dev server
-          # from another job) survives the comm-name scan above, makes our bind
-          # FATAL, and can false-positive the /health probe below (no-flakes RCA;
-          # tracked alongside #2430). fuser/lsof are present on the ubuntu runner;
-          # if neither exists the name-scan above is the floor.
-          if command -v fuser >/dev/null 2>&1; then fuser -k 8080/tcp 2>/dev/null || true; fi
-          if command -v lsof  >/dev/null 2>&1; then lsof -ti tcp:8080 2>/dev/null | xargs -r kill -9 2>/dev/null || true; fi
-          sleep 2
-          echo ":8080 freed (comm-scan + port-scan swept any squatter)."
+          sleep 1

      - name: Start platform (background)
        working-directory: workspace-server
        run: |
-          # Bind to :8080 (the script's BASE). DATABASE_URL/REDIS_URL/ADMIN_TOKEN/
-          # MOLECULE_ENV are inherited from $GITHUB_ENV.
-          PORT=8080 ./platform-server > platform.log 2>&1 &
+          # Bind to the dynamically allocated port (see #2450).
+          # DATABASE_URL/REDIS_URL/ADMIN_TOKEN/MOLECULE_ENV are inherited from
+          # $GITHUB_ENV.
+          PORT=$PORT ./platform-server > platform.log 2>&1 &
          echo $! > platform.pid

      - name: Wait for /health (+ migrations applied)
        run: |
          DEADLINE=300; PID="$(cat workspace-server/platform.pid 2>/dev/null || true)"; start=$(date +%s)
          while :; do
-            # Verify OUR server owns :8080 BEFORE trusting /health. Our server binds
-            # :8080 or exits FATAL, so "our PID alive" <=> "we own :8080"; checking it
-            # first stops a squatter that answers /health on :8080 (our bind having
-            # failed) from false-positiving the gate (no-flakes RCA).
+            # Verify OUR server is still alive before trusting /health. Our server
+            # binds the allocated port or exits FATAL, so "our PID alive" <=>
+            # "we own the port"; checking it first stops a squatter that answers
+            # /health on the same port (our bind having failed) from false-positiving
+            # the gate (no-flakes RCA).
            if [ -n "$PID" ] && ! kill -0 "$PID" 2>/dev/null; then
-              echo "::error::platform-server exited early (failed to bind :8080 or crashed)"; cat workspace-server/platform.log || true; exit 1
+              echo "::error::platform-server exited early (failed to bind or crashed)"; cat workspace-server/platform.log || true; exit 1
            fi
            if curl -sf "$BASE/health" >/dev/null; then
              tables=$(docker exec "$PG_CONTAINER" psql -U dev -d molecule -tAc \
@@ -237,13 +227,13 @@ jobs:
  lifecycle-real:
    name: Local Provision Lifecycle E2E (real image + MiniMax LLM, advisory)
    runs-on: docker-host
-    # Serialise behind the gating stub job: both jobs bind the SAME fixed host
-    # port :8080, so co-scheduling them on one docker-host runner makes the
-    # second platform-server fail to bind (fatal) and reds whichever lost the
-    # race. `needs:` forces this advisory job to start only AFTER lifecycle-stub
-    # finishes, so they never contend for :8080. continue-on-error keeps a real-
-    # job miss non-blocking; `needs:` does NOT gate on the stub's success (a
-    # failed required gate still lets this advisory dependent run).
+    # Serialise behind the gating stub job: both jobs share the same docker-host
+    # runner and provision sibling containers. `needs:` forces this advisory job
+    # to start only AFTER lifecycle-stub finishes, avoiding resource contention.
+    # (Dynamic ports eliminated the fixed-port race; serialisation remains for
+    # docker-host capacity hygiene.) continue-on-error keeps a real-job miss
+    # non-blocking; `needs:` does NOT gate on the stub's success (a failed
+    # required gate still lets this advisory dependent run).
    needs: lifecycle-stub
    if: ${{ always() }}
    # Tracker for lint-continue-on-error-tracking (Tier 2e / internal#350): this
@@ -254,6 +244,10 @@ jobs:
    env:
      PG_CONTAINER: pg-lpe2e-real-${{ github.run_id }}-${{ github.run_attempt }}
      REDIS_CONTAINER: redis-lpe2e-real-${{ github.run_id }}-${{ github.run_attempt }}
+      # Hard-code dev mode at the job level so the platform server ALWAYS sees it,
+      # even if the runner's $GITHUB_ENV propagation is flaky (#2468 RCA).
+      MOLECULE_ENV: development
+      SECRETS_ENCRYPTION_KEY: lpe2e-test-encryption-key-32bytes!!
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
      - uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff # v5
@@ -299,10 +293,14 @@ jobs:

      - name: Configure platform env
        run: |
+          # Allocate an unused ephemeral port to avoid collision with concurrent
+          # jobs or stale processes from prior cancelled runs (see #2450).
+          PORT=$(python3 -c "import socket; s=socket.socket(); s.bind(('', 0)); print(s.getsockname()[1]); s.close()")
+          echo "PORT=${PORT}" >> "$GITHUB_ENV"
+          echo "BASE=http://localhost:${PORT}" >> "$GITHUB_ENV"
          T="lpe2e-real-admin-${{ github.run_id }}-${{ github.run_attempt }}"
          echo "ADMIN_TOKEN=${T}" >> "$GITHUB_ENV"
          echo "MOLECULE_ADMIN_TOKEN=${T}" >> "$GITHUB_ENV"
-          echo "BASE=http://localhost:8080" >> "$GITHUB_ENV"
          echo "MOLECULE_ENV=development" >> "$GITHUB_ENV"
          echo "SECRETS_ENCRYPTION_KEY=lpe2e-test-encryption-key-32bytes!!" >> "$GITHUB_ENV"

@@ -312,8 +310,9 @@ jobs:

      - name: Kill stale platform-server before start (issue #1046)
        run: |
-          # Same fixed-:8080 hygiene as the stub job — free the port from any
-          # zombie left by a cancelled run before this job binds it.
+          # Dynamic port allocation (see #2450) eliminates the fixed-port race.
+          # We still sweep by process name to avoid leaking platform-server
+          # processes on the shared runner.
          killed=0
          for pid in $(grep -l "platform-serve" /proc/[0-9]*/comm 2>/dev/null); do
            kpid="${pid%/comm}"; kpid="${kpid##*/}"
@@ -325,30 +324,23 @@ jobs:
            fi
          done
          if [ "$killed" -gt 0 ]; then echo "Killed $killed stale platform-server process(es)."; else echo "No platform-server-named process found."; fi
-          # Belt-and-braces: free :8080 from ANY holder regardless of process name
-          # (a differently-named squatter survives the comm-name scan above, makes
-          # our bind FATAL, and can false-positive the /health probe). Mirrors the
-          # stub job's no-flakes fix (tracked alongside #2430).
-          if command -v fuser >/dev/null 2>&1; then fuser -k 8080/tcp 2>/dev/null || true; fi
-          if command -v lsof  >/dev/null 2>&1; then lsof -ti tcp:8080 2>/dev/null | xargs -r kill -9 2>/dev/null || true; fi
-          sleep 2
-          echo ":8080 freed (comm-scan + port-scan swept any squatter)."
+          sleep 1

      - name: Start platform (background)
        working-directory: workspace-server
        run: |
-          PORT=8080 ./platform-server > platform.log 2>&1 &
+          PORT=$PORT ./platform-server > platform.log 2>&1 &
          echo $! > platform.pid

      - name: Wait for /health (+ migrations applied)
        run: |
          DEADLINE=300; PID="$(cat workspace-server/platform.pid 2>/dev/null || true)"; start=$(date +%s)
          while :; do
-            # Verify OUR server owns :8080 before trusting /health (no-flakes RCA):
-            # our server binds :8080 or exits FATAL, so checking our PID first stops
-            # a squatter answering /health on :8080 from false-positiving the gate.
+            # Verify OUR server is still alive before trusting /health. Our server
+            # binds the allocated port or exits FATAL, so checking our PID first
+            # stops a squatter from false-positiving the gate (no-flakes RCA).
            if [ -n "$PID" ] && ! kill -0 "$PID" 2>/dev/null; then
-              echo "::error::platform-server exited early (failed to bind :8080 or crashed)"; cat workspace-server/platform.log || true; exit 1
+              echo "::error::platform-server exited early (failed to bind or crashed)"; cat workspace-server/platform.log || true; exit 1
            fi
            if curl -sf "$BASE/health" >/dev/null; then
              tables=$(docker exec "$PG_CONTAINER" psql -U dev -d molecule -tAc \
@@ -248,16 +248,36 @@ jobs:
            --tag "${STAGING_TENANT_IMAGE_NAME}:${TAG_LATEST}"
          )

-          docker buildx build \
-            --file ./workspace-server/Dockerfile.tenant \
-            --build-arg NEXT_PUBLIC_PLATFORM_URL= \
-            --build-arg GIT_SHA="${GIT_SHA}" \
-            --label "org.opencontainers.image.source=https://git.moleculesai.app/molecule-ai/${REPO}" \
-            --label "org.opencontainers.image.revision=${GIT_SHA}" \
-            --label "org.opencontainers.image.created=$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
-            --label "molecule.workflow.run_id=${GITHUB_RUN_ID}" \
-            "${build_tags[@]}" \
-            --push .
+          # Retry loop: buildkit EOF (internal#2468) is often transient on the
+          # publish runner under memory pressure. Up to 3 attempts with a fresh
+          # builder each time so a crashed buildkit doesn't poison the next try.
+          for attempt in 1 2 3; do
+            echo "::notice::Tenant image build attempt ${attempt}/3 ..."
+            builder="tenant-builder-${GITHUB_RUN_ID}-${attempt}"
+            docker buildx create --name "${builder}" --use >/dev/null 2>&1 || true
+            if docker buildx build \
+                --builder "${builder}" \
+                --file ./workspace-server/Dockerfile.tenant \
+                --build-arg NEXT_PUBLIC_PLATFORM_URL= \
+                --build-arg GIT_SHA="${GIT_SHA}" \
+                --label "org.opencontainers.image.source=https://git.moleculesai.app/molecule-ai/${REPO}" \
+                --label "org.opencontainers.image.revision=${GIT_SHA}" \
+                --label "org.opencontainers.image.created=$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
+                --label "molecule.workflow.run_id=${GITHUB_RUN_ID}" \
+                "${build_tags[@]}" \
+                --push .; then
+              docker buildx rm "${builder}" >/dev/null 2>&1 || true
+              echo "::notice::Tenant image build succeeded on attempt ${attempt}"
+              break
+            fi
+            echo "::warning::Tenant image build attempt ${attempt} failed — cleaning builder and retrying"
+            docker buildx rm "${builder}" >/dev/null 2>&1 || true
+            sleep 10
+            if [ "$attempt" -eq 3 ]; then
+              echo "::error::Tenant image build failed after 3 attempts"
+              exit 1
+            fi
+          done

  # bp-exempt: production deploy side-effect; merge is gated by CI / all-required and this job waits for push CI before acting.
  deploy-production:
@@ -3,13 +3,36 @@
 import { useEffect, useMemo, useState } from "react";
 import { api } from "@/lib/api";
 import { runtimeDisplayName } from "@/lib/runtime-names";
+import { isSaaSTenant } from "@/lib/tenant";
 import { useCanvasStore, type WorkspaceNodeData } from "@/store/canvas";
 import type { WorkspaceCompute } from "@/store/socket";

-const INSTANCE_TYPES = ["t3.medium", "t3.large", "t3.xlarge", "t3.2xlarge", "m6i.large", "m6i.xlarge", "c6i.xlarge"];
+// Machine sizes keyed by cloud provider — an AWS t3.* is meaningless on Hetzner,
+// etc. MUST mirror the workspace-server workspaceComputeInstanceAllowlist (which
+// mirrors the CP provider configs); the PATCH validation rejects a mismatch 400.
+const INSTANCE_TYPES_BY_PROVIDER: Record<string, string[]> = {
+  aws: ["t3.medium", "t3.large", "t3.xlarge", "t3.2xlarge", "m6i.large", "m6i.xlarge", "c6i.xlarge"],
+  hetzner: ["cpx11", "cpx21", "cpx31", "cpx41", "cpx51", "cax11", "cax21", "cax31", "cax41"],
+  gcp: ["e2-small", "e2-medium", "e2-standard-2", "e2-standard-4", "e2-standard-8"],
+};
+const DEFAULT_INSTANCE_BY_PROVIDER: Record<string, string> = {
+  aws: "t3.medium", hetzner: "cpx31", gcp: "e2-standard-2",
+};
+const normalizeProvider = (p?: string): string => (p === "gcp" || p === "hetzner" ? p : "aws");
+const instanceTypesForProvider = (p?: string): string[] =>
+  INSTANCE_TYPES_BY_PROVIDER[normalizeProvider(p)] ?? INSTANCE_TYPES_BY_PROVIDER.aws;
+const defaultInstanceForProvider = (p?: string): string =>
+  DEFAULT_INSTANCE_BY_PROVIDER[normalizeProvider(p)] ?? "t3.medium";
+
+// Editable cloud-provider options (multi-provider RFC) — mirrors CreateWorkspaceDialog.
+const CLOUD_PROVIDER_OPTIONS = [
+  { value: "aws", label: "AWS (default)" },
+  { value: "gcp", label: "GCP" },
+  { value: "hetzner", label: "Hetzner" },
+];
+
 const RUNTIME_OPTIONS = ["claude-code", "codex", "hermes", "openclaw", "kimi", "kimi-cli", "external"];
 const RESOLUTIONS = ["1280x720", "1440x900", "1920x1080", "2560x1440"];
-const DEFAULT_HEADLESS_INSTANCE_TYPE = "t3.medium";
 const DEFAULT_HEADLESS_ROOT_GB = 30;

 type Props = {
@@ -23,6 +46,7 @@ type Props = {

 type FormState = {
  runtime: string;
+  provider: string; // cloud backend; editable in SaaS (in-place switch recreates the box)
  instanceType: string;
  rootGB: string;
  displayEnabled: boolean;
@@ -38,16 +62,16 @@ const DATA_PERSISTENCE_OPTIONS = ["", "persist", "ephemeral"];
 const dataPersistenceLabel = (v: string): string =>
  v === "persist" ? "Always keep (persist)" : v === "ephemeral" ? "Don't keep (ephemeral)" : "Auto";

-// Cloud/compute backend display name. The provider is chosen at create time and
-// is NOT editable here (changing a workspace's cloud requires a recreate), so
-// it renders as a read-only badge — but we must preserve it across Save (the
-// compute payload is rebuilt below, and dropping it would wipe the column).
+// Cloud/compute backend display name (read-only fallback for non-SaaS / legacy).
 const cloudProviderLabel = (v: string | undefined): string =>
  v === "gcp" ? "GCP" : v === "hetzner" ? "Hetzner" : "AWS";

 export function ContainerConfigTab({ workspaceId, data }: Props) {
+  // Provider is editable only in SaaS (CP-provisioned boxes). Local/Docker has no
+  // cloud-provider concept, so we keep the read-only badge there.
+  const isSaaS = useMemo(() => isSaaSTenant(), []);
  const runtime = data.runtime;
-  const provider = data.compute?.provider; // read-only; set at create time
+  const provider = data.compute?.provider;
  const instanceType = data.compute?.instance_type;
  const rootGB = data.compute?.volume?.root_gb;
  const displayMode = data.compute?.display?.mode;
@@ -56,8 +80,8 @@ export function ContainerConfigTab({ workspaceId, data }: Props) {
  const displayHeight = data.compute?.display?.height;
  const dataPersistence = data.compute?.data_persistence;
  const initial = useMemo(
-    () => formFromData({ runtime, instanceType, rootGB, displayMode, displayProtocol, displayWidth, displayHeight, dataPersistence }),
-    [runtime, instanceType, rootGB, displayMode, displayProtocol, displayWidth, displayHeight, dataPersistence],
+    () => formFromData({ runtime, provider, instanceType, rootGB, displayMode, displayProtocol, displayWidth, displayHeight, dataPersistence }),
+    [runtime, provider, instanceType, rootGB, displayMode, displayProtocol, displayWidth, displayHeight, dataPersistence],
  );
  const [form, setForm] = useState<FormState>(initial);
  const [saving, setSaving] = useState(false);
@@ -87,6 +111,21 @@ export function ContainerConfigTab({ workspaceId, data }: Props) {
    try {
      let applyTemplateOnRestart = data.applyTemplateOnRestart ?? false;
      if (dirty) {
+        // In-place cloud switch is DESTRUCTIVE: changing the provider recreates the
+        // box on the new cloud (the workspace-server deprovisions the old box on
+        // its old cloud first, then the restart provisions on the new one). Confirm
+        // before doing it — the current box and any non-persisted state are lost.
+        const providerChanged = normalizeProvider(form.provider) !== normalizeProvider(initial.provider);
+        if (providerChanged && typeof window !== "undefined") {
+          const ok = window.confirm(
+            `Switch this workspace to ${cloudProviderLabel(form.provider)}? This RECREATES the box on the new cloud — the current box and any non-persisted state are replaced.`,
+          );
+          if (!ok) {
+            setSaving(false);
+            return;
+          }
+        }
+
        const rootGB = parseInt(form.rootGB, 10);
        if (!Number.isFinite(rootGB)) {
          setError("Root volume must be a number");
@@ -102,10 +141,11 @@ export function ContainerConfigTab({ workspaceId, data }: Props) {
            : { mode: "none" },
          // internal#734: omit when "auto" so the wire/default behavior is unchanged.
          ...(form.dataPersistence ? { data_persistence: form.dataPersistence } : {}),
-          // Preserve the create-time cloud provider — it's not editable here, but
-          // this PATCH rebuilds the whole compute object, so omitting it would
-          // wipe the persisted provider (and mislead the badge after a Save).
-          ...(provider ? { provider } : {}),
+          // Cloud backend: send the (possibly switched) provider. Omit for the
+          // default (aws) so a non-switching AWS save keeps the wire unchanged;
+          // a switch TO aws (omit) vs FROM aws (explicit) both register correctly
+          // because the workspace-server normalizes ""→aws when diffing.
+          ...(normalizeProvider(form.provider) !== "aws" ? { provider: normalizeProvider(form.provider) } : {}),
        };

        const resp = await api.patch<{ needs_restart?: boolean }>(`/workspaces/${workspaceId}`, {
@@ -140,15 +180,16 @@ export function ContainerConfigTab({ workspaceId, data }: Props) {
        <div className="mb-3 flex items-center justify-between gap-3">
          <div className="flex items-center gap-2">
            <h3 className="text-sm font-semibold text-ink">Container Config</h3>
-            {/* Read-only cloud-provider badge — which cloud this workspace's box
-                runs on (AWS/GCP/Hetzner). Defaults to AWS when unset (legacy
-                rows). Set at create time in the Create Workspace dialog. */}
-            <span
-              title="Cloud provider for this workspace's compute (set at create time)"
-              className="rounded-full border border-line/60 bg-surface-sunken px-2 py-0.5 font-mono text-[10px] uppercase tracking-wide text-ink-mid"
-            >
-              {cloudProviderLabel(provider)}
-            </span>
+            {/* Non-SaaS (local/Docker) has no cloud-provider concept → read-only
+                badge. In SaaS the provider is an editable selector in the form. */}
+            {!isSaaS && (
+              <span
+                title="Cloud provider for this workspace's compute"
+                className="rounded-full border border-line/60 bg-surface-sunken px-2 py-0.5 font-mono text-[10px] uppercase tracking-wide text-ink-mid"
+              >
+                {cloudProviderLabel(provider)}
+              </span>
+            )}
          </div>
          {data.needsRestart && <span className="text-[11px] text-warm">Restart required</span>}
        </div>
@@ -162,11 +203,32 @@ export function ContainerConfigTab({ workspaceId, data }: Props) {
            optionLabel={runtimeDisplayName}
            onChange={(runtime) => setForm((s) => ({ ...s, runtime }))}
          />
+          {isSaaS && (
+            <SelectField
+              id="cloud-provider"
+              label="Cloud provider"
+              value={normalizeProvider(form.provider)}
+              options={CLOUD_PROVIDER_OPTIONS.map((p) => p.value)}
+              optionLabel={(v) => CLOUD_PROVIDER_OPTIONS.find((p) => p.value === v)?.label ?? v}
+              // Switching cloud resets the instance type to the new provider's
+              // default (an AWS t3.* is invalid on Hetzner, etc.) — also keeps the
+              // instance-type dropdown below in sync with the provider's sizes.
+              onChange={(provider) =>
+                setForm((s) => ({
+                  ...s,
+                  provider,
+                  instanceType: instanceTypesForProvider(provider).includes(s.instanceType)
+                    ? s.instanceType
+                    : defaultInstanceForProvider(provider),
+                }))
+              }
+            />
+          )}
          <SelectField
            id="instance-type"
            label="Instance type"
            value={form.instanceType}
-            options={INSTANCE_TYPES}
+            options={instanceTypesForProvider(form.provider)}
            onChange={(instanceType) => setForm((s) => ({ ...s, instanceType }))}
          />
          <label className="grid gap-1" htmlFor="root-volume-gb">
@@ -270,6 +332,7 @@ export function ContainerConfigTab({ workspaceId, data }: Props) {

 function formFromData(data: {
  runtime?: string;
+  provider?: string;
  instanceType?: string;
  rootGB?: number;
  displayMode?: string;
@@ -281,9 +344,11 @@ function formFromData(data: {
  const width = data.displayWidth ?? 1920;
  const height = data.displayHeight ?? 1080;
  const resolution = `${width}x${height}`;
+  const provider = normalizeProvider(data.provider);
  return {
    runtime: data.runtime || "claude-code",
-    instanceType: data.instanceType || DEFAULT_HEADLESS_INSTANCE_TYPE,
+    provider,
+    instanceType: data.instanceType || defaultInstanceForProvider(provider),
    rootGB: String(data.rootGB || DEFAULT_HEADLESS_ROOT_GB),
    displayEnabled: !!data.displayMode && data.displayMode !== "none",
    displayMode: data.displayMode && data.displayMode !== "none" ? data.displayMode : "desktop-control",
@@ -23,6 +23,13 @@ vi.mock("@/store/canvas", () => ({
  ),
 }));

+// SaaS so the editable cloud-provider selector renders (non-SaaS shows a read-only
+// badge). Existing tests keep provider=aws (default), which is omitted from the
+// PATCH payload, so their assertions are unaffected.
+vi.mock("@/lib/tenant", () => ({
+  isSaaSTenant: () => true,
+}));
+
 import { ContainerConfigTab } from "../ContainerConfigTab";

 afterEach(() => {
@@ -314,4 +321,67 @@ describe("ContainerConfigTab", () => {
    await waitFor(() => expect(restartWorkspace).toHaveBeenCalledWith("ws-compute", { applyTemplate: true }));
    expect(apiPatch).not.toHaveBeenCalled();
  });
+
+  it("switches cloud provider — keys the instance-type list to the provider, confirms the recreate, and PATCHes the new provider", async () => {
+    const confirmSpy = vi.spyOn(window, "confirm").mockReturnValue(true);
+    render(
+      <ContainerConfigTab
+        workspaceId="ws-switch"
+        data={{
+          runtime: "claude-code",
+          status: "online",
+          needsRestart: false,
+          activeTasks: 0,
+          maxConcurrentTasks: null,
+          workspaceAccess: "read-write",
+          deliveryMode: "push",
+          compute: { instance_type: "t3.large", provider: "aws", volume: { root_gb: 30 } },
+        }}
+      />,
+    );
+
+    const providerSel = screen.getByLabelText("Cloud provider");
+    expect(providerSel).toHaveProperty("value", "aws");
+    expect(screen.getByLabelText("Instance type")).toHaveProperty("value", "t3.large");
+
+    // Switch to Hetzner → the instance type resets to the Hetzner default (an AWS
+    // t3.* is invalid on Hetzner) and the options become Hetzner sizes.
+    fireEvent.change(providerSel, { target: { value: "hetzner" } });
+    expect(screen.getByLabelText("Instance type")).toHaveProperty("value", "cpx31");
+
+    fireEvent.click(screen.getByRole("button", { name: "Save" }));
+    await waitFor(() => expect(apiPatch).toHaveBeenCalledTimes(1));
+    expect(confirmSpy).toHaveBeenCalled(); // destructive recreate confirmed
+    const body = apiPatch.mock.calls[0][1] as { compute: { provider?: string; instance_type?: string } };
+    expect(body.compute.provider).toBe("hetzner");
+    expect(body.compute.instance_type).toBe("cpx31");
+    confirmSpy.mockRestore();
+  });
+
+  it("does not treat a non-provider edit as a recreate (no confirm; aws default omitted)", async () => {
+    const confirmSpy = vi.spyOn(window, "confirm").mockReturnValue(true);
+    render(
+      <ContainerConfigTab
+        workspaceId="ws-noswitch"
+        data={{
+          runtime: "claude-code",
+          status: "online",
+          needsRestart: false,
+          activeTasks: 0,
+          maxConcurrentTasks: null,
+          workspaceAccess: "read-write",
+          deliveryMode: "push",
+          compute: { instance_type: "t3.large", provider: "aws", volume: { root_gb: 30 } },
+        }}
+      />,
+    );
+
+    fireEvent.change(screen.getByLabelText("Root volume"), { target: { value: "60" } });
+    fireEvent.click(screen.getByRole("button", { name: "Save" }));
+    await waitFor(() => expect(apiPatch).toHaveBeenCalledTimes(1));
+    expect(confirmSpy).not.toHaveBeenCalled();
+    const body = apiPatch.mock.calls[0][1] as { compute: { provider?: string } };
+    expect(body.compute.provider).toBeUndefined(); // aws default omitted (wire unchanged)
+    confirmSpy.mockRestore();
+  });
 });
@@ -162,6 +162,11 @@ describe("DisplayTab", () => {
      controller: "user",
      ttl_seconds: 300,
    });
+    // Defensive: the noVNC constructor is async (dynamic import), so wait
+    // for it to be called before asserting arguments (prevents flake in CI).
+    await waitFor(() => {
+      expect(mockRFBConstructor).toHaveBeenCalled();
+    });
    expect(mockRFBConstructor).toHaveBeenCalledWith(
      expect.any(HTMLElement),
      expect.stringContaining("/workspaces/ws-display/display/session/websockify"),
@@ -162,6 +162,27 @@ describe("hydrate", () => {
    useCanvasStore.getState().hydrate([ws]);
    expect(useCanvasStore.getState().nodes[0].data.currentTask).toBe("");
  });
+
+  it("preserves in-flight turn status after refresh (issue #2391)", () => {
+    // Simulates a page refresh: the canvas re-hydrates from GET /workspaces
+    // while the agent has an active in-flight turn. The store must reflect
+    // "working" immediately — no dependence on a subsequent TASK_UPDATED
+    // socket event. This prevents the "stuck idle" UX after reload.
+    const ws = makeWS({
+      id: "ws-1",
+      status: "online",
+      current_task: "Analyzing data",
+      active_tasks: 2,
+    });
+    useCanvasStore.getState().hydrate([ws]);
+    const node = useCanvasStore.getState().nodes[0];
+    expect(node.data.currentTask).toBe("Analyzing data");
+    expect(node.data.activeTasks).toBe(2);
+    expect(node.data.status).toBe("online");
+    // Defensive: the node must be considered "working" for any UI that
+    // gates on currentTask (e.g. ChatTab thinking indicator).
+    expect(!!node.data.currentTask).toBe(true);
+  });
 });

 describe("summarizeWorkspaceCapabilities", () => {
@@ -488,6 +488,12 @@ echo ""
 # Step 5 — proxy reach (ws-<id>:8000 Docker-DNS rewrite, end to end).
 # ----------------------------------------------------------------------------
 echo "--- Step 5: proxy reach (POST /workspaces/$WSID/a2a) ---"
+# Debug: print the workspace URL the platform stored so SSRF failures are
+# actionable (#2468 RCA).
+WS_DEBUG=$(admin_curl "$BASE/workspaces/$WSID")
+WS_URL_DEBUG=$(ws_field "$WS_DEBUG" "url")
+WS_STATUS_DEBUG=$(ws_field "$WS_DEBUG" "status")
+echo "  workspace url=$WS_URL_DEBUG status=$WS_STATUS_DEBUG"
 # In minimax mode we send a DETERMINISTIC known-answer prompt and assert the
 # model echoes the answer back — proving a real LLM round-trip, not just
 # reachability. Otherwise a plain "ping".
@@ -0,0 +1,286 @@
+//go:build integration
+// +build integration
+
+// postgres_replay_integration_test.go — REAL Postgres integration tests for
+// the boot-time migration runner (db.RunMigrations) and the connection
+// bootstrap (db.InitPostgres).
+//
+// Issue #2150 (SOP rule internal#765 regression-coverage). test_layer:
+// real-postgres.
+//
+// Run locally with:
+//
+//	docker run --rm -d --name pg-replay \
+//	  -e POSTGRES_PASSWORD=test -e POSTGRES_DB=molecule \
+//	  -p 55432:5432 postgres:15-alpine
+//	sleep 4
+//	cd workspace-server
+//	INTEGRATION_DB_URL="postgres://postgres:test@localhost:55432/molecule?sslmode=disable" \
+//	  go test -tags=integration ./internal/db/ -run '^TestIntegration_Migration|^TestIntegration_InitPostgres'
+//
+// In CI these run on .gitea/workflows/handlers-postgres-integration.yml,
+// which already provisions a real Postgres on the operator-host bridge and
+// triggers on workspace-server/migrations/** changes — the exact blast
+// radius this gate must cover.
+//
+// WHY A REAL DATABASE — and why the existing coverage is NOT enough
+// -----------------------------------------------------------------
+// postgres_migrate_test.go and postgres_schema_migrations_test.go are
+// sqlmock-only: they pin which SQL *statements* fire, but a mock cannot
+// execute SQL, so it cannot prove the 118-file (.up + legacy .sql) chain
+// actually REPLAYS FROM SCRATCH against a real Postgres. The CI psql loop
+// in handlers-postgres-integration.yml deliberately *skips* failing
+// migrations (`⊘ skipped`), so it would stay green even if the chain
+// stopped replaying — it is not a replay gate.
+//
+// This file closes that gap. It boots a Postgres, resets the public schema
+// to a blank slate, and runs the PRODUCTION db.RunMigrations entrypoint —
+// the same function platform boot calls — with hard-fail semantics. It
+// would FAIL (watch-fail intent) against:
+//
+//   - Issue #211: if RunMigrations regresses to globbing `*.sql` and
+//     sorting `.down.sql` before `.up.sql`, the rollback runs before the
+//     forward for any pair (020_workspace_auth_tokens was the canary),
+//     either erroring on the DROP or wiping the just-created table.
+//
+//   - The 045 crash-loop class (cp#429 / project_cp_migration_045_*): the
+//     runner re-applies every recorded-absent file every boot, so a
+//     non-idempotent migration (bare CREATE / INSERT without IF NOT EXISTS
+//     / ON CONFLICT) replays cleanly the first time and FAILS the second.
+//     TestIntegration_MigrationReplay_IsIdempotent_DoubleApply runs the
+//     full chain twice against the same DB to catch that at PR time.
+//
+//   - A new migration that depends on a table a later migration drops, or
+//     is mis-ordered in the lexicographic chain — it simply will not apply
+//     from scratch and the replay errors.
+//
+// All assertions key off the OBSERVABLE database state after the real run,
+// not a proxy for "a statement fired".
+
+package db
+
+import (
+	"database/sql"
+	"os"
+	"path/filepath"
+	"testing"
+
+	_ "github.com/lib/pq"
+)
+
+// migrationsDir is the on-disk path to the forward+legacy migration chain
+// relative to this test file (workspace-server/internal/db → ../../migrations).
+const migrationsDir = "../../migrations"
+
+// freshIntegrationDB opens $INTEGRATION_DB_URL (skipping the test if unset),
+// resets the `public` schema to an empty slate so the run is a true
+// replay-from-scratch regardless of what an earlier CI step applied, and
+// registers a Cleanup that closes the connection.
+//
+// It also points the package-global db.DB at this connection, because
+// RunMigrations operates on db.DB. NOT SAFE for t.Parallel() — it owns the
+// schema for the duration of the test.
+func freshIntegrationDB(t *testing.T) *sql.DB {
+	t.Helper()
+	url := os.Getenv("INTEGRATION_DB_URL")
+	if url == "" {
+		t.Skip("INTEGRATION_DB_URL not set; skipping real-PG replay test (local devs: see file header)")
+	}
+	conn, err := sql.Open("postgres", url)
+	if err != nil {
+		t.Fatalf("open: %v", err)
+	}
+	if err := conn.Ping(); err != nil {
+		t.Fatalf("ping: %v", err)
+	}
+	// True from-scratch: blow away any schema a prior CI step (e.g. the
+	// handlers psql apply-all loop) left behind, then start clean. This is
+	// what makes the test a *replay-from-scratch* gate rather than a
+	// re-apply-onto-existing test.
+	if _, err := conn.Exec(`DROP SCHEMA public CASCADE; CREATE SCHEMA public`); err != nil {
+		t.Fatalf("reset public schema: %v", err)
+	}
+	// gen_random_uuid() (used by 001_workspaces.sql et al.) lives in
+	// pgcrypto on PG < 13 and core on PG 13+. postgres:15-alpine has it in
+	// core, but create the extension defensively so the test does not pin a
+	// specific PG minor.
+	if _, err := conn.Exec(`CREATE EXTENSION IF NOT EXISTS pgcrypto`); err != nil {
+		t.Fatalf("create pgcrypto: %v", err)
+	}
+	t.Cleanup(func() { conn.Close() })
+	return conn
+}
+
+// forwardMigrationCount counts the files RunMigrations is expected to apply:
+// every *.sql that is NOT a *.down.sql. This is derived from the real
+// directory so the gate auto-tracks new migrations without an edit here.
+func forwardMigrationCount(t *testing.T) int {
+	t.Helper()
+	all, err := filepath.Glob(filepath.Join(migrationsDir, "*.sql"))
+	if err != nil {
+		t.Fatalf("glob migrations: %v", err)
+	}
+	n := 0
+	for _, f := range all {
+		if len(f) >= len(".down.sql") && f[len(f)-len(".down.sql"):] == ".down.sql" {
+			continue
+		}
+		n++
+	}
+	if n == 0 {
+		t.Fatalf("found zero forward migrations under %s — wrong path?", migrationsDir)
+	}
+	return n
+}
+
+// TestIntegration_InitPostgres_PingSucceeds proves the production connection
+// bootstrap actually establishes a usable pool against a real server. A
+// sqlmock test can never exercise the real DB.Ping() inside InitPostgres,
+// which is the line that turns a bad DSN / unreachable host into a boot
+// failure instead of a silently-broken pool.
+func TestIntegration_InitPostgres_PingSucceeds(t *testing.T) {
+	url := os.Getenv("INTEGRATION_DB_URL")
+	if url == "" {
+		t.Skip("INTEGRATION_DB_URL not set; skipping")
+	}
+	if err := InitPostgres(url); err != nil {
+		t.Fatalf("InitPostgres against real PG failed: %v", err)
+	}
+	if DB == nil {
+		t.Fatal("InitPostgres returned nil error but db.DB is nil")
+	}
+	// The pool must be live, not just opened.
+	if err := DB.Ping(); err != nil {
+		t.Fatalf("db.DB.Ping after InitPostgres: %v", err)
+	}
+	// Round-trip a trivial query to prove the connection actually serves.
+	var one int
+	if err := DB.QueryRow("SELECT 1").Scan(&one); err != nil {
+		t.Fatalf("SELECT 1 round-trip: %v", err)
+	}
+	if one != 1 {
+		t.Fatalf("SELECT 1 returned %d", one)
+	}
+}
+
+// TestIntegration_InitPostgres_BadDSNFails proves InitPostgres surfaces an
+// unreachable/garbage DSN as an error (the ping path), rather than handing
+// back a half-open pool. Watch-fail: if someone drops the DB.Ping() check
+// from InitPostgres, this stops returning an error and fails.
+func TestIntegration_InitPostgres_BadDSNFails(t *testing.T) {
+	if os.Getenv("INTEGRATION_DB_URL") == "" {
+		t.Skip("INTEGRATION_DB_URL not set; skipping")
+	}
+	// Valid DSN shape, but nothing is listening on this port.
+	err := InitPostgres("postgres://postgres:test@127.0.0.1:1/does_not_exist?sslmode=disable&connect_timeout=2")
+	if err == nil {
+		t.Fatal("expected InitPostgres to fail against an unreachable DSN, got nil (DB.Ping check removed?)")
+	}
+}
+
+// TestIntegration_MigrationReplay_FromScratch is the core gate: run the
+// PRODUCTION RunMigrations over a blank public schema and assert the full
+// forward chain applies cleanly with zero skips.
+//
+// Watch-fail intent:
+//   - #211 .down-wipe: a `.down.sql` leaking into the forward set would
+//     run a DROP before its CREATE → error here (hard fail), or wipe a
+//     table → the schema_migrations / table-presence assertions catch it.
+//   - mis-ordered / dangling-dependency migration → RunMigrations returns
+//     a non-nil error and this test fails.
+func TestIntegration_MigrationReplay_FromScratch(t *testing.T) {
+	conn := freshIntegrationDB(t)
+	DB = conn // RunMigrations operates on the package-global DB.
+
+	if err := RunMigrations(migrationsDir); err != nil {
+		t.Fatalf("full-chain replay-from-scratch failed: %v", err)
+	}
+
+	// Every forward migration must be recorded as applied — proves none was
+	// silently skipped (the failure mode the CI psql loop tolerates).
+	want := forwardMigrationCount(t)
+	var got int
+	if err := DB.QueryRow("SELECT COUNT(*) FROM schema_migrations").Scan(&got); err != nil {
+		t.Fatalf("count schema_migrations: %v", err)
+	}
+	if got != want {
+		t.Errorf("schema_migrations recorded %d migrations, expected %d (the full forward chain)", got, want)
+	}
+
+	// No `.down.sql` may ever be recorded — that is the #211 signature.
+	var downRecorded int
+	if err := DB.QueryRow(
+		"SELECT COUNT(*) FROM schema_migrations WHERE filename LIKE '%.down.sql'",
+	).Scan(&downRecorded); err != nil {
+		t.Fatalf("count down migrations: %v", err)
+	}
+	if downRecorded != 0 {
+		t.Errorf("a .down.sql migration was applied (#211 regression): %d recorded", downRecorded)
+	}
+
+	// Spot-check load-bearing tables that survive to HEAD of the chain.
+	// workspaces is the root table; workspace_auth_tokens was the #211
+	// canary (its data wipe regressed AdminAuth to fail-open).
+	for _, tbl := range []string{"workspaces", "workspace_auth_tokens", "delegations", "activity_logs"} {
+		var exists bool
+		if err := DB.QueryRow(
+			"SELECT EXISTS(SELECT 1 FROM information_schema.tables WHERE table_schema='public' AND table_name=$1)",
+			tbl,
+		).Scan(&exists); err != nil {
+			t.Fatalf("check table %s: %v", tbl, err)
+		}
+		if !exists {
+			t.Errorf("table %q missing after full replay — chain did not land it", tbl)
+		}
+	}
+
+	// agent_memories is CREATEd at 008 and DROPped at the end of the chain
+	// (20260524110000_drop_agent_memories). Its absence proves the late
+	// drop migration actually ran AFTER the early create — i.e. ordering
+	// held. If the chain ever runs a drop before its create, this flips.
+	var legacyExists bool
+	if err := DB.QueryRow(
+		"SELECT EXISTS(SELECT 1 FROM information_schema.tables WHERE table_schema='public' AND table_name='agent_memories')",
+	).Scan(&legacyExists); err != nil {
+		t.Fatalf("check agent_memories: %v", err)
+	}
+	if legacyExists {
+		t.Error("agent_memories still present at HEAD — the late drop migration did not replay in order")
+	}
+}
+
+// TestIntegration_MigrationReplay_IsIdempotent_DoubleApply guards the 045
+// crash-loop class (cp#429 / project_cp_migration_045_crashloop_idempotency_guard):
+// the runner re-checks every file on every boot, so a non-idempotent
+// migration replays fine once and FAILS on the second pass. Here we run the
+// full chain twice. The second pass must apply ZERO new files (all recorded)
+// and must not error.
+//
+// NOTE: this runs against the SAME populated schema, so it also exercises
+// the "skip already-applied" tracking path end-to-end against real PG, which
+// the sqlmock tests only simulate.
+func TestIntegration_MigrationReplay_IsIdempotent_DoubleApply(t *testing.T) {
+	conn := freshIntegrationDB(t)
+	DB = conn
+
+	if err := RunMigrations(migrationsDir); err != nil {
+		t.Fatalf("first replay failed: %v", err)
+	}
+	var afterFirst int
+	if err := DB.QueryRow("SELECT COUNT(*) FROM schema_migrations").Scan(&afterFirst); err != nil {
+		t.Fatalf("count after first: %v", err)
+	}
+
+	// Second boot: nothing new should apply, and it must not error even
+	// though the runner re-evaluates every file (the 045 failure mode).
+	if err := RunMigrations(migrationsDir); err != nil {
+		t.Fatalf("second replay failed (non-idempotent migration / 045 crash-loop class): %v", err)
+	}
+	var afterSecond int
+	if err := DB.QueryRow("SELECT COUNT(*) FROM schema_migrations").Scan(&afterSecond); err != nil {
+		t.Fatalf("count after second: %v", err)
+	}
+	if afterSecond != afterFirst {
+		t.Errorf("second boot changed schema_migrations from %d to %d — re-application is not a clean no-op", afterFirst, afterSecond)
+	}
+}
@@ -0,0 +1,291 @@
+package db
+
+// redis_test.go — regression coverage for the workspace online-status and
+// URL-resolution Redis layer (redis.go), which previously had NO test.
+//
+// Issue #2150 (SOP rule internal#765). redis.go drives two fleet-wide
+// behaviours that break silently if a key name or TTL drifts:
+//
+//   - online detection: SetOnline / RefreshTTL / IsOnline on `ws:<id>`.
+//     A wrong key prefix or a TTL shorter than the heartbeat interval makes
+//     live workspaces flap to "unreachable — restart" (the exact failure
+//     LivenessTTL=180s was tuned to avoid). A TTL too long hides real
+//     crashes.
+//   - proxy URL resolution: CacheURL / GetCachedURL / CacheInternalURL /
+//     GetCachedInternalURL on `ws:<id>:url` and `ws:<id>:internal_url`.
+//     A2A forwarding resolves the target workspace through these keys; a
+//     prefix collision (e.g. the liveness key overlapping the URL key)
+//     would serve the wrong URL or a literal "online" string as a URL.
+//
+// These tests run against miniredis — an in-process Redis that speaks the
+// real RESP protocol and enforces real TTL/expiry semantics — so they
+// exercise the actual go-redis client calls and key/TTL behaviour, not a
+// mock that rubber-stamps them. miniredis is already a module dependency.
+//
+// Watch-fail intent: change any `ws:%s...` format string in redis.go, or
+// regress LivenessTTL below the heartbeat window, and a test here fails.
+
+import (
+	"context"
+	"testing"
+	"time"
+
+	"github.com/alicebob/miniredis/v2"
+	"github.com/redis/go-redis/v9"
+)
+
+// withMiniRedis spins up an in-process Redis, points the package-global RDB
+// at it, and registers Cleanup. Returns the server handle so tests can drive
+// the clock (FastForward) to exercise TTL expiry deterministically.
+func withMiniRedis(t *testing.T) *miniredis.Miniredis {
+	t.Helper()
+	mr, err := miniredis.Run()
+	if err != nil {
+		t.Fatalf("miniredis.Run: %v", err)
+	}
+	RDB = redis.NewClient(&redis.Options{Addr: mr.Addr()})
+	t.Cleanup(func() {
+		RDB.Close()
+		mr.Close()
+	})
+	return mr
+}
+
+// TestLivenessTTL_ExceedsHeartbeatWindow pins the tuned TTL. The heartbeat
+// loop fires every 30s; LivenessTTL must allow several missed beats (the
+// comment in redis.go targets ~5) so a busy leader starved for 60-120s is
+// not falsely declared dead. 180s = 6×30s. Regressing this toward the old
+// 60s value reintroduces the false-positive restart cycle.
+func TestLivenessTTL_ExceedsHeartbeatWindow(t *testing.T) {
+	const heartbeatInterval = 30 * time.Second
+	const minMissedBeats = 5
+	if LivenessTTL < heartbeatInterval*minMissedBeats {
+		t.Errorf("LivenessTTL=%s is too short: must tolerate >=%d missed %s heartbeats (>= %s) to avoid false-positive restarts",
+			LivenessTTL, minMissedBeats, heartbeatInterval, heartbeatInterval*minMissedBeats)
+	}
+}
+
+// TestSetOnline_KeyAndTTL verifies SetOnline writes the canonical `ws:<id>`
+// key with the value "online" and the LivenessTTL — the exact contract
+// IsOnline and the a2a_proxy reactive check rely on.
+func TestSetOnline_KeyAndTTL(t *testing.T) {
+	mr := withMiniRedis(t)
+	ctx := context.Background()
+	const ws = "ws-abc-123"
+
+	if err := SetOnline(ctx, ws); err != nil {
+		t.Fatalf("SetOnline: %v", err)
+	}
+
+	// Key name must be exactly ws:<id> — not, say, ws:<id>:online.
+	if !mr.Exists("ws:" + ws) {
+		t.Fatalf("expected key %q to exist; keys present: %v", "ws:"+ws, mr.Keys())
+	}
+	got, err := mr.Get("ws:" + ws)
+	if err != nil {
+		t.Fatalf("mr.Get: %v", err)
+	}
+	if got != "online" {
+		t.Errorf("liveness value = %q, want %q", got, "online")
+	}
+
+	// TTL must be the tuned LivenessTTL (allow miniredis's whole-second
+	// granularity).
+	ttl := mr.TTL("ws:" + ws)
+	if ttl != LivenessTTL {
+		t.Errorf("TTL = %s, want %s", ttl, LivenessTTL)
+	}
+}
+
+// TestIsOnline_TrueThenExpires drives the real TTL clock: a freshly-set
+// workspace is online; after the TTL elapses it is offline. This is the
+// behaviour online-detection depends on — proven against real expiry, not
+// asserted from a mock.
+func TestIsOnline_TrueThenExpires(t *testing.T) {
+	mr := withMiniRedis(t)
+	ctx := context.Background()
+	const ws = "ws-expiry"
+
+	if err := SetOnline(ctx, ws); err != nil {
+		t.Fatalf("SetOnline: %v", err)
+	}
+	online, err := IsOnline(ctx, ws)
+	if err != nil {
+		t.Fatalf("IsOnline: %v", err)
+	}
+	if !online {
+		t.Fatal("expected workspace online immediately after SetOnline")
+	}
+
+	// Fast-forward just past the TTL; the liveness key must expire.
+	mr.FastForward(LivenessTTL + time.Second)
+
+	online, err = IsOnline(ctx, ws)
+	if err != nil {
+		t.Fatalf("IsOnline after expiry: %v", err)
+	}
+	if online {
+		t.Error("expected workspace offline after TTL elapsed")
+	}
+}
+
+// TestRefreshTTL_ExtendsLiveness proves a heartbeat (RefreshTTL) keeps a
+// workspace alive across what would otherwise be an expiry. Without the
+// refresh the key expires; with it, IsOnline stays true. Watch-fail: if
+// RefreshTTL targets the wrong key, the refresh is a no-op and this fails.
+func TestRefreshTTL_ExtendsLiveness(t *testing.T) {
+	mr := withMiniRedis(t)
+	ctx := context.Background()
+	const ws = "ws-refresh"
+
+	if err := SetOnline(ctx, ws); err != nil {
+		t.Fatalf("SetOnline: %v", err)
+	}
+	// Advance most of the way to expiry, then heartbeat.
+	mr.FastForward(LivenessTTL - 5*time.Second)
+	if err := RefreshTTL(ctx, ws); err != nil {
+		t.Fatalf("RefreshTTL: %v", err)
+	}
+	// Advance past where the ORIGINAL TTL would have expired. Still online.
+	mr.FastForward(10 * time.Second)
+	online, err := IsOnline(ctx, ws)
+	if err != nil {
+		t.Fatalf("IsOnline: %v", err)
+	}
+	if !online {
+		t.Error("expected workspace still online after RefreshTTL heartbeat")
+	}
+}
+
+// TestIsOnline_UnknownWorkspace returns false (and no error) for a workspace
+// that was never set — the default for a never-registered / long-dead agent.
+func TestIsOnline_UnknownWorkspace(t *testing.T) {
+	withMiniRedis(t)
+	ctx := context.Background()
+	online, err := IsOnline(ctx, "never-seen")
+	if err != nil {
+		t.Fatalf("IsOnline: %v", err)
+	}
+	if online {
+		t.Error("expected unknown workspace to be offline")
+	}
+}
+
+// TestURLCache_RoundTrip pins the `ws:<id>:url` key and its 5-minute TTL,
+// and proves the value round-trips. A2A push resolves the target through
+// this key.
+func TestURLCache_RoundTrip(t *testing.T) {
+	mr := withMiniRedis(t)
+	ctx := context.Background()
+	const ws = "ws-url"
+	const url = "https://ws-url.workspaces.moleculesai.app"
+
+	if err := CacheURL(ctx, ws, url); err != nil {
+		t.Fatalf("CacheURL: %v", err)
+	}
+	got, err := GetCachedURL(ctx, ws)
+	if err != nil {
+		t.Fatalf("GetCachedURL: %v", err)
+	}
+	if got != url {
+		t.Errorf("GetCachedURL = %q, want %q", got, url)
+	}
+	if !mr.Exists("ws:" + ws + ":url") {
+		t.Errorf("expected key %q; present: %v", "ws:"+ws+":url", mr.Keys())
+	}
+	if ttl := mr.TTL("ws:" + ws + ":url"); ttl != 5*time.Minute {
+		t.Errorf("url cache TTL = %s, want 5m", ttl)
+	}
+}
+
+// TestInternalURLCache_RoundTrip pins the `ws:<id>:internal_url` key (the
+// Docker-internal address used for workspace-to-workspace discovery) and its
+// 5-minute TTL.
+func TestInternalURLCache_RoundTrip(t *testing.T) {
+	mr := withMiniRedis(t)
+	ctx := context.Background()
+	const ws = "ws-int"
+	const url = "http://ws-int:8080"
+
+	if err := CacheInternalURL(ctx, ws, url); err != nil {
+		t.Fatalf("CacheInternalURL: %v", err)
+	}
+	got, err := GetCachedInternalURL(ctx, ws)
+	if err != nil {
+		t.Fatalf("GetCachedInternalURL: %v", err)
+	}
+	if got != url {
+		t.Errorf("GetCachedInternalURL = %q, want %q", got, url)
+	}
+	if ttl := mr.TTL("ws:" + ws + ":internal_url"); ttl != 5*time.Minute {
+		t.Errorf("internal url cache TTL = %s, want 5m", ttl)
+	}
+}
+
+// TestKeyNamespacesDoNotCollide is the prefix-collision regression: the
+// liveness key (ws:<id>), the URL key (ws:<id>:url), and the internal-URL
+// key (ws:<id>:internal_url) must be three DISTINCT keys for the same
+// workspace. If a future edit collapses the format strings, IsOnline would
+// read a URL as liveness (or vice versa) and online-detection / proxy
+// resolution would corrupt each other fleet-wide.
+func TestKeyNamespacesDoNotCollide(t *testing.T) {
+	mr := withMiniRedis(t)
+	ctx := context.Background()
+	const ws = "ws-collide"
+
+	if err := SetOnline(ctx, ws); err != nil {
+		t.Fatalf("SetOnline: %v", err)
+	}
+	if err := CacheURL(ctx, ws, "https://public"); err != nil {
+		t.Fatalf("CacheURL: %v", err)
+	}
+	if err := CacheInternalURL(ctx, ws, "http://internal:8080"); err != nil {
+		t.Fatalf("CacheInternalURL: %v", err)
+	}
+
+	// Liveness value must still be "online", NOT a URL.
+	if v, _ := mr.Get("ws:" + ws); v != "online" {
+		t.Errorf("liveness key clobbered by a URL write: got %q", v)
+	}
+	if v, _ := mr.Get("ws:" + ws + ":url"); v != "https://public" {
+		t.Errorf("url key = %q, want https://public", v)
+	}
+	if v, _ := mr.Get("ws:" + ws + ":internal_url"); v != "http://internal:8080" {
+		t.Errorf("internal_url key = %q, want http://internal:8080", v)
+	}
+}
+
+// TestClearWorkspaceKeys_RemovesAllThree proves teardown removes the
+// liveness, URL, and internal-URL keys together — a leaked liveness key
+// after deletion would keep a dead workspace looking online; a leaked URL
+// key would let the proxy forward to a recycled address.
+func TestClearWorkspaceKeys_RemovesAllThree(t *testing.T) {
+	mr := withMiniRedis(t)
+	ctx := context.Background()
+	const ws = "ws-clear"
+
+	if err := SetOnline(ctx, ws); err != nil {
+		t.Fatalf("SetOnline: %v", err)
+	}
+	if err := CacheURL(ctx, ws, "https://x"); err != nil {
+		t.Fatalf("CacheURL: %v", err)
+	}
+	if err := CacheInternalURL(ctx, ws, "http://x:8080"); err != nil {
+		t.Fatalf("CacheInternalURL: %v", err)
+	}
+
+	ClearWorkspaceKeys(ctx, ws)
+
+	for _, k := range []string{"ws:" + ws, "ws:" + ws + ":url", "ws:" + ws + ":internal_url"} {
+		if mr.Exists(k) {
+			t.Errorf("key %q survived ClearWorkspaceKeys", k)
+		}
+	}
+	online, err := IsOnline(ctx, ws)
+	if err != nil {
+		t.Fatalf("IsOnline: %v", err)
+	}
+	if online {
+		t.Error("workspace still online after ClearWorkspaceKeys")
+	}
+}
@@ -225,6 +225,16 @@ func (e *proxyA2AError) Error() string {
 	return "proxy a2a error"
 }

+// EnqueueA2A is a method wrapper around the package-level EnqueueA2A function so
+// that *WorkspaceHandler satisfies the scheduler's A2AProxy interface. The
+// scheduler cannot call the package function directly (it would have to import
+// internal/handlers, but handlers already imports internal/scheduler → import
+// cycle), so it goes through this method on the proxy it already holds. Used by
+// the cron scheduler to durably buffer a tick when the target workspace is busy.
+func (h *WorkspaceHandler) EnqueueA2A(ctx context.Context, workspaceID, callerID string, priority int, body []byte, method, idempotencyKey string, expiresAt *time.Time) (string, int, error) {
+	return EnqueueA2A(ctx, workspaceID, callerID, priority, body, method, idempotencyKey, expiresAt)
+}
+
 // ProxyA2ARequest is the public wrapper for proxyA2ARequest, used by the
 // cron scheduler and other internal callers that need to send A2A messages
 // to workspaces programmatically (not from an HTTP handler).
@@ -97,10 +97,10 @@ type QueuedItem struct {
 // returns the new row ID + current queue depth. Caller MUST have already
 // determined the target is busy — this function does not check.
 //
-// Idempotency: when idempotencyKey is non-empty, the partial unique index
-// `idx_a2a_queue_idempotency` prevents duplicate active rows for the same
-// (workspace_id, idempotency_key). On conflict this returns the existing
-// row's ID so the caller's log still points at the live queue entry.
+// Idempotency: when idempotencyKey is non-empty, a duplicate active enqueue
+// for the same (workspace, key) is collapsed rather than double-buffered. On
+// a duplicate this returns the existing row's ID so the caller's log still
+// points at the live queue entry.
 func EnqueueA2A(
 	ctx context.Context,
 	workspaceID, callerID string,
@@ -129,6 +129,32 @@ func EnqueueA2A(
 		expiresAtArg = *expiresAt
 	}

+	// Supersede any already-expired pending row for this same key before we
+	// insert. The drain path skips expired pending rows, so such a row never
+	// completes on its own — it lingers in the active set and would block the
+	// conflict check below, silently swallowing this fresh enqueue. Retiring
+	// it here (a) frees the active set so the insert below proceeds and (b)
+	// cleans the stale row up so expired rows don't accumulate. Scoped to the
+	// idempotency key so unrelated traffic is untouched.
+	if idempotencyKey != "" {
+		if _, supErr := db.DB.ExecContext(ctx, `
+			UPDATE a2a_queue
+			SET status = 'dropped',
+			    last_error = 'superseded: expired before drain; replaced by a fresh enqueue'
+			WHERE workspace_id = $1
+			  AND idempotency_key = $2
+			  AND status = 'queued'
+			  AND expires_at IS NOT NULL
+			  AND expires_at <= now()
+		`, workspaceID, idempotencyKey); supErr != nil {
+			// Non-fatal: if the cleanup fails we still attempt the insert. Worst
+			// case the conflict path returns the (stale) existing row's id, which
+			// is the pre-fix behaviour — no new breakage introduced here.
+			log.Printf("A2AQueue: supersede-expired cleanup failed for workspace %s key %s: %v",
+				workspaceID, idempotencyKey, supErr)
+		}
+	}
+
 	// INSERT ... ON CONFLICT DO NOTHING RETURNING id. The conflict target
 	// must reference the partial unique INDEX columns + WHERE clause directly
 	// (Postgres can't reference partial unique indexes by name in
@@ -246,20 +272,6 @@ func MarkQueueItemFailed(ctx context.Context, id, errMsg string) {
 	}
 }

-// QueueDepth returns the number of currently-queued (not dispatched/completed)
-// items for a workspace. Used by the busy-return response body so callers
-// can see how many ahead of them.
-func QueueDepth(ctx context.Context, workspaceID string) int {
-	var n int
-	if err := db.DB.QueryRowContext(ctx,
-		`SELECT COUNT(*) FROM a2a_queue WHERE workspace_id = $1 AND status = 'queued'`,
-		workspaceID,
-	).Scan(&n); err != nil {
-		log.Printf("A2AQueue: QueueDepth query failed for workspace %s: %v", workspaceID, err)
-	}
-	return n
-}
-
 // DropStaleQueueItems marks queued items older than maxAge as 'dropped' with a
 // system-generated reason so PM agents stop processing stale post-incident noise.
 // Called with a workspaceID to scope cleanup to one workspace, or empty to sweep
@@ -0,0 +1,160 @@
+package handlers
+
+// a2a_queue_enqueue_expired_test.go — regression for CR3 RC 9853.
+//
+// Bug: a pending buffered tick that expires before the drain reaches it is
+// skipped by the drain (it filters out expired pending rows) yet still occupies
+// the active set the idempotency check guards. A later tick for the SAME key
+// would then collapse onto that dead row and be silently swallowed — the exact
+// drop the busy-buffer path was built to prevent.
+//
+// Fix: EnqueueA2A retires any already-expired pending row for the key BEFORE the
+// insert, so the fresh tick buffers (and the stale row is cleaned up) instead of
+// being dropped.
+//
+// These tests use the QueryMatcherEqual mock (setupTestDBForQueueTests) so the
+// SQL strings below must match the handler's queries verbatim.
+
+import (
+	"context"
+	"testing"
+	"time"
+
+	"github.com/DATA-DOG/go-sqlmock"
+)
+
+const (
+	enqWorkspaceID = "ws-enq-expired"
+	enqKey         = "sched-aaaa-bbbb" // schedule_id used as idempotency key
+	enqBody        = `{"method":"message/send"}`
+	enqMethod      = "message/send"
+)
+
+// expectSupersedeExpired registers the cleanup UPDATE EnqueueA2A issues before
+// the insert when an idempotency key is present. rowsRetired is how many expired
+// pending rows the UPDATE claims to have dropped.
+func expectSupersedeExpired(mock sqlmock.Sqlmock, workspaceID, key string, rowsRetired int64) {
+	mock.ExpectExec(`
+			UPDATE a2a_queue
+			SET status = 'dropped',
+			    last_error = 'superseded: expired before drain; replaced by a fresh enqueue'
+			WHERE workspace_id = $1
+			  AND idempotency_key = $2
+			  AND status = 'queued'
+			  AND expires_at IS NOT NULL
+			  AND expires_at <= now()
+		`).
+		WithArgs(workspaceID, key).
+		WillReturnResult(sqlmock.NewResult(0, rowsRetired))
+}
+
+// expectInsert registers the INSERT ... ON CONFLICT DO NOTHING RETURNING id.
+// newID is the id the insert returns (non-conflict / fresh enqueue path).
+func expectInsert(mock sqlmock.Sqlmock, newID string) {
+	mock.ExpectQuery(`
+		INSERT INTO a2a_queue (workspace_id, caller_id, priority, body, method, idempotency_key, expires_at)
+		VALUES ($1, $2, $3, $4::jsonb, $5, $6, $7)
+		ON CONFLICT (workspace_id, idempotency_key)
+			WHERE idempotency_key IS NOT NULL AND status IN ('queued','dispatched')
+			DO NOTHING
+		RETURNING id
+	`).WillReturnRows(sqlmock.NewRows([]string{"id"}).AddRow(newID))
+}
+
+// expectDepth registers the trailing queue-depth count query.
+func expectDepth(mock sqlmock.Sqlmock, workspaceID string, depth int) {
+	mock.ExpectQuery(`
+		SELECT COUNT(*) FROM a2a_queue
+		WHERE workspace_id = $1 AND status = 'queued'
+	`).WithArgs(workspaceID).
+		WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(depth))
+}
+
+// TestEnqueueA2A_ExpiredRowDoesNotBlockFreshTick is the core CR3 regression:
+// an existing expired pending row for a schedule's key must NOT cause the next
+// tick's enqueue to be dropped. The expired row is retired first, then the
+// fresh tick inserts and returns a NEW id.
+func TestEnqueueA2A_ExpiredRowDoesNotBlockFreshTick(t *testing.T) {
+	mock := setupTestDBForQueueTests(t)
+
+	// One expired pending row exists for this key and gets retired.
+	expectSupersedeExpired(mock, enqWorkspaceID, enqKey, 1)
+	// With the active set cleared, the insert proceeds (no conflict) → new id.
+	const freshID = "fresh-tick-id"
+	expectInsert(mock, freshID)
+	expectDepth(mock, enqWorkspaceID, 1)
+
+	nextRun := time.Now().Add(30 * time.Second)
+	id, depth, err := EnqueueA2A(
+		context.Background(), enqWorkspaceID, "", PriorityTask,
+		[]byte(enqBody), enqMethod, enqKey, &nextRun,
+	)
+	if err != nil {
+		t.Fatalf("EnqueueA2A returned error: %v", err)
+	}
+	if id != freshID {
+		t.Errorf("expected the fresh tick to enqueue with a new id %q, got %q "+
+			"(an expired row must not swallow the new tick)", freshID, id)
+	}
+	if depth != 1 {
+		t.Errorf("expected depth 1, got %d", depth)
+	}
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Errorf("unmet sqlmock expectations: %v", err)
+	}
+}
+
+// TestEnqueueA2A_NoExpiredRow_NormalEnqueue: when no expired row exists the
+// supersede UPDATE simply affects zero rows and the enqueue proceeds normally.
+func TestEnqueueA2A_NoExpiredRow_NormalEnqueue(t *testing.T) {
+	mock := setupTestDBForQueueTests(t)
+
+	expectSupersedeExpired(mock, enqWorkspaceID, enqKey, 0) // nothing to retire
+	const newID = "new-id"
+	expectInsert(mock, newID)
+	expectDepth(mock, enqWorkspaceID, 2)
+
+	nextRun := time.Now().Add(30 * time.Second)
+	id, depth, err := EnqueueA2A(
+		context.Background(), enqWorkspaceID, "", PriorityTask,
+		[]byte(enqBody), enqMethod, enqKey, &nextRun,
+	)
+	if err != nil {
+		t.Fatalf("EnqueueA2A returned error: %v", err)
+	}
+	if id != newID {
+		t.Errorf("expected id %q, got %q", newID, id)
+	}
+	if depth != 2 {
+		t.Errorf("expected depth 2, got %d", depth)
+	}
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Errorf("unmet sqlmock expectations: %v", err)
+	}
+}
+
+// TestEnqueueA2A_NoKey_SkipsSupersede: with no idempotency key there is no
+// active-set conflict to guard, so the supersede cleanup is skipped entirely
+// and only the insert + depth queries run.
+func TestEnqueueA2A_NoKey_SkipsSupersede(t *testing.T) {
+	mock := setupTestDBForQueueTests(t)
+
+	// No expectSupersedeExpired — it must NOT be issued when key is empty.
+	const newID = "no-key-id"
+	expectInsert(mock, newID)
+	expectDepth(mock, enqWorkspaceID, 1)
+
+	id, _, err := EnqueueA2A(
+		context.Background(), enqWorkspaceID, "", PriorityTask,
+		[]byte(enqBody), enqMethod, "", nil,
+	)
+	if err != nil {
+		t.Fatalf("EnqueueA2A returned error: %v", err)
+	}
+	if id != newID {
+		t.Errorf("expected id %q, got %q", newID, id)
+	}
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Errorf("unmet sqlmock expectations: %v", err)
+	}
+}
@@ -727,6 +727,25 @@ func (h *RegistryHandler) Heartbeat(c *gin.Context) {
 		return
 	}

+	// #2421: backfill agent_card when the initial register failed and the
+	// heartbeat carries it. Only writes when NULL — never overwrites a
+	// reconciled or updated card. This is the recovery path for fast-cloud
+	// workspaces whose DNS wasn't ready at first register.
+	if len(payload.AgentCard) > 0 {
+		res, err := db.DB.ExecContext(ctx, `
+			UPDATE workspaces
+			SET agent_card = $2
+			WHERE id = $1 AND agent_card IS NULL
+		`, payload.WorkspaceID, payload.AgentCard)
+		if err != nil {
+			log.Printf("Registry heartbeat: agent_card backfill failed for %s: %v", payload.WorkspaceID, err)
+		} else {
+			if rows, _ := res.RowsAffected(); rows > 0 {
+				log.Printf("Registry heartbeat: backfilled agent_card for %s (initial register had failed)", payload.WorkspaceID)
+			}
+		}
+	}
+
 	// Refresh Redis TTL
 	if err := db.RefreshTTL(ctx, payload.WorkspaceID); err != nil {
 		log.Printf("Heartbeat redis error: %v", err)
@@ -0,0 +1,539 @@
+//go:build integration
+// +build integration
+
+// registry_auth_integration_test.go — REAL Postgres integration tests for
+// the registry-auth + cross-tenant security boundary (issue #2148).
+//
+// Run with:
+//
+//	docker run --rm -d --name pg-integration \
+//	  -e POSTGRES_PASSWORD=test -e POSTGRES_DB=molecule \
+//	  -p 55432:5432 postgres:15-alpine
+//	sleep 4
+//	# apply migrations 001 (workspaces), 020 (workspace_auth_tokens),
+//	# 035/036 (org_api_tokens + org_id), 043 (status enum), 044
+//	# (platform_inbound_secret), 045 (delivery_mode) — CI applies the
+//	# full migrations/ set in lexicographic order (apply-all-or-skip).
+//	cd workspace-server
+//	INTEGRATION_DB_URL="postgres://postgres:test@localhost:55432/molecule?sslmode=disable" \
+//	  go test -tags=integration ./internal/handlers/ -run '^TestIntegration_(RegistryRowState|WSAuth|CanCommunicate|OrgToken)'
+//
+// CI (.gitea/workflows/handlers-postgres-integration.yml) runs this on
+// every PR that touches workspace-server/internal/{handlers,wsauth,
+// registry,orgtoken}/** OR workspace-server/migrations/**. The
+// detect-changes `handlers-postgres` profile was widened to include the
+// registry + orgtoken packages in the same PR that added this file so a
+// regression in CanCommunicate / orgtoken.Revoke actually triggers the
+// suite (#2148).
+//
+// Why these are NOT plain unit tests
+// ----------------------------------
+// The strict-sqlmock unit tests in wsauth/tokens_test.go,
+// orgtoken/tokens_test.go, and registry/access_test.go pin which SQL
+// statements fire — they are fast and let us iterate without a DB. But
+// sqlmock asserts the SQL TEXT, not that a real Postgres ENFORCES the
+// security predicate. The whole value of this package is the
+// cross-tenant non-leak boundary:
+//
+//   - wsauth.ValidateToken binds a bearer to ONE workspace_id; sqlmock
+//     cannot prove the JOIN on workspaces + the workspaceID equality
+//     actually rejects a token replayed against a different workspace,
+//     or a token whose workspace was soft-removed.
+//   - registry.CanCommunicate walks the parent_id chain in real rows;
+//     sqlmock returns canned rows so it can never catch a query that
+//     leaks a SIBLING under a different org root, or a self→self / cross-
+//     tenant decision that depends on the actual stored parent_id.
+//   - orgtoken.Revoke / Validate depend on the partial-index +
+//     revoked_at IS NULL predicate landing the row state; sqlmock is
+//     satisfied by "an UPDATE fired".
+//   - the registry register/heartbeat #73 guard
+//     (WHERE workspaces.status IS DISTINCT FROM 'removed' /
+//     status != 'removed') is a ROW-STATE invariant: a late
+//     register/heartbeat MUST NOT resurrect a soft-deleted tombstone.
+//     sqlmock cannot observe that the row stayed 'removed'.
+//
+// These tests close those gaps by booting a real Postgres, running the
+// production functions (and, for register/heartbeat, replaying the exact
+// production statement documented at registry.go:393 / registry.go:604),
+// and SELECTing the row to verify the observable state.
+
+package handlers
+
+import (
+	"context"
+	"database/sql"
+	"os"
+	"testing"
+	"time"
+
+	mdb "git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/db"
+	"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/orgtoken"
+	"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/registry"
+	"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/wsauth"
+	_ "github.com/lib/pq"
+)
+
+// integrationAuthDB opens a connection from $INTEGRATION_DB_URL (skipping
+// the test if unset), wipes the registry-auth tables for isolation, and
+// hot-swaps the package-level mdb.DB so production functions that read the
+// global (registry.CanCommunicate → registry.getWorkspaceRef) see this
+// same connection. Restores the previous global + closes the conn via
+// t.Cleanup.
+//
+// NOT SAFE FOR t.Parallel() — it mutates the package global and owns the
+// tables for the duration of the test. This mirrors integrationDB in
+// delegation_ledger_integration_test.go but wipes the auth tables (not
+// delegations) and is kept separate so each suite's wipe step is local.
+//
+// Wipe order respects FKs: workspace_auth_tokens + org_api_tokens
+// reference workspaces, so they go first. org_api_tokens.org_id is
+// ON DELETE SET NULL and workspace_auth_tokens.workspace_id is
+// ON DELETE CASCADE, but an explicit ordered DELETE keeps the intent
+// obvious and avoids leaving rows behind if the FK actions ever change.
+func integrationAuthDB(t *testing.T) *sql.DB {
+	t.Helper()
+	url := os.Getenv("INTEGRATION_DB_URL")
+	if url == "" {
+		t.Skip("INTEGRATION_DB_URL not set; skipping (local devs: see file header)")
+	}
+	conn, err := sql.Open("postgres", url)
+	if err != nil {
+		t.Fatalf("open: %v", err)
+	}
+	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+	defer cancel()
+	if err := conn.PingContext(ctx); err != nil {
+		t.Fatalf("ping: %v", err)
+	}
+	for _, tbl := range []string{"workspace_auth_tokens", "org_api_tokens", "workspaces"} {
+		if _, err := conn.ExecContext(ctx, "DELETE FROM "+tbl); err != nil {
+			t.Fatalf("cleanup %s: %v", tbl, err)
+		}
+	}
+	prev := mdb.DB
+	mdb.DB = conn
+	t.Cleanup(func() {
+		mdb.DB = prev
+		conn.Close()
+	})
+	return conn
+}
+
+// insertWorkspace creates a workspace row with the given status and
+// optional parent, returning the DB-generated UUID. parentID may be the
+// empty string for a root-level workspace (parent_id IS NULL). status
+// must be a valid workspaces.status enum value (043 migration).
+func insertWorkspace(t *testing.T, conn *sql.DB, name, status, parentID string) string {
+	t.Helper()
+	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+	defer cancel()
+	var parent any
+	if parentID != "" {
+		parent = parentID
+	}
+	var id string
+	err := conn.QueryRowContext(ctx, `
+		INSERT INTO workspaces (name, status, parent_id, delivery_mode)
+		VALUES ($1, $2, $3, 'push')
+		RETURNING id
+	`, name, status, parent).Scan(&id)
+	if err != nil {
+		t.Fatalf("insertWorkspace(%s): %v", name, err)
+	}
+	return id
+}
+
+// statusOf reads a workspace row's current status, failing the test if
+// the row is gone (a register/heartbeat must never DELETE the row).
+func statusOf(t *testing.T, conn *sql.DB, id string) string {
+	t.Helper()
+	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+	defer cancel()
+	var status string
+	err := conn.QueryRowContext(ctx, `SELECT status FROM workspaces WHERE id = $1`, id).Scan(&status)
+	if err != nil {
+		t.Fatalf("statusOf(%s): %v", id, err)
+	}
+	return status
+}
+
+// ---------------------------------------------------------------------------
+// 1 — registry register/heartbeat row-state: the #73 tombstone guard.
+//
+// Watch-fail intent: drop `WHERE workspaces.status IS DISTINCT FROM
+// 'removed'` from the upsert (or `AND status != 'removed'` from the
+// heartbeat) and a late register/heartbeat resurrects a soft-deleted
+// workspace back to 'online' — the exact bulk-delete straggler bug the
+// guard fixed. These tests replay the EXACT production statements
+// (registry.go:393 upsert / registry.go:604 heartbeat) so a regression
+// there is caught against a real row, not just an sqlmock SQL-text diff.
+// ---------------------------------------------------------------------------
+
+// registerUpsertSQL mirrors RegistryHandler.Register's upsert at
+// workspace-server/internal/handlers/registry.go:393. Kept in lockstep
+// with that statement; CR2/CI must confirm it still matches the handler.
+const registerUpsertSQL = `
+	INSERT INTO workspaces (id, name, url, agent_card, status, last_heartbeat_at, delivery_mode)
+	VALUES ($1, $2, $3, $4::jsonb, 'online', now(), $5)
+	ON CONFLICT (id) DO UPDATE SET
+		url = CASE
+			WHEN workspaces.url LIKE 'http://127.0.0.1%' THEN workspaces.url
+			ELSE EXCLUDED.url
+		END,
+		agent_card = EXCLUDED.agent_card,
+		status = 'online',
+		last_heartbeat_at = now(),
+		delivery_mode = EXCLUDED.delivery_mode,
+		updated_at = now()
+	WHERE workspaces.status IS DISTINCT FROM 'removed'
+`
+
+// heartbeatUpdateSQL mirrors RegistryHandler.Heartbeat's zero-spend
+// branch at workspace-server/internal/handlers/registry.go:604, reduced
+// to the columns guaranteed present by the base migration set so the
+// test does not depend on optional columns the apply-all-or-skip CI step
+// may not have landed. The security-relevant clause — the
+// `AND status != 'removed'` guard — is preserved verbatim.
+const heartbeatUpdateSQL = `
+	UPDATE workspaces SET
+		last_heartbeat_at = now(),
+		updated_at        = now()
+	WHERE id = $1 AND status != 'removed'
+`
+
+func TestIntegration_RegistryRowState_RegisterDoesNotResurrectRemoved(t *testing.T) {
+	conn := integrationAuthDB(t)
+	ctx := context.Background()
+
+	id := insertWorkspace(t, conn, "tombstoned-ws", "removed", "")
+
+	// A late register for a soft-deleted workspace.
+	if _, err := conn.ExecContext(ctx, registerUpsertSQL,
+		id, id, "https://agent.example.com", `{"name":"x"}`, "push"); err != nil {
+		t.Fatalf("register upsert: %v", err)
+	}
+
+	if got := statusOf(t, conn, id); got != "removed" {
+		t.Fatalf("removed workspace was resurrected by register: status=%q, want 'removed' (#73 guard regressed)", got)
+	}
+}
+
+func TestIntegration_RegistryRowState_RegisterUpsertsLiveWorkspaceToOnline(t *testing.T) {
+	conn := integrationAuthDB(t)
+	ctx := context.Background()
+
+	// A provisioning workspace registering for the first heartbeat should
+	// flip to online — proves the guard does NOT over-block live rows.
+	id := insertWorkspace(t, conn, "live-ws", "provisioning", "")
+
+	if _, err := conn.ExecContext(ctx, registerUpsertSQL,
+		id, id, "https://agent.example.com", `{"name":"x"}`, "push"); err != nil {
+		t.Fatalf("register upsert: %v", err)
+	}
+
+	if got := statusOf(t, conn, id); got != "online" {
+		t.Fatalf("live workspace register: status=%q, want 'online'", got)
+	}
+}
+
+func TestIntegration_RegistryRowState_HeartbeatDoesNotResurrectRemoved(t *testing.T) {
+	conn := integrationAuthDB(t)
+	ctx := context.Background()
+
+	id := insertWorkspace(t, conn, "tombstoned-ws", "removed", "")
+
+	if _, err := conn.ExecContext(ctx, heartbeatUpdateSQL, id); err != nil {
+		t.Fatalf("heartbeat update: %v", err)
+	}
+
+	if got := statusOf(t, conn, id); got != "removed" {
+		t.Fatalf("removed workspace mutated by heartbeat: status=%q, want 'removed' (#73 guard regressed)", got)
+	}
+
+	// And last_heartbeat_at must NOT have been bumped on the tombstone —
+	// a refreshed heartbeat would confuse the liveness monitor.
+	var hb sql.NullTime
+	if err := conn.QueryRowContext(ctx,
+		`SELECT last_heartbeat_at FROM workspaces WHERE id = $1`, id).Scan(&hb); err != nil {
+		t.Fatalf("read last_heartbeat_at: %v", err)
+	}
+	if hb.Valid {
+		t.Fatalf("removed workspace got last_heartbeat_at bumped by heartbeat: %v (#73 guard regressed)", hb.Time)
+	}
+}
+
+func TestIntegration_RegistryRowState_HeartbeatUpdatesLiveWorkspace(t *testing.T) {
+	conn := integrationAuthDB(t)
+	ctx := context.Background()
+
+	id := insertWorkspace(t, conn, "live-ws", "online", "")
+
+	if _, err := conn.ExecContext(ctx, heartbeatUpdateSQL, id); err != nil {
+		t.Fatalf("heartbeat update: %v", err)
+	}
+
+	var hb sql.NullTime
+	if err := conn.QueryRowContext(ctx,
+		`SELECT last_heartbeat_at FROM workspaces WHERE id = $1`, id).Scan(&hb); err != nil {
+		t.Fatalf("read last_heartbeat_at: %v", err)
+	}
+	if !hb.Valid {
+		t.Fatalf("live workspace heartbeat did NOT bump last_heartbeat_at")
+	}
+	if got := statusOf(t, conn, id); got != "online" {
+		t.Fatalf("live workspace heartbeat changed status unexpectedly: %q", got)
+	}
+}
+
+// ---------------------------------------------------------------------------
+// 2 — wsauth.ValidateToken A↔B binding (the cross-tenant non-leak boundary).
+//
+// Watch-fail intent: drop the `workspaceID != expectedWorkspaceID` check
+// (or the JOIN's `w.status != 'removed'`) and a workspace-A token would
+// authenticate workspace B, or a token from a soft-removed workspace
+// would stay live. sqlmock cannot prove the JOIN rejects either.
+// ---------------------------------------------------------------------------
+
+func TestIntegration_WSAuth_TokenBoundToIssuingWorkspace(t *testing.T) {
+	conn := integrationAuthDB(t)
+	ctx := context.Background()
+
+	wsA := insertWorkspace(t, conn, "ws-A", "online", "")
+	wsB := insertWorkspace(t, conn, "ws-B", "online", "")
+
+	plaintext, err := wsauth.IssueToken(ctx, conn, wsA)
+	if err != nil {
+		t.Fatalf("IssueToken: %v", err)
+	}
+
+	// Correct binding: token validates for its own workspace.
+	if err := wsauth.ValidateToken(ctx, conn, wsA, plaintext); err != nil {
+		t.Fatalf("ValidateToken(A, tokenA): want nil, got %v", err)
+	}
+
+	// Cross-workspace replay: A's token MUST NOT authenticate B.
+	if err := wsauth.ValidateToken(ctx, conn, wsB, plaintext); err != wsauth.ErrInvalidToken {
+		t.Fatalf("ValidateToken(B, tokenA): want ErrInvalidToken, got %v (cross-tenant binding regressed)", err)
+	}
+
+	// WorkspaceFromToken resolves the OWNING workspace, never B.
+	owner, err := wsauth.WorkspaceFromToken(ctx, conn, plaintext)
+	if err != nil {
+		t.Fatalf("WorkspaceFromToken: %v", err)
+	}
+	if owner != wsA {
+		t.Fatalf("WorkspaceFromToken: got %q, want %q (token rebinding regressed)", owner, wsA)
+	}
+}
+
+func TestIntegration_WSAuth_TokenOfRemovedWorkspaceRejected(t *testing.T) {
+	conn := integrationAuthDB(t)
+	ctx := context.Background()
+
+	ws := insertWorkspace(t, conn, "ws-soon-removed", "online", "")
+	plaintext, err := wsauth.IssueToken(ctx, conn, ws)
+	if err != nil {
+		t.Fatalf("IssueToken: %v", err)
+	}
+	// Sanity: live before removal.
+	if err := wsauth.ValidateToken(ctx, conn, ws, plaintext); err != nil {
+		t.Fatalf("pre-removal ValidateToken: want nil, got %v", err)
+	}
+
+	// Soft-remove the workspace (tombstone) — the token row is NOT
+	// touched, so only the JOIN's status filter can reject it.
+	if _, err := conn.ExecContext(ctx,
+		`UPDATE workspaces SET status = 'removed' WHERE id = $1`, ws); err != nil {
+		t.Fatalf("soft-remove: %v", err)
+	}
+
+	if err := wsauth.ValidateToken(ctx, conn, ws, plaintext); err != wsauth.ErrInvalidToken {
+		t.Fatalf("ValidateToken after soft-remove: want ErrInvalidToken, got %v (removed-workspace JOIN filter regressed)", err)
+	}
+	if _, err := wsauth.WorkspaceFromToken(ctx, conn, plaintext); err != wsauth.ErrInvalidToken {
+		t.Fatalf("WorkspaceFromToken after soft-remove: want ErrInvalidToken, got %v", err)
+	}
+}
+
+func TestIntegration_WSAuth_RevokeAllForWorkspaceKillsToken(t *testing.T) {
+	conn := integrationAuthDB(t)
+	ctx := context.Background()
+
+	ws := insertWorkspace(t, conn, "ws-rotate", "online", "")
+	plaintext, err := wsauth.IssueToken(ctx, conn, ws)
+	if err != nil {
+		t.Fatalf("IssueToken: %v", err)
+	}
+	if err := wsauth.ValidateToken(ctx, conn, ws, plaintext); err != nil {
+		t.Fatalf("pre-revoke ValidateToken: %v", err)
+	}
+
+	if err := wsauth.RevokeAllForWorkspace(ctx, conn, ws); err != nil {
+		t.Fatalf("RevokeAllForWorkspace: %v", err)
+	}
+
+	if err := wsauth.ValidateToken(ctx, conn, ws, plaintext); err != wsauth.ErrInvalidToken {
+		t.Fatalf("ValidateToken after revoke: want ErrInvalidToken, got %v (revoked_at filter regressed)", err)
+	}
+}
+
+// ---------------------------------------------------------------------------
+// 3 — registry.CanCommunicate parent_id hierarchy (cross-tenant non-chatter).
+//
+// Topology (two distinct tenant trees):
+//
+//	rootX (org root, parent_id NULL)            rootY (org root, parent_id NULL)
+//	  ├── leadX                                   └── leadY
+//	  │     ├── engA  (leaf)
+//	  │     └── engB  (leaf, sibling of engA)
+//
+// Watch-fail intent: relax the parent_id scoping and a leaf under rootX
+// could message a leaf under rootY (cross-tenant leak), or two org roots
+// (both parent_id NULL) would be treated as siblings. CanCommunicate
+// reads the package-global db.DB, which integrationAuthDB hot-swaps to
+// the test connection.
+// ---------------------------------------------------------------------------
+
+func TestIntegration_CanCommunicate_HierarchyAndCrossTenantIsolation(t *testing.T) {
+	conn := integrationAuthDB(t)
+
+	rootX := insertWorkspace(t, conn, "rootX", "online", "")
+	leadX := insertWorkspace(t, conn, "leadX", "online", rootX)
+	engA := insertWorkspace(t, conn, "engA", "online", leadX)
+	engB := insertWorkspace(t, conn, "engB", "online", leadX)
+
+	rootY := insertWorkspace(t, conn, "rootY", "online", "")
+	leadY := insertWorkspace(t, conn, "leadY", "online", rootY)
+
+	cases := []struct {
+		name   string
+		caller string
+		target string
+		want   bool
+	}{
+		{"self to self", engA, engA, true},
+		{"siblings (same parent leadX)", engA, engB, true},
+		{"direct parent to child", leadX, engA, true},
+		{"direct child to parent", engA, leadX, true},
+		{"distant ancestor to descendant", rootX, engA, true},
+		{"distant descendant to ancestor", engA, rootX, true},
+		// Cross-tenant non-leak: nothing under rootX may talk to rootY tree.
+		{"cross-tenant leaf to leaf", engA, leadY, false},
+		{"cross-tenant leaf to other root", engA, rootY, false},
+		{"cross-tenant root to root", rootX, rootY, false},
+		// Two org roots both have parent_id NULL — must NOT be siblings.
+		{"two org roots are not siblings", rootX, rootY, false},
+	}
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			if got := registry.CanCommunicate(tc.caller, tc.target); got != tc.want {
+				t.Fatalf("CanCommunicate(%s, %s) = %v, want %v", tc.caller, tc.target, got, tc.want)
+			}
+		})
+	}
+}
+
+// ---------------------------------------------------------------------------
+// 4 — orgtoken revoke / validate row-state.
+//
+// Watch-fail intent: drop `revoked_at IS NULL` from Validate (or the
+// `AND revoked_at IS NULL` from Revoke's idempotency) and a revoked org-
+// admin token would keep authenticating, or a second revoke would report
+// success on an already-dead token. sqlmock is satisfied by "an UPDATE
+// fired" and cannot observe the row no longer authenticates.
+// ---------------------------------------------------------------------------
+
+func TestIntegration_OrgToken_RevokeStopsValidation(t *testing.T) {
+	conn := integrationAuthDB(t)
+	ctx := context.Background()
+
+	// org_id references workspaces(id); anchor the token to a real org root.
+	org := insertWorkspace(t, conn, "org-root", "online", "")
+
+	plaintext, id, err := orgtoken.Issue(ctx, conn, "ci-token", "tester", org)
+	if err != nil {
+		t.Fatalf("Issue: %v", err)
+	}
+
+	// Live token validates and reports its org anchor.
+	gotID, _, gotOrg, err := orgtoken.Validate(ctx, conn, plaintext)
+	if err != nil {
+		t.Fatalf("Validate (live): %v", err)
+	}
+	if gotID != id {
+		t.Fatalf("Validate id = %q, want %q", gotID, id)
+	}
+	if gotOrg != org {
+		t.Fatalf("Validate org_id = %q, want %q (org-anchor regressed)", gotOrg, org)
+	}
+
+	// First revoke flips live → revoked.
+	transitioned, err := orgtoken.Revoke(ctx, conn, id)
+	if err != nil {
+		t.Fatalf("Revoke: %v", err)
+	}
+	if !transitioned {
+		t.Fatalf("Revoke: want true (live→revoked), got false")
+	}
+
+	// Revoked token MUST NOT validate.
+	if _, _, _, err := orgtoken.Validate(ctx, conn, plaintext); err != orgtoken.ErrInvalidToken {
+		t.Fatalf("Validate after revoke: want ErrInvalidToken, got %v (revoked_at filter regressed)", err)
+	}
+
+	// Idempotent re-revoke reports false (already revoked), not an error.
+	transitioned2, err := orgtoken.Revoke(ctx, conn, id)
+	if err != nil {
+		t.Fatalf("re-Revoke: %v", err)
+	}
+	if transitioned2 {
+		t.Fatalf("re-Revoke: want false (already revoked), got true (idempotency guard regressed)")
+	}
+}
+
+func TestIntegration_OrgToken_ListExcludesRevoked(t *testing.T) {
+	conn := integrationAuthDB(t)
+	ctx := context.Background()
+
+	org := insertWorkspace(t, conn, "org-root", "online", "")
+
+	_, liveID, err := orgtoken.Issue(ctx, conn, "live", "tester", org)
+	if err != nil {
+		t.Fatalf("Issue live: %v", err)
+	}
+	_, deadID, err := orgtoken.Issue(ctx, conn, "dead", "tester", org)
+	if err != nil {
+		t.Fatalf("Issue dead: %v", err)
+	}
+	if _, err := orgtoken.Revoke(ctx, conn, deadID); err != nil {
+		t.Fatalf("Revoke dead: %v", err)
+	}
+
+	// List must return only the live token (exercises the real
+	// COALESCE(org_id::text,'') cast that sqlmock can't type-check —
+	// see the comment in orgtoken.List).
+	tokens, err := orgtoken.List(ctx, conn)
+	if err != nil {
+		t.Fatalf("List: %v", err)
+	}
+	if len(tokens) != 1 {
+		t.Fatalf("List returned %d tokens, want 1 (revoked-exclusion regressed)", len(tokens))
+	}
+	if tokens[0].ID != liveID {
+		t.Fatalf("List returned id %q, want live id %q", tokens[0].ID, liveID)
+	}
+	if tokens[0].OrgID != org {
+		t.Fatalf("List org_id = %q, want %q", tokens[0].OrgID, org)
+	}
+
+	// HasAnyLive is true with one live token, false once it's revoked.
+	if ok, err := orgtoken.HasAnyLive(ctx, conn); err != nil || !ok {
+		t.Fatalf("HasAnyLive (one live): ok=%v err=%v, want true,nil", ok, err)
+	}
+	if _, err := orgtoken.Revoke(ctx, conn, liveID); err != nil {
+		t.Fatalf("Revoke live: %v", err)
+	}
+	if ok, err := orgtoken.HasAnyLive(ctx, conn); err != nil || ok {
+		t.Fatalf("HasAnyLive (none live): ok=%v err=%v, want false,nil", ok, err)
+	}
+}
@@ -755,6 +755,88 @@ func TestHeartbeat_SkipsRemovedRows(t *testing.T) {
 	}
 }

+// ==================== Heartbeat — agent_card backfill (#2421) ====================
+
+func TestHeartbeatHandler_BackfillsAgentCard_WhenNull(t *testing.T) {
+	mock := setupTestDB(t)
+	setupTestRedis(t)
+	broadcaster := newTestBroadcaster()
+	handler := NewRegistryHandler(broadcaster)
+
+	mock.ExpectQuery("SELECT COALESCE\\(current_task").
+		WithArgs("ws-nocard").
+		WillReturnRows(sqlmock.NewRows([]string{"current_task", "monthly_spend"}).AddRow("", 0))
+
+	mock.ExpectExec("UPDATE workspaces SET").
+		WithArgs("ws-nocard", 0.0, "", 0, 0, "").
+		WillReturnResult(sqlmock.NewResult(0, 1))
+
+	// #2421: backfill agent_card when heartbeat carries it and DB row is NULL
+	mock.ExpectExec("UPDATE workspaces SET agent_card =").
+		WithArgs("ws-nocard", sqlmock.AnyArg()).
+		WillReturnResult(sqlmock.NewResult(0, 1))
+
+	mock.ExpectQuery("SELECT status FROM workspaces WHERE id =").
+		WithArgs("ws-nocard").
+		WillReturnRows(sqlmock.NewRows([]string{"status"}).AddRow(models.StatusOnline))
+
+	w := httptest.NewRecorder()
+	c, _ := gin.CreateTestContext(w)
+
+	body := `{"workspace_id":"ws-nocard","agent_card":{"name":"backfilled"}}`
+	c.Request = httptest.NewRequest("POST", "/registry/heartbeat", bytes.NewBufferString(body))
+	c.Request.Header.Set("Content-Type", "application/json")
+
+	handler.Heartbeat(c)
+
+	if w.Code != http.StatusOK {
+		t.Errorf("expected status 200, got %d: %s", w.Code, w.Body.String())
+	}
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Errorf("unmet sqlmock expectations: %v", err)
+	}
+}
+
+func TestHeartbeatHandler_SkipsAgentCardBackfill_WhenAlreadySet(t *testing.T) {
+	mock := setupTestDB(t)
+	setupTestRedis(t)
+	broadcaster := newTestBroadcaster()
+	handler := NewRegistryHandler(broadcaster)
+
+	mock.ExpectQuery("SELECT COALESCE\\(current_task").
+		WithArgs("ws-hascard").
+		WillReturnRows(sqlmock.NewRows([]string{"current_task", "monthly_spend"}).AddRow("", 0))
+
+	mock.ExpectExec("UPDATE workspaces SET").
+		WithArgs("ws-hascard", 0.0, "", 0, 0, "").
+		WillReturnResult(sqlmock.NewResult(0, 1))
+
+	// #2421: backfill must be a no-op when agent_card already exists (0 rows affected)
+	mock.ExpectExec("UPDATE workspaces SET agent_card =").
+		WithArgs("ws-hascard", sqlmock.AnyArg()).
+		WillReturnResult(sqlmock.NewResult(0, 0))
+
+	mock.ExpectQuery("SELECT status FROM workspaces WHERE id =").
+		WithArgs("ws-hascard").
+		WillReturnRows(sqlmock.NewRows([]string{"status"}).AddRow(models.StatusOnline))
+
+	w := httptest.NewRecorder()
+	c, _ := gin.CreateTestContext(w)
+
+	body := `{"workspace_id":"ws-hascard","agent_card":{"name":"ignored"}}`
+	c.Request = httptest.NewRequest("POST", "/registry/heartbeat", bytes.NewBufferString(body))
+	c.Request.Header.Set("Content-Type", "application/json")
+
+	handler.Heartbeat(c)
+
+	if w.Code != http.StatusOK {
+		t.Errorf("expected status 200, got %d: %s", w.Code, w.Body.String())
+	}
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Errorf("unmet sqlmock expectations: %v", err)
+	}
+}
+
 // ------------------------------------------------------------
 // validateAgentURL (C6 SSRF fix)
 // ------------------------------------------------------------
@@ -31,14 +31,53 @@ type workspaceDisplayResponse struct {
 	Status    string `json:"status,omitempty"`
 }

-var workspaceComputeInstanceAllowlist = map[string]struct{}{
-	"t3.medium":  {},
-	"t3.large":   {},
-	"t3.xlarge":  {},
-	"t3.2xlarge": {},
-	"m6i.large":  {},
-	"m6i.xlarge": {},
-	"c6i.xlarge": {},
+// workspaceComputeInstanceAllowlist is keyed by cloud provider (multi-provider /
+// in-place switch): each provider's box accepts only that provider's machine
+// sizes (an AWS t3.* is meaningless on Hetzner, and vice-versa). Mirrors the CP
+// provider SSOT — keep in lock-step with the controlplane provider configs
+// (Hetzner ServerType cpx*/cax*, GCP MachineType e2-*, AWS EC2 t3*/m6i*/c6i*).
+// TestValidateWorkspaceCompute_Provider / _InstanceTypePerProvider pin the sets.
+// "" provider = AWS default.
+var workspaceComputeInstanceAllowlist = map[string]map[string]struct{}{
+	"aws": {
+		"t3.medium": {}, "t3.large": {}, "t3.xlarge": {}, "t3.2xlarge": {},
+		"m6i.large": {}, "m6i.xlarge": {}, "c6i.xlarge": {},
+	},
+	"hetzner": {
+		"cpx11": {}, "cpx21": {}, "cpx31": {}, "cpx41": {}, "cpx51": {},
+		"cax11": {}, "cax21": {}, "cax31": {}, "cax41": {},
+	},
+	"gcp": {
+		"e2-small": {}, "e2-medium": {},
+		"e2-standard-2": {}, "e2-standard-4": {}, "e2-standard-8": {},
+	},
+}
+
+// normalizeCloudProvider maps "" → "aws" so the in-place switch comparison
+// treats the default and an explicit "aws" as the same cloud (no spurious switch).
+func normalizeCloudProvider(p string) string {
+	if p == "" {
+		return "aws"
+	}
+	return p
+}
+
+// instanceTypeAllowedForProvider reports whether instanceType is valid for the
+// given provider ("" → aws). Empty instanceType is always allowed (CP defaults).
+func instanceTypeAllowedForProvider(provider, instanceType string) bool {
+	if instanceType == "" {
+		return true
+	}
+	p := provider
+	if p == "" {
+		p = "aws"
+	}
+	set, ok := workspaceComputeInstanceAllowlist[p]
+	if !ok {
+		return false
+	}
+	_, ok = set[instanceType]
+	return ok
 }

 // workspaceComputeProviderAllowlist mirrors the controlplane cloud-provider SSOT
@@ -56,11 +95,24 @@ var workspaceComputeProviderAllowlist = map[string]struct{}{
 }

 func validateWorkspaceCompute(compute models.WorkspaceCompute) error {
-	if compute.InstanceType != "" {
-		if _, ok := workspaceComputeInstanceAllowlist[compute.InstanceType]; !ok {
-			return fmt.Errorf("unsupported compute.instance_type")
+	// Provider first (so the instance-type check below can be provider-scoped).
+	// "" = default (AWS). CP fail-closes an unwired provider with a 422; validating
+	// here gives a clean 400 before the round-trip and is the gate reused by the
+	// switch-provider flow. Mirrors the controlplane cloudprovider SSOT.
+	if compute.Provider != "" {
+		if _, ok := workspaceComputeProviderAllowlist[compute.Provider]; !ok {
+			return fmt.Errorf("unsupported compute.provider (want aws|gcp|hetzner)")
 		}
 	}
+	// Instance type must belong to the chosen provider (an AWS t3.* is invalid on
+	// Hetzner, etc.). Empty = CP default for the provider.
+	if !instanceTypeAllowedForProvider(compute.Provider, compute.InstanceType) {
+		prov := compute.Provider
+		if prov == "" {
+			prov = "aws"
+		}
+		return fmt.Errorf("unsupported compute.instance_type %q for provider %q", compute.InstanceType, prov)
+	}
 	if compute.Volume.RootGB != 0 {
 		if compute.Volume.RootGB < workspaceComputeDiskFloorGB || compute.Volume.RootGB > workspaceComputeDiskCeilingGB {
 			return fmt.Errorf("compute.volume.root_gb must be between %d and %d", workspaceComputeDiskFloorGB, workspaceComputeDiskCeilingGB)
@@ -87,15 +139,6 @@ func validateWorkspaceCompute(compute models.WorkspaceCompute) error {
 	default:
 		return fmt.Errorf("unsupported compute.data_persistence (want persist|ephemeral)")
 	}
-	// Cloud backend for the box (multi-provider). "" = default (AWS). CP fail-
-	// closes an unwired provider with a 422 (PROVIDER_UNAVAILABLE); validating
-	// here gives a clean 400 before the round-trip and is the gate reused by the
-	// switch-provider flow. Mirrors the controlplane cloudprovider SSOT.
-	if compute.Provider != "" {
-		if _, ok := workspaceComputeProviderAllowlist[compute.Provider]; !ok {
-			return fmt.Errorf("unsupported compute.provider (want aws|gcp|hetzner)")
-		}
-	}
 	return nil
 }

@@ -64,6 +64,40 @@ func TestValidateWorkspaceCompute_Provider(t *testing.T) {
 	}
 }

+// Multi-provider / in-place switch: an instance type must belong to the chosen
+// provider — an AWS t3.* is meaningless on Hetzner, a cpx* on AWS, etc. Pins the
+// provider-keyed allowlist (mirrors the CP provider configs).
+func TestValidateWorkspaceCompute_InstanceTypePerProvider(t *testing.T) {
+	good := []struct{ provider, instance string }{
+		{"", "t3.medium"}, {"aws", "t3.2xlarge"}, {"aws", "c6i.xlarge"},
+		{"hetzner", "cpx31"}, {"hetzner", "cax41"},
+		{"gcp", "e2-standard-2"}, {"gcp", "e2-small"},
+		{"hetzner", ""}, {"gcp", ""}, // empty instance = CP default, always ok
+	}
+	for _, g := range good {
+		c := models.WorkspaceCompute{Provider: g.provider, InstanceType: g.instance}
+		if err := validateWorkspaceCompute(c); err != nil {
+			t.Errorf("provider=%q instance=%q must be accepted: %v", g.provider, g.instance, err)
+		}
+	}
+	bad := []struct{ provider, instance string }{
+		{"hetzner", "t3.medium"}, // AWS type on Hetzner
+		{"aws", "cpx31"},         // Hetzner type on AWS
+		{"gcp", "t3.large"},      // AWS type on GCP
+		{"hetzner", "e2-small"},  // GCP type on Hetzner
+		{"", "cpx31"},            // default(aws) + Hetzner type
+	}
+	for _, b := range bad {
+		c := models.WorkspaceCompute{Provider: b.provider, InstanceType: b.instance}
+		if err := validateWorkspaceCompute(c); err == nil {
+			t.Errorf("provider=%q instance=%q must be rejected (cross-provider instance type)", b.provider, b.instance)
+		}
+	}
+	if normalizeCloudProvider("") != "aws" || normalizeCloudProvider("hetzner") != "hetzner" {
+		t.Fatal("normalizeCloudProvider: \"\" must map to aws; explicit providers unchanged")
+	}
+}
+
 // internal#734: data_persistence enum. "" (auto), "persist", "ephemeral" are
 // the only accepted values; anything else is a clear 400 before the CP call.
 func TestValidateWorkspaceCompute_DataPersistence(t *testing.T) {
@@ -164,6 +164,7 @@ func (h *WorkspaceHandler) Update(c *gin.Context) {
 		}
 	}
 	var computeJSON string
+	var newComputeProvider string // hoisted: drives the cloud-provider switch detection below
 	computePatch := false
 	if rawCompute, ok := body["compute"]; ok {
 		computePatch = true
@@ -184,6 +185,7 @@ func (h *WorkspaceHandler) Update(c *gin.Context) {
 				c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
 				return
 			}
+			newComputeProvider = compute.Provider
 			encoded, err := workspaceComputeJSON(compute)
 			if err != nil {
 				log.Printf("Update compute encode error for %s: %v", id, err)
@@ -262,6 +264,55 @@ func (h *WorkspaceHandler) Update(c *gin.Context) {
 		needsRestart = true
 	}
 	if computePatch {
+		// Cloud-provider SWITCH (in-place): if the incoming provider differs from
+		// the one currently stored, the existing box lives on the OLD cloud. We
+		// MUST deprovision it on the OLD provider BEFORE overwriting compute —
+		// otherwise the subsequent "Save & Restart" restart's provider-aware
+		// deprovision (cpProv.Stop → resolveProvider reads compute->>'provider')
+		// would target the NEW cloud and ORPHAN the old box (a silently-billing
+		// leak). Cloud mode only (the local Docker provisioner has no cross-cloud
+		// concept; provider stays "" there so this never fires). After this, the
+		// canvas's restart provisions the box on the new cloud; its own Stop is a
+		// safe no-op (the box is already gone).
+		if h.cpProv != nil {
+			var oldProvider sql.NullString
+			err := db.DB.QueryRowContext(ctx, `SELECT compute->>'provider' FROM workspaces WHERE id = $1`, id).Scan(&oldProvider)
+			// FAIL-CLOSED on the read. The earlier `err == nil` gate was fail-OPEN:
+			// a transient/unexpected DB error here skipped the whole switch block and
+			// fell through to the compute UPDATE — so during a real switch the later
+			// provider-aware restart deprovision would target the NEW cloud and ORPHAN
+			// the old box (silent billing, unrecoverable). We cannot tell whether this
+			// is a cross-cloud switch without the old provider, so on any error other
+			// than "no such row" we abort exactly like a failed deprovision: compute
+			// untouched, old box still recoverable, user retries. (sql.ErrNoRows means
+			// there is genuinely no prior box — nothing to orphan — so it's safe to
+			// skip the switch and let the UPDATE proceed.)
+			if err != nil && !errors.Is(err, sql.ErrNoRows) {
+				log.Printf("Update: provider-switch precheck for %s ABORTED — could not read current cloud provider (provider left unchanged): %v", id, err)
+				c.JSON(http.StatusBadGateway, gin.H{"error": "could not read the current cloud provider; provider unchanged — please retry"})
+				return
+			}
+			if err == nil && normalizeCloudProvider(oldProvider.String) != normalizeCloudProvider(newComputeProvider) {
+				log.Printf("Update: cloud-provider switch for %s: %q -> %q; deprovisioning old box on old provider before overwriting compute",
+					id, normalizeCloudProvider(oldProvider.String), normalizeCloudProvider(newComputeProvider))
+				// Use the ERROR-returning variant and ABORT before overwriting
+				// compute if the old-box deprovision fails. If we proceeded, the
+				// old box would keep running on the OLD cloud while the row now
+				// records the NEW provider+instance — stranding it with no DB
+				// pointer (an UNRECOVERABLE cross-cloud orphan that no reconciler
+				// can map back). Aborting leaves the row pointing at the
+				// still-recoverable old box; the user can retry the switch. (The
+				// restart paths' void cpStopWithRetry is fine there because the
+				// box stays on the SAME cloud, so the provider record is unchanged
+				// and a provider-scoped sweep can still find it.)
+				if err := h.cpStopWithRetryErr(ctx, id, "provider-switch", false); err != nil {
+					log.Printf("Update: provider-switch for %s ABORTED — could not deprovision old box on %q (provider left unchanged, old box recoverable): %v",
+						id, normalizeCloudProvider(oldProvider.String), err)
+					c.JSON(http.StatusBadGateway, gin.H{"error": "could not deprovision the current cloud box; provider unchanged — please retry"})
+					return
+				}
+			}
+		}
 		if _, err := db.DB.ExecContext(ctx, `UPDATE workspaces SET compute = $2::jsonb, updated_at = now() WHERE id = $1`, id, computeJSON); err != nil {
 			log.Printf("Update compute error for %s: %v", id, err)
 			c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to save compute config"})
@@ -0,0 +1,180 @@
+package handlers
+
+// workspace_provider_switch_test.go — deterministic coverage for the in-place
+// cloud-provider switch in the Update (PATCH /workspaces/:id) handler.
+//
+// The switch is DESTRUCTIVE (it recreates the box on a new cloud) and its
+// safety hinges on ORDER + ABORT, which these tests pin without touching a real
+// cloud (sqlmock DB + the scriptedCPStop fake from workspace_restart_stop_retry_test):
+//
+//   1. On a provider change, the OLD box is deprovisioned (cpProv.Stop) BEFORE
+//      the compute row is overwritten — otherwise the later restart's
+//      provider-aware deprovision would target the NEW cloud and ORPHAN the old
+//      (still-billing) box. The sqlmock query ORDER pins "read old provider →
+//      [Stop] → UPDATE compute".
+//   2. If the old-box deprovision FAILS, the handler ABORTS (502) and does NOT
+//      overwrite compute — leaving the row pointed at the recoverable old box
+//      (an unexpected UPDATE would fail sqlmock's expectations).
+//   3. A non-switch compute edit (same provider) does NOT deprovision anything.
+//   4. If the old-provider READ errors (transient DB fault, not sql.ErrNoRows),
+//      the handler FAILS CLOSED: aborts (502), deprovisions nothing, and does NOT
+//      overwrite compute — closing the fail-open read path that would otherwise
+//      orphan the old box on a real switch (security review RC 9895).
+
+import (
+	"bytes"
+	"fmt"
+	"net/http"
+	"net/http/httptest"
+	"testing"
+
+	"github.com/DATA-DOG/go-sqlmock"
+	"github.com/gin-gonic/gin"
+)
+
+func newPatchContext(t *testing.T, id, body string) (*gin.Context, *httptest.ResponseRecorder) {
+	t.Helper()
+	w := httptest.NewRecorder()
+	c, _ := gin.CreateTestContext(w)
+	c.Params = gin.Params{{Key: "id", Value: id}}
+	req := httptest.NewRequest("PATCH", "/workspaces/"+id, bytes.NewBufferString(body))
+	req.Header.Set("Content-Type", "application/json")
+	c.Request = req
+	return c, w
+}
+
+const switchTestWSID = "cccccccc-0001-0000-0000-000000000000"
+
+func newSwitchTestHandler(t *testing.T, cp *scriptedCPStop) *WorkspaceHandler {
+	t.Helper()
+	h := NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir())
+	h.cpProv = cp
+	return h
+}
+
+// 1. aws → hetzner: deprovision the OLD box, THEN overwrite compute (200).
+func TestWorkspaceUpdate_ProviderSwitch_DeprovisionsOldBeforeUpdate(t *testing.T) {
+	mock := setupTestDB(t)
+	cp := &scriptedCPStop{} // Stop succeeds
+	h := newSwitchTestHandler(t, cp)
+
+	// Ordered expectations pin: EXISTS → read OLD provider (aws) → UPDATE compute.
+	// The cpProv.Stop deprovision runs (in code) AFTER the provider read and
+	// BEFORE the UPDATE — exactly the orphan-safe order.
+	mock.ExpectQuery("SELECT EXISTS").WithArgs(switchTestWSID).
+		WillReturnRows(sqlmock.NewRows([]string{"exists"}).AddRow(true))
+	mock.ExpectQuery("compute->>'provider'").WithArgs(switchTestWSID).
+		WillReturnRows(sqlmock.NewRows([]string{"provider"}).AddRow("aws"))
+	mock.ExpectExec("UPDATE workspaces SET compute").
+		WillReturnResult(sqlmock.NewResult(0, 1))
+
+	c, w := newPatchContext(t, switchTestWSID,
+		`{"compute":{"instance_type":"cpx31","provider":"hetzner","volume":{"root_gb":30}}}`)
+	h.Update(c)
+
+	if w.Code != http.StatusOK {
+		t.Fatalf("expected 200 on a successful switch, got %d: %s", w.Code, w.Body.String())
+	}
+	if cp.calls != 1 {
+		t.Fatalf("expected the OLD box to be deprovisioned exactly once on a provider switch; got %d Stop calls", cp.calls)
+	}
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Fatalf("unmet/unexpected DB queries (ordering broken?): %v", err)
+	}
+}
+
+// 2. Deprovision FAILS → abort (502) + compute NOT overwritten (no UPDATE).
+func TestWorkspaceUpdate_ProviderSwitch_AbortsWhenDeprovisionFails(t *testing.T) {
+	shrinkRetryBackoff(t) // don't burn the 1s/2s/4s retry backoff
+	mock := setupTestDB(t)
+	// All retry attempts fail → cpStopWithRetryErr returns an error → abort.
+	cp := &scriptedCPStop{errs: []error{
+		fmt.Errorf("cp 503"), fmt.Errorf("cp 503"), fmt.Errorf("cp 503"),
+	}}
+	h := newSwitchTestHandler(t, cp)
+
+	mock.ExpectQuery("SELECT EXISTS").WithArgs(switchTestWSID).
+		WillReturnRows(sqlmock.NewRows([]string{"exists"}).AddRow(true))
+	mock.ExpectQuery("compute->>'provider'").WithArgs(switchTestWSID).
+		WillReturnRows(sqlmock.NewRows([]string{"provider"}).AddRow("aws"))
+	// NO UPDATE expectation: if the handler overwrote compute after a failed
+	// deprovision (the orphan bug), sqlmock would flag the unexpected query.
+
+	c, w := newPatchContext(t, switchTestWSID,
+		`{"compute":{"instance_type":"cpx31","provider":"hetzner","volume":{"root_gb":30}}}`)
+	h.Update(c)
+
+	if w.Code != http.StatusBadGateway {
+		t.Fatalf("expected 502 when the old-box deprovision fails, got %d: %s", w.Code, w.Body.String())
+	}
+	if cp.calls == 0 {
+		t.Fatal("expected at least one Stop attempt before aborting")
+	}
+	if err := mock.ExpectationsWereMet(); err != nil {
+		// A failure here means an UNEXPECTED UPDATE ran — i.e. compute was
+		// overwritten after a failed deprovision → the orphan bug is back.
+		t.Fatalf("compute must NOT be overwritten when deprovision fails (orphan-prevention): %v", err)
+	}
+}
+
+// 3. Same provider (no switch): no deprovision; compute is updated normally.
+func TestWorkspaceUpdate_NoProviderSwitch_DoesNotDeprovision(t *testing.T) {
+	mock := setupTestDB(t)
+	cp := &scriptedCPStop{}
+	h := newSwitchTestHandler(t, cp)
+
+	mock.ExpectQuery("SELECT EXISTS").WithArgs(switchTestWSID).
+		WillReturnRows(sqlmock.NewRows([]string{"exists"}).AddRow(true))
+	mock.ExpectQuery("compute->>'provider'").WithArgs(switchTestWSID).
+		WillReturnRows(sqlmock.NewRows([]string{"provider"}).AddRow("aws"))
+	mock.ExpectExec("UPDATE workspaces SET compute").
+		WillReturnResult(sqlmock.NewResult(0, 1))
+
+	// provider stays aws (only the instance size changes) → no switch, no Stop.
+	c, w := newPatchContext(t, switchTestWSID,
+		`{"compute":{"instance_type":"t3.large","provider":"aws","volume":{"root_gb":60}}}`)
+	h.Update(c)
+
+	if w.Code != http.StatusOK {
+		t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String())
+	}
+	if cp.calls != 0 {
+		t.Fatalf("a non-switching compute edit must NOT deprovision the box; got %d Stop calls", cp.calls)
+	}
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Fatalf("unmet/unexpected DB queries: %v", err)
+	}
+}
+
+// 4. Provider READ errors (transient DB fault) → fail-CLOSED: abort 502, no
+//    deprovision, no compute overwrite. A fail-open read (the old `err == nil`
+//    gate) would skip switch detection and overwrite compute → orphan the old
+//    cloud box. sqlmock has NO UPDATE/Stop expectations, so either an overwrite
+//    or a stray deprovision trips it.
+func TestWorkspaceUpdate_ProviderSwitch_AbortsOnProviderReadError(t *testing.T) {
+	mock := setupTestDB(t)
+	cp := &scriptedCPStop{}
+	h := newSwitchTestHandler(t, cp)
+
+	mock.ExpectQuery("SELECT EXISTS").WithArgs(switchTestWSID).
+		WillReturnRows(sqlmock.NewRows([]string{"exists"}).AddRow(true))
+	// The old-provider read hits a transient error (NOT sql.ErrNoRows).
+	mock.ExpectQuery("compute->>'provider'").WithArgs(switchTestWSID).
+		WillReturnError(fmt.Errorf("connection reset by peer"))
+
+	c, w := newPatchContext(t, switchTestWSID,
+		`{"compute":{"instance_type":"cpx31","provider":"hetzner","volume":{"root_gb":30}}}`)
+	h.Update(c)
+
+	if w.Code != http.StatusBadGateway {
+		t.Fatalf("expected 502 when the provider read fails (fail-closed), got %d: %s", w.Code, w.Body.String())
+	}
+	if cp.calls != 0 {
+		t.Fatalf("must NOT deprovision when the current provider can't be read; got %d Stop calls", cp.calls)
+	}
+	if err := mock.ExpectationsWereMet(); err != nil {
+		// An unexpected UPDATE here = compute was overwritten despite an unreadable
+		// provider → the fail-open orphan path is back.
+		t.Fatalf("compute must NOT be overwritten on a provider read error (fail-closed): %v", err)
+	}
+}
@@ -143,6 +143,12 @@ type HeartbeatPayload struct {
 	// false declared explicitly". Lets the platform distinguish "adapter
 	// said no native ownership" from "old runtime version, didn't say".
 	RuntimeMetadata *RuntimeMetadata `json:"runtime_metadata,omitempty"`
+
+	// AgentCard is sent by the runtime on heartbeat when the initial
+	// /registry/register failed and the workspace has no persisted agent_card.
+	// The heartbeat handler backfills NULL agent_card rows so the workspace
+	// can come online without requiring a full re-register. (#2421)
+	AgentCard json.RawMessage `json:"agent_card,omitempty"`
 }

 // RuntimeMetadata is the adapter-declared capability + override block
@@ -34,6 +34,11 @@ const (
 	// fireSchedule goroutine indefinitely, which blocked wg.Wait() in
 	// tick(), which stalled the entire scheduler until operator restart.
 	dbQueryTimeout = 10 * time.Second
+	// priorityTask mirrors handlers.PriorityTask (50) — the default FIFO A2A
+	// queue priority. Duplicated as a local const because the scheduler cannot
+	// import internal/handlers (handlers imports scheduler → cycle). Buffered
+	// cron ticks enqueue at the same priority as normal busy-retry A2A work.
+	priorityTask = 50
 )

 // sanitizeUTF8 replaces invalid UTF-8 byte sequences with the Unicode
@@ -48,9 +53,14 @@ func sanitizeUTF8(s string) string {
 }

 // A2AProxy is the interface the scheduler needs to send messages to workspaces.
-// WorkspaceHandler.ProxyA2ARequest satisfies this.
+// WorkspaceHandler.ProxyA2ARequest + WorkspaceHandler.EnqueueA2A satisfy this.
 type A2AProxy interface {
 	ProxyA2ARequest(ctx context.Context, workspaceID string, body []byte, callerID string, logActivity bool) (int, []byte, error)
+	// EnqueueA2A durably buffers an A2A message for a busy workspace; the
+	// drain dispatches it serially when the agent frees. idempotencyKey
+	// collapses duplicate pending buffers per (workspace,key). Returns the
+	// buffered entry id, the resulting pending depth, and any error.
+	EnqueueA2A(ctx context.Context, workspaceID, callerID string, priority int, body []byte, method, idempotencyKey string, expiresAt *time.Time) (string, int, error)
 }

 // Broadcaster records events and pushes them to WebSocket clients.
@@ -367,33 +377,6 @@ func (s *Scheduler) fireSchedule(ctx context.Context, sched scheduleRow) {
 		sched.WorkspaceID,
 	).Scan(&activeTasks, &maxConcurrent)
 	capCancel()
-	if capErr == nil && activeTasks >= maxConcurrent {
-		log.Printf("Scheduler: '%s' workspace %s at capacity (active_tasks=%d, max=%d), deferring up to 2 min",
-			sched.Name, short(sched.WorkspaceID, 12), activeTasks, maxConcurrent)
-		// Poll every 10s for up to 2 minutes
-		waited := false
-		for i := 0; i < 12; i++ {
-			time.Sleep(10 * time.Second)
-			pollCtx, pollCancel := context.WithTimeout(ctx, dbQueryTimeout)
-			err := db.DB.QueryRowContext(pollCtx,
-				`SELECT COALESCE(active_tasks, 0), COALESCE(max_concurrent_tasks, 1) FROM workspaces WHERE id = $1`,
-				sched.WorkspaceID,
-			).Scan(&activeTasks, &maxConcurrent)
-			pollCancel()
-			if err != nil || activeTasks < maxConcurrent {
-				waited = true
-				break
-			}
-		}
-		if !waited && activeTasks >= maxConcurrent {
-			log.Printf("Scheduler: skipping '%s' on busy workspace %s after 2 min wait (active_tasks=%d, max=%d)",
-				sched.Name, short(sched.WorkspaceID, 12), activeTasks, maxConcurrent)
-			s.recordSkipped(ctx, sched, activeTasks)
-			return
-		}
-		log.Printf("Scheduler: '%s' workspace %s has capacity after deferral, firing",
-			sched.Name, short(sched.WorkspaceID, 12))
-	}

 	fireCtx, cancel := context.WithTimeout(ctx, fireTimeout)
 	defer cancel()
@@ -402,6 +385,9 @@ func (s *Scheduler) fireSchedule(ctx context.Context, sched scheduleRow) {
 	// The agent sees recent peer messages before acting, enabling cross-agent
 	// awareness without explicit A2A delegation. Best-effort — if the fetch
 	// fails or the workspace has no Slack channels, the prompt is unchanged.
+	//
+	// Built BEFORE the capacity check so the busy-enqueue path below buffers
+	// the exact same A2A message the fire path would have dispatched.
 	prompt := sched.Prompt
 	if s.channels != nil {
 		if channelCtx := s.channels.FetchWorkspaceChannelContext(fireCtx, sched.WorkspaceID); channelCtx != "" {
@@ -426,6 +412,49 @@ func (s *Scheduler) fireSchedule(ctx context.Context, sched scheduleRow) {
 		return
 	}

+	// #969 → durable buffering. When the target workspace is busy
+	// (active_tasks >= max_concurrent_tasks) we do NOT skip the tick and we do
+	// NOT block the scheduler goroutine waiting for capacity. Instead we durably
+	// buffer the cron message, mirroring how busy A2A dispatches already buffer.
+	// The drain then dispatches it serially the moment the agent frees —
+	// execution stays one-at-a-time; max_concurrent_tasks is unchanged.
+	//
+	// This supersedes the previous "poll then recordSkipped" behavior, which
+	// dropped scheduled ticks on workspaces that stayed busy across the whole
+	// poll window.
+	//
+	// Idempotency key = sched.ID (the SCHEDULE id), NOT msgID/a random uuid.
+	// Keying by schedule_id means a busy agent buffers AT MOST ONE pending tick
+	// per schedule — the latest one wins, the obsolete newer tick is collapsed —
+	// so we hold the next tick instead of stacking a stale backlog.
+	if capErr == nil && activeTasks >= maxConcurrent {
+		// Buffered ticks expire at the next scheduled fire: a tick that's been
+		// sitting in the queue past when the cron would naturally tick again is
+		// stale, so let it expire rather than fire late. Best-effort — on a bad
+		// cron expr we enqueue with no TTL (NULL) rather than block the tick.
+		var expiresAt *time.Time
+		if nextRun, nrErr := ComputeNextRun(sched.CronExpr, sched.Timezone, time.Now()); nrErr == nil {
+			expiresAt = &nextRun
+		}
+		enqCtx, enqCancel := context.WithTimeout(ctx, dbQueryTimeout)
+		// Empty callerID = canvas-style (source_id NULL), matching the fire path.
+		qID, depth, enqErr := s.proxy.EnqueueA2A(enqCtx, sched.WorkspaceID, "", priorityTask, a2aBody, "message/send", sched.ID, expiresAt)
+		enqCancel()
+		if enqErr != nil {
+			// Enqueue failed — fall back to recording a skip so the liveness
+			// view still advances and the operator sees the error, rather than
+			// silently dropping the tick or firing into a busy agent.
+			log.Printf("Scheduler: '%s' enqueue on busy workspace %s failed, recording skip: %v",
+				sched.Name, short(sched.WorkspaceID, 12), enqErr)
+			s.recordSkipped(ctx, sched, activeTasks)
+			return
+		}
+		log.Printf("Scheduler: '%s' workspace %s busy (active_tasks=%d, max=%d) — enqueued tick %s (queue depth=%d), will drain when idle",
+			sched.Name, short(sched.WorkspaceID, 12), activeTasks, maxConcurrent, short(qID, 8), depth)
+		s.recordQueued(ctx, sched, activeTasks, qID, depth)
+		return
+	}
+
 	log.Printf("Scheduler: firing '%s' → workspace %s", sched.Name, short(sched.WorkspaceID, 12))

 	// Empty callerID = canvas-style request (bypasses access control, source_id=NULL in activity log).
@@ -727,6 +756,74 @@ func (s *Scheduler) recordSkipped(ctx context.Context, sched scheduleRow, active
 	}
 }

+// recordQueued advances next_run_at and logs a cron_run activity entry with
+// status='queued' when the target workspace was busy and the tick was durably
+// buffered instead of fired. Mirrors recordSkipped (#115) but records a buffer,
+// not a drop: the drain will dispatch qID serially when the agent frees.
+// next_run_at still advances so the liveness view keeps ticking and the NEXT
+// cron slot enqueues (the schedule_id idempotency key then holds at most one
+// pending tick — the latest — per schedule).
+func (s *Scheduler) recordQueued(ctx context.Context, sched scheduleRow, activeTasks int, queueID string, depth int) {
+	reason := fmt.Sprintf("queued: workspace busy (active_tasks=%d), buffered (id=%s, depth=%d)", activeTasks, short(queueID, 8), depth)
+
+	nextRun, nextErr := ComputeNextRun(sched.CronExpr, sched.Timezone, time.Now())
+	var nextRunPtr *time.Time
+	if nextErr == nil {
+		nextRunPtr = &nextRun
+	} else {
+		// Same guard as recordSkipped/fireSchedule — preserve existing
+		// next_run_at rather than writing NULL on an unparseable cron expr.
+		log.Printf("Scheduler: ComputeNextRun error in recordQueued for '%s' (%s) — preserving existing next_run_at: %v",
+			sched.Name, sched.ID, nextErr)
+	}
+
+	queuedUpdCtx, queuedUpdCancel := context.WithTimeout(context.Background(), dbQueryTimeout)
+	if _, err := db.DB.ExecContext(queuedUpdCtx, `
+		UPDATE workspace_schedules
+		SET last_run_at = now(),
+		    next_run_at = COALESCE($2, next_run_at),
+		    run_count = run_count + 1,
+		    last_status = 'queued',
+		    last_error = $3,
+		    updated_at = now()
+		WHERE id = $1
+	`, sched.ID, nextRunPtr, sanitizeUTF8(reason)); err != nil {
+		log.Printf("Scheduler: '%s' queued update failed: %v", sched.Name, err)
+	}
+	queuedUpdCancel()
+
+	cronMeta, marshalErr := json.Marshal(map[string]interface{}{
+		"schedule_id":   sched.ID,
+		"schedule_name": sched.Name,
+		"cron_expr":     sched.CronExpr,
+		"queued":        true,
+		"active_tasks":  activeTasks,
+		"queue_id":      queueID,
+		"queue_depth":   depth,
+	})
+	if marshalErr != nil {
+		log.Printf("Scheduler '%s': json.Marshal cronMeta(queued) failed: %v", sched.Name, marshalErr)
+	} else {
+		queuedInsCtx, queuedInsCancel := context.WithTimeout(context.Background(), dbQueryTimeout)
+		if _, err := db.DB.ExecContext(queuedInsCtx, `
+			INSERT INTO activity_logs (workspace_id, activity_type, source_id, method, summary, request_body, status, error_detail, created_at)
+			VALUES ($1, 'cron_run', NULL, 'cron', $2, $3::jsonb, 'queued', $4, now())
+		`, sched.WorkspaceID, sanitizeUTF8("Cron queued (busy): "+sched.Name), string(cronMeta), sanitizeUTF8(reason)); err != nil {
+			log.Printf("Scheduler: '%s' queued activity log failed: %v", sched.Name, err)
+		}
+		queuedInsCancel()
+	}
+
+	if s.broadcaster != nil {
+		_ = s.broadcaster.RecordAndBroadcast(ctx, string(events.EventCronSkipped), sched.WorkspaceID, map[string]interface{}{
+			"schedule_id":   sched.ID,
+			"schedule_name": sched.Name,
+			"reason":        reason,
+			"queued":        true,
+		})
+	}
+}
+
 // repairNullNextRunAt is called once during Start() to recompute next_run_at
 // for any enabled schedule where it is NULL — a state left by the pre-#722 bug
 // where a ComputeNextRun error caused an UPDATE that wrote NULL.
@@ -73,6 +73,14 @@ type recordingProxy struct {
 	lastCaller  string
 	lastLogFlag bool
 	lastWSID    string
+
+	// enqueue tracking — the busy path calls EnqueueA2A instead of firing.
+	enqueues    int
+	lastEnqBody []byte
+	lastEnqKey  string
+	enqQueueID  string
+	enqDepth    int
+	enqErr      error
 }

 func (p *recordingProxy) ProxyA2ARequest(
@@ -89,6 +97,25 @@ func (p *recordingProxy) ProxyA2ARequest(
 	return p.status, p.body, nil
 }

+// EnqueueA2A records the busy-path enqueue so tests can assert that a tick on a
+// busy workspace was buffered (not fired, not skipped).
+func (p *recordingProxy) EnqueueA2A(
+	_ context.Context, workspaceID, callerID string, _ int, body []byte, _ string, idempotencyKey string, _ *time.Time,
+) (string, int, error) {
+	p.enqueues++
+	p.lastWSID = workspaceID
+	p.lastCaller = callerID
+	p.lastEnqBody = body
+	p.lastEnqKey = idempotencyKey
+	if p.enqErr != nil {
+		return "", 0, p.enqErr
+	}
+	if p.enqQueueID == "" {
+		p.enqQueueID = "q-rec-1"
+	}
+	return p.enqQueueID, p.enqDepth, nil
+}
+
 // ── connection + fixture helpers ──────────────────────────────────────────

 // integrationDB returns the configured integration-test connection or skips
@@ -42,6 +42,13 @@ func (p *panicProxy) ProxyA2ARequest(
 	panic("simulated A2A proxy panic")
 }

+// EnqueueA2A satisfies the extended A2AProxy interface; panics like the fire path.
+func (p *panicProxy) EnqueueA2A(
+	_ context.Context, _ string, _ string, _ int, _ []byte, _ string, _ string, _ *time.Time,
+) (string, int, error) {
+	panic("simulated A2A enqueue panic")
+}
+
 // ── TestLastTickAt_zero ───────────────────────────────────────────────────────

 // TestLastTickAt_zero confirms that LastTickAt returns a zero time.Time on a
@@ -210,6 +217,90 @@ func TestShort_helper(t *testing.T) {
 }

 // ── TestRecordSkipped_writesSkippedStatus ────────────────────────────────────
+// ── busyEnqueueProxy + TestFireSchedule_BusyEnqueuesInsteadOfSkipping ──────────
+//
+// Replaces the old "busy → skip after 2 min" assertion. When the workspace is
+// at capacity, fireSchedule must ENQUEUE the tick into the durable a2a_queue
+// (keyed by schedule_id) and record last_status='queued' — NOT fire and NOT
+// recordSkipped. Proves the scheduled-tick-starvation fix.
+
+type busyEnqueueProxy struct {
+	fired       int
+	enqueued    int
+	enqKey      string
+	enqMethod   string
+	enqPriority int
+}
+
+func (p *busyEnqueueProxy) ProxyA2ARequest(
+	_ context.Context, _ string, _ []byte, _ string, _ bool,
+) (int, []byte, error) {
+	p.fired++
+	return 200, []byte(`{"ok":true}`), nil
+}
+
+func (p *busyEnqueueProxy) EnqueueA2A(
+	_ context.Context, _ string, _ string, priority int, _ []byte, method, idempotencyKey string, _ *time.Time,
+) (string, int, error) {
+	p.enqueued++
+	p.enqKey = idempotencyKey
+	p.enqMethod = method
+	p.enqPriority = priority
+	return "q-busy-1", 1, nil
+}
+
+func TestFireSchedule_BusyEnqueuesInsteadOfSkipping(t *testing.T) {
+	mock := setupTestDB(t)
+
+	sched := scheduleRow{
+		ID:          "77777777-dead-beef-0000-000000000007",
+		WorkspaceID: "88888888-dead-beef-0000-000000000008",
+		Name:        "busy-enqueue-job",
+		CronExpr:    "*/5 * * * *",
+		Timezone:    "UTC",
+		Prompt:      "tick while busy",
+	}
+
+	// Capacity check → active_tasks(2) >= max_concurrent(1): workspace is busy.
+	mock.ExpectQuery(`SELECT COALESCE`).
+		WillReturnRows(sqlmock.NewRows([]string{"active_tasks", "max"}).AddRow(2, 1))
+
+	// recordQueued UPDATE — binds ($1=sched.ID, $2=nextRunPtr, $3=reason);
+	// last_status='queued' is a SQL literal, not a bound arg.
+	mock.ExpectExec(`UPDATE workspace_schedules`).
+		WithArgs(sched.ID, sqlmock.AnyArg(), sqlmock.AnyArg()).
+		WillReturnResult(sqlmock.NewResult(0, 1))
+
+	// recordQueued activity_logs INSERT — binds 4 args (workspace_id, summary,
+	// request_body, error_detail); status='queued' is a SQL literal.
+	mock.ExpectExec(`INSERT INTO activity_logs`).
+		WithArgs(sched.WorkspaceID, sqlmock.AnyArg(), sqlmock.AnyArg(), sqlmock.AnyArg()).
+		WillReturnResult(sqlmock.NewResult(0, 1))
+
+	proxy := &busyEnqueueProxy{}
+	s := New(proxy, nil)
+	s.fireSchedule(context.Background(), sched)
+
+	if proxy.fired != 0 {
+		t.Errorf("busy workspace: ProxyA2ARequest must NOT fire, got %d fires", proxy.fired)
+	}
+	if proxy.enqueued != 1 {
+		t.Fatalf("busy workspace: expected exactly 1 EnqueueA2A, got %d", proxy.enqueued)
+	}
+	if proxy.enqKey != sched.ID {
+		t.Errorf("idempotency key must be schedule_id %q (buffer-latest dedup), got %q", sched.ID, proxy.enqKey)
+	}
+	if proxy.enqMethod != "message/send" {
+		t.Errorf("enqueued method = %q, want \"message/send\"", proxy.enqMethod)
+	}
+	if proxy.enqPriority != priorityTask {
+		t.Errorf("enqueued priority = %d, want priorityTask(%d)", proxy.enqPriority, priorityTask)
+	}
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Errorf("unmet DB expectations — busy tick not recorded as queued: %v", err)
+	}
+}
+
 // #115 coverage gap: the recordSkipped path wasn't tested at all when it
 // first landed. Exercises the UPDATE workspace_schedules + INSERT into
 // activity_logs via sqlmock. Broadcaster is nil so we don't need to stub
@@ -257,6 +348,13 @@ func (p *successProxy) ProxyA2ARequest(
 	return 200, []byte(`{"ok":true}`), nil
 }

+// EnqueueA2A satisfies the extended A2AProxy interface.
+func (p *successProxy) EnqueueA2A(
+	_ context.Context, _ string, _ string, _ int, _ []byte, _ string, _ string, _ *time.Time,
+) (string, int, error) {
+	return "q-success", 1, nil
+}
+
 // ── adapterErrorProxy ─────────────────────────────────────────────────────────

 // adapterErrorProxy is a test double whose ProxyA2ARequest returns HTTP 200
@@ -270,6 +368,13 @@ func (p *adapterErrorProxy) ProxyA2ARequest(
 	return 200, []byte(`{"jsonrpc":"2.0","id":"cron-test-123","error":{"code":-32603,"message":"adapter SDK internal error"}}`), nil
 }

+// EnqueueA2A satisfies the extended A2AProxy interface.
+func (p *adapterErrorProxy) EnqueueA2A(
+	_ context.Context, _ string, _ string, _ int, _ []byte, _ string, _ string, _ *time.Time,
+) (string, int, error) {
+	return "q-adaptererr", 1, nil
+}
+
 // ── TestFireSchedule_AdapterSDKError (#1696) ──────────────────────────────────
 //
 // When the adapter SDK throws internally and returns HTTP 200 with an error
@@ -667,6 +772,7 @@ func TestRecordSkipped_AdvancesNextRunAt(t *testing.T) {
 			"recordSkipped must advance next_run_at when workspace is busy (#1029)", err)
 	}
 }
+
 // trigger CI

 // ── TestDetectResultKind ───────────────────────────────────────────────────────
@@ -833,10 +939,10 @@ func TestDetectResultKind(t *testing.T) {
 //
 // When ProxyA2ARequest returns HTTP 200 but the response body contains a
 // non-ok result_kind, fireSchedule must:
-//   1. Set last_status to the result_kind (not 'ok').
-//   2. Set last_error to describe the SDK error.
-//   3. Increment consecutive_sdk_errors.
-//   4. NOT auto-disable on first occurrence (threshold is 3).
+//  1. Set last_status to the result_kind (not 'ok').
+//  2. Set last_error to describe the SDK error.
+//  3. Increment consecutive_sdk_errors.
+//  4. NOT auto-disable on first occurrence (threshold is 3).
 //
 // This test uses an sdkErrorProxy that returns a rate-limited body and asserts
 // the first run is recorded as 'rate_limited' with consecutive_sdk_errors=1
@@ -999,6 +1105,13 @@ func (p *sdkErrorProxy) ProxyA2ARequest(
 	return 200, body, nil
 }

+// EnqueueA2A satisfies the extended A2AProxy interface.
+func (p *sdkErrorProxy) EnqueueA2A(
+	_ context.Context, _ string, _ string, _ int, _ []byte, _ string, _ string, _ *time.Time,
+) (string, int, error) {
+	return "q-sdkerr", 1, nil
+}
+
 // ── TestTruncate_utf8Safe_regression2026 ──────────────────────────────────────

 // TestTruncate_utf8Safe_regression2026 locks in the #2026 fix: truncate must