diff --git a/.gitea/workflows/e2e-chat.yml b/.gitea/workflows/e2e-chat.yml
index 1ffadf3f4..d33f11c30 100644
--- a/.gitea/workflows/e2e-chat.yml
+++ b/.gitea/workflows/e2e-chat.yml
@@ -113,6 +113,28 @@ jobs:
     runs-on: docker-host
     # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
     # mc#1982: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
+    #
+    # PROMOTION-READINESS (toward required gate — do NOT flip continue-on-error
+    # without CTO sign-off, that's the irreversible call):
+    #   NOW FAIL-CLOSED:
+    #     - Postgres/Redis/platform/canvas readiness are already bounded
+    #       readiness-polls that hard-fail (and dump logs) at their deadline,
+    #       not fixed sleeps — preserved.
+    #     - passWithNoTests:false + forbidOnly (playwright.config.ts) → a
+    #       renamed/moved spec or stray test.only can no longer green the lane.
+    #     - REQUIRE-LIVE guard in "Run Playwright E2E tests" → chat==true must
+    #       actually execute >=1 test, else exit 1.
+    #     - chat-desktop "activity log" test no longer swallows its assertion.
+    #   STILL BLOCKS PROMOTION:
+    #     - The echo round-trip asserts on rendered "Echo: ..." text but never
+    #       asserts the echo runtime actually RECEIVED the A2A request
+    #       (fixtures/echo-runtime.ts exposes lastRequest, unused) — an
+    #       optimistic client-side render could pass without a real round-trip.
+    #       Add a server-received assertion before required.
+    #     - The "No-op pass" path (detect-changes chat!=true) is a legitimate
+    #       paths-filter skip, but a required gate needs it to be a neutral
+    #       check, not a green "success", so a skipped heavy lane can't be
+    #       mistaken for a passed one.
     continue-on-error: true
     timeout-minutes: 15
     env:
@@ -334,11 +356,32 @@ jobs:
       - name: Run Playwright E2E tests
         if: needs.detect-changes.outputs.chat == 'true'
         working-directory: canvas
+        env:
+          # CI=1 activates forbidOnly in playwright.config.ts (a stray
+          # `test.only` would otherwise green the suite while skipping the
+          # rest). passWithNoTests:false (also in the config) already makes
+          # a zero-match selection exit non-zero.
+          CI: "1"
         run: |
+          set -euo pipefail
           export E2E_PLATFORM_URL="http://127.0.0.1:${PLATFORM_PORT}"
           export E2E_DATABASE_URL="${DATABASE_URL}"
           export PLAYWRIGHT_BASE_URL="http://localhost:${CANVAS_PORT}"
-          npx playwright test e2e/chat-desktop.spec.ts e2e/chat-mobile.spec.ts
+
+          # REQUIRE-LIVE guard (mirrors CP serving-e2e SERVING_E2E_REQUIRE_LIVE):
+          # this lane reached here only because detect-changes said chat==true,
+          # so it MUST actually execute the round-trip specs. `pipefail` makes
+          # a real test failure (playwright non-zero) abort here under `set -e`;
+          # passWithNoTests:false makes a zero-match selection non-zero too. The
+          # explicit grep below is belt-and-braces: assert the list reporter
+          # printed an executed-count summary, so a silent all-skip / no-op can
+          # never report green.
+          npx playwright test e2e/chat-desktop.spec.ts e2e/chat-mobile.spec.ts \
+            --reporter=list 2>&1 | tee /tmp/pw-chat.out
+          if ! grep -qE '[0-9]+ (passed|failed|skipped)' /tmp/pw-chat.out; then
+            echo "::error::E2E Chat REQUIRE-LIVE: chat==true but Playwright reported no executed tests — specs missing or all-skipped, refusing to report green."
+            exit 1
+          fi
 
       - name: Dump platform log on failure
         if: failure() && needs.detect-changes.outputs.chat == 'true'
diff --git a/.gitea/workflows/e2e-staging-external.yml b/.gitea/workflows/e2e-staging-external.yml
index 8236e9b92..2b2432d4d 100644
--- a/.gitea/workflows/e2e-staging-external.yml
+++ b/.gitea/workflows/e2e-staging-external.yml
@@ -85,6 +85,25 @@ jobs:
     runs-on: ubuntu-latest
     # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
     # mc#1982: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
+    #
+    # PROMOTION-READINESS (toward required gate — do NOT flip continue-on-error
+    # without CTO sign-off, that's the irreversible call):
+    #   NOW FAIL-CLOSED:
+    #     - Missing CP_STAGING_ADMIN_API_TOKEN → hard exit 2 (preflight).
+    #     - Staging CP unhealthy → hard exit 1 (preflight, not a workspace bug).
+    #     - Harness E2E_REQUIRE_LIVE=1 → exit 5 if a clean exit didn't prove
+    #       all four awaiting_agent transitions (no silent skip).
+    #     - Sweep transition (step 6) is now a bounded readiness-poll, not a
+    #       fixed sleep + one-shot assert → no more sweep-cadence flake.
+    #     - register / re-register retry ONLY transient edge 5xx (bounded),
+    #       fail closed on 4xx → no more cold-boot-502 flake.
+    #   STILL BLOCKS PROMOTION:
+    #     - Single shared staging tenant + EC2 quota window: an infra-side
+    #       provisioning outage (not a code bug) would turn the gate red.
+    #       Needs an infra-class vs code-class signal split before required.
+    #     - "CP unhealthy → exit 1" currently looks identical to a real
+    #       failure on the run page; required-gate would need it demoted to
+    #       a neutral/skip so staging flakiness can't block merges.
     continue-on-error: true
     timeout-minutes: 25
 
@@ -124,6 +143,15 @@ jobs:
 
       - name: Run external-runtime E2E
         id: e2e
+        # E2E_REQUIRE_LIVE=1: the harness fails CLOSED (exit 5) if it ever
+        # reaches a clean exit without proving all four awaiting_agent
+        # transitions. Mirrors CP serving-e2e SERVING_E2E_REQUIRE_LIVE — a
+        # silent skip / early-return / dropped assertion can no longer
+        # masquerade as green. Token-missing and CP-unhealthy already
+        # hard-fail in the two preflight steps above, so reaching this step
+        # means a real cycle is expected.
+        env:
+          E2E_REQUIRE_LIVE: "1"
         run: bash tests/e2e/test_staging_external_runtime.sh
 
       # Mirror the e2e-staging-saas.yml safety net: if the runner is
diff --git a/canvas/e2e/chat-desktop.spec.ts b/canvas/e2e/chat-desktop.spec.ts
index 15bb2d880..bf3f1e22c 100644
--- a/canvas/e2e/chat-desktop.spec.ts
+++ b/canvas/e2e/chat-desktop.spec.ts
@@ -101,10 +101,19 @@ test.describe("Desktop ChatTab", () => {
     await textarea.fill("Trigger activity");
     await page.getByRole("button", { name: /Send/ }).first().click();
 
-    // Activity log container should appear during the send flow.
-    await expect(page.locator("[data-testid='activity-log']").first()).toBeVisible({ timeout: 10_000 }).catch(() => {
-      // Activity log may not be present in all layouts.
-    });
+    // FALSE-GREEN FIX: the prior `.catch(() => {})` swallowed the assertion
+    // entirely, so this test passed whether or not the activity log ever
+    // rendered. The activity-log container is optional per layout, so we
+    // gate on its presence in the DOM: if it's not part of this layout,
+    // skip explicitly (a recorded skip, not a silent pass); if it IS
+    // present, it MUST become visible during the send flow — that's the
+    // behaviour this test exists to protect.
+    const activityLog = page.locator("[data-testid='activity-log']").first();
+    if ((await activityLog.count()) === 0) {
+      test.skip(true, "activity-log not part of this layout");
+      return;
+    }
+    await expect(activityLog).toBeVisible({ timeout: 10_000 });
   });
 });
 
diff --git a/canvas/playwright.config.ts b/canvas/playwright.config.ts
index 88c32e0d7..eb8e542b6 100644
--- a/canvas/playwright.config.ts
+++ b/canvas/playwright.config.ts
@@ -7,6 +7,14 @@ export default defineConfig({
   fullyParallel: false,
   workers: 1,
   retries: 0,
+  // Fail CLOSED when an explicit spec selection matches zero tests.
+  // Playwright defaults this to true, so `playwright test e2e/chat-*.spec.ts`
+  // would exit 0 (green) if those files were renamed/moved/deleted — a
+  // false-green that would silently gut the e2e-chat gate after a refactor.
+  // forbidOnly likewise stops a stray `test.only` from green-ing the suite
+  // while skipping every other case.
+  passWithNoTests: false,
+  forbidOnly: !!process.env.CI,
   use: {
     baseURL: process.env.PLAYWRIGHT_BASE_URL || "http://localhost:3000",
     headless: true,
diff --git a/tests/e2e/test_staging_external_runtime.sh b/tests/e2e/test_staging_external_runtime.sh
index 68ca1b620..b4c8d4d70 100755
--- a/tests/e2e/test_staging_external_runtime.sh
+++ b/tests/e2e/test_staging_external_runtime.sh
@@ -40,9 +40,25 @@
 #   E2E_INTENTIONAL_FAILURE     1 → break a step on purpose to verify
 #                               the EXIT trap still tears down (mirrors
 #                               the full-saas harness's safety net).
+#   E2E_REQUIRE_LIVE            1 → fail-closed if the harness exits 0
+#                               WITHOUT having driven all four
+#                               awaiting_agent transitions. CI sets this
+#                               so a future skip / early-return can never
+#                               masquerade as a green run. Mirrors CP
+#                               serving-e2e SERVING_E2E_REQUIRE_LIVE.
+#   E2E_STALE_POLL_DEADLINE_SECS  default 240. Upper bound for the
+#                               heartbeat-staleness READINESS poll (step
+#                               6). Replaces the old fixed sleep+one-shot
+#                               assert that raced the sweep cadence.
+#   E2E_TRANSIENT_RETRIES      default 8. Bounded retries for register /
+#                               re-register against transient edge errors
+#                               (502/503/504 from Caddy during cold TLS /
+#                               agent boot). Mirrors the full-saas
+#                               cold-start retry loop — NOT a bare sleep.
 #
 # Exit codes: 0 happy, 1 generic, 2 missing env, 3 provision timeout,
-# 4 teardown leak.
+# 4 teardown leak, 5 REQUIRE_LIVE violation (exited 0 having validated
+# nothing).
 
 set -euo pipefail
 
@@ -51,6 +67,13 @@ ADMIN_TOKEN="${MOLECULE_ADMIN_TOKEN:?MOLECULE_ADMIN_TOKEN required — Railway s
 PROVISION_TIMEOUT_SECS="${E2E_PROVISION_TIMEOUT_SECS:-900}"
 RUN_ID_SUFFIX="${E2E_RUN_ID:-$(date +%H%M%S)-$$}"
 STALE_WAIT_SECS="${E2E_STALE_WAIT_SECS:-180}"
+# Readiness-poll deadline for the sweep transition (step 6). Must exceed
+# STALE_WAIT_SECS (the no-heartbeat window) by at least one sweep
+# interval so a slightly-late sweep tick is polled-for, not misread as a
+# stuck 'online'. 240 = 180s window + 60s sweep-cadence headroom.
+STALE_POLL_DEADLINE_SECS="${E2E_STALE_POLL_DEADLINE_SECS:-240}"
+TRANSIENT_RETRIES="${E2E_TRANSIENT_RETRIES:-8}"
+REQUIRE_LIVE="${E2E_REQUIRE_LIVE:-0}"
 
 SLUG="e2e-ext-$(date +%Y%m%d)-${RUN_ID_SUFFIX}"
 SLUG=$(echo "$SLUG" | tr '[:upper:]' '[:lower:]' | tr -cd 'a-z0-9-' | head -c 32)
@@ -59,6 +82,66 @@ log()  { echo "[$(date +%H:%M:%S)] $*"; }
 fail() { echo "[$(date +%H:%M:%S)] ❌ $*" >&2; exit 1; }
 ok()   { echo "[$(date +%H:%M:%S)] ✅ $*"; }
 
+# REQUIRE_LIVE bookkeeping: count the four awaiting_agent transitions the
+# test is contracted to prove. The EXIT trap fails-closed (exit 5) if the
+# script reaches a clean exit without all four — so a silent skip, an
+# early `return 0`, or a refactor that drops a step can never show green.
+TRANSITIONS_VERIFIED=0
+EXPECTED_TRANSITIONS=4
+require_transition() {  # $1 = human label
+  TRANSITIONS_VERIFIED=$((TRANSITIONS_VERIFIED + 1))
+  log "    [require-live] transition ${TRANSITIONS_VERIFIED}/${EXPECTED_TRANSITIONS} proven: $1"
+}
+
+# Redact bearer tokens from any HTTP body before logging (mirrors the
+# full-saas sanitize_http_body so transient-error logs never leak creds).
+sanitize_http_body() {
+  sed -E 's/(Bearer|token)[[:space:]]+[A-Za-z0-9._-]+/\1 REDACTED/g'
+}
+
+# Bounded retry-on-transient for POST /registry/register. The tenant edge
+# (Caddy) returns 502/503/504 with an identifiable body while TLS / the
+# workspace agent finishes cold-booting — a single shot here was the
+# un-named flake (a transient edge error misread as a register failure).
+# This mirrors the full-saas cold-start loop (test_staging_full_saas.sh
+# ~L780-816): retry ONLY on a transient TRANSPORT class (5xx + body
+# match), bounded by TRANSIENT_RETRIES, and FAIL CLOSED (non-zero) once
+# the budget is spent. It deliberately does NOT retry on a 4xx — that's a
+# real contract bug (e.g. wrong payload field) and must stay red.
+# Sets REGISTER_RESP (body + trailing "HTTP_CODE=NNN" line) on success;
+# returns non-zero (caller `fail`s) when the bounded budget is exhausted.
+register_with_retry() {  # $1 = step label, $2 = request body
+  local label="$1" body="$2"
+  local attempt code resp safe
+  for attempt in $(seq 1 "$TRANSIENT_RETRIES"); do
+    set +e
+    resp=$(curl -sS --max-time 30 -w "\nHTTP_CODE=%{http_code}" -X POST \
+      "$TENANT_URL/registry/register" \
+      -H "Authorization: Bearer $WS_AUTH_TOKEN" \
+      -H "X-Molecule-Org-Id: $ORG_ID" \
+      -H "Content-Type: application/json" \
+      -d "$body")
+    set -e
+    code=$(printf '%s' "$resp" | sed -n 's/^HTTP_CODE=//p' | tail -n1)
+    code=${code:-000}
+    if [ "$code" = "200" ]; then
+      REGISTER_RESP="$resp"
+      return 0
+    fi
+    safe=$(printf '%s' "$resp" | sanitize_http_body | head -c 300)
+    # Retry ONLY on a transient transport class; a 4xx is a real bug.
+    if echo "$code" | grep -Eq '^(502|503|504)$' \
+       && echo "$safe" | grep -Eqi 'Service Unavailable|Bad Gateway|Gateway Timeout|error code: 502|error code: 504|workspace agent unreachable|connection refused|no healthy upstream'; then
+      log "    ${label} transient $code attempt ${attempt}/${TRANSIENT_RETRIES}: $safe"
+      [ "$attempt" -lt "$TRANSIENT_RETRIES" ] && { sleep 10; continue; }
+    fi
+    # Non-transient (4xx, or unrecognized 5xx body): stop and fail closed.
+    REGISTER_RESP="$resp"
+    return 1
+  done
+  return 1
+}
+
 CURL_COMMON=(-sS --fail-with-body --max-time 30)
 
 # ─── cleanup trap (mirrors full-saas) ────────────────────────────────────
@@ -98,8 +181,19 @@ cleanup_org() {
   fi
   ok "Teardown clean — no orphan resources for $SLUG (${elapsed}s)"
 
+  # REQUIRE_LIVE fail-closed gate. Only meaningful on an OTHERWISE-CLEAN
+  # exit (entry_rc==0): a script that completed all steps but somehow did
+  # not register all four transitions (a skip, an early return, a dropped
+  # assertion in a refactor) must NOT report success. A non-zero entry_rc
+  # already carries its own failure semantics — don't mask it with 5.
+  if [ "$entry_rc" = "0" ] && [ "${REQUIRE_LIVE}" = "1" ] \
+     && [ "$TRANSITIONS_VERIFIED" -lt "$EXPECTED_TRANSITIONS" ]; then
+    echo "❌ REQUIRE_LIVE: exited 0 but only ${TRANSITIONS_VERIFIED}/${EXPECTED_TRANSITIONS} awaiting_agent transitions were proven — refusing to report green." >&2
+    exit 5
+  fi
+
   case "$entry_rc" in
-    0|1|2|3|4) ;;
+    0|1|2|3|4|5) ;;
     *) exit 1 ;;
   esac
 }
@@ -248,6 +342,7 @@ GET_RESP=$(tenant_call GET "/workspaces/$WS_ID")
 DB_STATUS=$(echo "$GET_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('status',''))")
 [ "$DB_STATUS" != "awaiting_agent" ] && fail "DB row status=$DB_STATUS (expected awaiting_agent — migration 046 likely not applied)"
 ok "DB row stored as awaiting_agent (proof migration 046 applied)"
+require_transition "create: provisioning → awaiting_agent (DB-verified)"
 
 # ─── 5. Register the workspace (transitions to online) ──────────────────
 # Pre-fix this path was actually fine because it writes 'online', a value
@@ -277,20 +372,20 @@ log "5/8 Registering workspace via /registry/register..."
 #   url           — accepted but not dispatched-to in poll mode, so
 #                   example.invalid is a valid sentinel.
 REGISTER_BODY=$(printf '{"id":"%s","url":"https://example.invalid:443","delivery_mode":"poll","agent_card":{"name":"e2e-ext","skills":[{"id":"echo","name":"Echo"}]}}' "$WS_ID")
-# Disable --fail-with-body for this one call so a 4xx surfaces the response
-# body (the bare CURL_COMMON would `set -e`-kill before we could log it).
-REGISTER_RESP=$(curl -sS --max-time 30 -w "\nHTTP_CODE=%{http_code}" -X POST "$TENANT_URL/registry/register" \
-  -H "Authorization: Bearer $WS_AUTH_TOKEN" \
-  -H "X-Molecule-Org-Id: $ORG_ID" \
-  -H "Content-Type: application/json" \
-  -d "$REGISTER_BODY") || true
-log "    register response: $(echo "$REGISTER_RESP" | head -c 300)"
-echo "$REGISTER_RESP" | grep -q "HTTP_CODE=200" || fail "register returned non-200 — see body above"
+# Bounded retry-on-transient (see register_with_retry). The previous
+# single-shot here would `fail` on a cold-boot 502 from the tenant edge —
+# an un-named transient misread as a register break. The helper retries
+# ONLY that class and fails closed on a real 4xx or an exhausted budget.
+REGISTER_RESP=""
+register_with_retry "register" "$REGISTER_BODY" \
+  || fail "register returned non-200 after bounded retries — body: $(printf '%s' "$REGISTER_RESP" | sanitize_http_body | head -c 300)"
+log "    register response: $(echo "$REGISTER_RESP" | sanitize_http_body | head -c 300)"
 
 GET_RESP=$(tenant_call GET "/workspaces/$WS_ID")
 ONLINE_STATUS=$(echo "$GET_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('status',''))")
 [ "$ONLINE_STATUS" != "online" ] && fail "Expected online after register, got $ONLINE_STATUS"
 ok "Workspace transitioned to online"
+require_transition "register: awaiting_agent → online"
 
 # Confirm the register handler echoed back delivery_mode=poll. We read
 # this from the register RESPONSE, not the workspace GET response, because
@@ -310,38 +405,63 @@ fi
 # This is the SECOND silent-failure path (registry/healthsweep.go's
 # sweepStaleRemoteWorkspaces). Pre-migration-046 the heartbeat-staleness
 # UPDATE silently failed and the workspace stuck on 'online' forever
-# even though no agent was alive. We wait the full window + a sweep
-# interval and assert the row transitions back to 'awaiting_agent'.
-log "6/8 Waiting ${STALE_WAIT_SECS}s for heartbeat-staleness sweep (no heartbeat sent)..."
+# even though no agent was alive.
+#
+# FLAKE FIX (named: sweep-cadence race). The old code did a FIXED
+# `sleep $STALE_WAIT_SECS` then a SINGLE assert. The staleness sweep is a
+# periodic tick (REMOTE_LIVENESS_STALE_AFTER + a sweep interval); if the
+# tick that flips the row lands even one second after the fixed sleep, the
+# one-shot GET reads 'online' and the test fails — a real transition,
+# misread as a flake because the assert was racing the sweep cadence.
+# Replace with: sleep through the mandatory no-heartbeat window ONCE (the
+# sweep cannot fire before the window elapses, so polling earlier is
+# pointless), then READINESS-POLL for the awaiting_agent transition up to
+# STALE_POLL_DEADLINE_SECS, hard-failing with a clear message at the
+# deadline. Deterministic: a slow-but-working sweep passes; a genuinely
+# stuck 'online' still fails (now with how long we actually waited).
+log "6/8 Waiting ${STALE_WAIT_SECS}s no-heartbeat window, then polling for sweep (up to ${STALE_POLL_DEADLINE_SECS}s total)..."
+[ "$STALE_POLL_DEADLINE_SECS" -le "$STALE_WAIT_SECS" ] && \
+  fail "Misconfigured: STALE_POLL_DEADLINE_SECS ($STALE_POLL_DEADLINE_SECS) must exceed STALE_WAIT_SECS ($STALE_WAIT_SECS) by at least one sweep interval"
 sleep "$STALE_WAIT_SECS"
 
-GET_RESP=$(tenant_call GET "/workspaces/$WS_ID")
-STALE_STATUS=$(echo "$GET_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('status',''))")
-[ "$STALE_STATUS" != "awaiting_agent" ] && \
-  fail "After ${STALE_WAIT_SECS}s with no heartbeat, expected status=awaiting_agent (sweep transition), got $STALE_STATUS — migration 046 likely not applied OR sweep not running"
+STALE_DEADLINE=$(( $(date +%s) + (STALE_POLL_DEADLINE_SECS - STALE_WAIT_SECS) ))
+STALE_STATUS=""
+while true; do
+  GET_RESP=$(tenant_call GET "/workspaces/$WS_ID")
+  STALE_STATUS=$(echo "$GET_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('status',''))")
+  [ "$STALE_STATUS" = "awaiting_agent" ] && break
+  if [ "$(date +%s)" -gt "$STALE_DEADLINE" ]; then
+    fail "After ${STALE_POLL_DEADLINE_SECS}s with no heartbeat, status still '$STALE_STATUS' (expected awaiting_agent sweep transition) — migration 046 likely not applied OR sweep not running"
+  fi
+  sleep 10
+done
 ok "Heartbeat-staleness sweep transitioned online → awaiting_agent (proof healthsweep.go fix working)"
+require_transition "sweep: online → awaiting_agent (no heartbeat)"
 
 # ─── 7. Re-register and confirm we can come back online ─────────────────
 # This proves the awaiting_agent state is recoverable (re-registrable),
 # which is the whole point of using it instead of 'offline'.
 log "7/8 Re-registering after stale → confirming recovery to online..."
 # Same payload contract as step 5 (id + agent_card both required). See note
-# there for why workspace_id would 400.
-REREG_RESP=$(curl -sS --max-time 30 -w "\nHTTP_CODE=%{http_code}" -X POST "$TENANT_URL/registry/register" \
-  -H "Authorization: Bearer $WS_AUTH_TOKEN" \
-  -H "X-Molecule-Org-Id: $ORG_ID" \
-  -H "Content-Type: application/json" \
-  -d "$REGISTER_BODY") || true
-log "    re-register response: $(echo "$REREG_RESP" | head -c 300)"
-echo "$REREG_RESP" | grep -q "HTTP_CODE=200" || fail "re-register returned non-200 — see body above"
+# there for why workspace_id would 400. Same bounded retry-on-transient.
+REGISTER_RESP=""
+register_with_retry "re-register" "$REGISTER_BODY" \
+  || fail "re-register returned non-200 after bounded retries — body: $(printf '%s' "$REGISTER_RESP" | sanitize_http_body | head -c 300)"
+log "    re-register response: $(echo "$REGISTER_RESP" | sanitize_http_body | head -c 300)"
 
 GET_RESP=$(tenant_call GET "/workspaces/$WS_ID")
 RECOVERED_STATUS=$(echo "$GET_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('status',''))")
 [ "$RECOVERED_STATUS" != "online" ] && \
   fail "Expected re-register to return workspace to online, got $RECOVERED_STATUS"
 ok "Re-register succeeded — awaiting_agent → online (operator-recoverable)"
+require_transition "re-register: awaiting_agent → online (recovery)"
 
 # ─── 8. Done — cleanup runs in the EXIT trap ───────────────────────────
+# REQUIRE_LIVE belt-and-braces: assert here too (in addition to the EXIT
+# trap) so the failure surfaces in step order, not only post-teardown.
+if [ "${REQUIRE_LIVE}" = "1" ] && [ "$TRANSITIONS_VERIFIED" -lt "$EXPECTED_TRANSITIONS" ]; then
+  fail "REQUIRE_LIVE: only ${TRANSITIONS_VERIFIED}/${EXPECTED_TRANSITIONS} transitions proven at end of run"
+fi
 log "8/8 All four awaiting_agent transitions verified."
 log "═══════════════════════════════════════════════════════════════════"
 ok "External-runtime E2E PASSED on $SLUG"