From ec39fecda295adbf07641212b2419e3c3d411eca Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Thu, 30 Apr 2026 11:35:56 -0700 Subject: [PATCH 1/2] fix(ci): hard-fail when >50% of fleet unreachable post-redeploy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Belt-and-suspenders sanity floor on top of the unreachable-soft-warn introduced earlier in this PR. Addresses the residual gap noted in review: if a new image crashes on startup, every tenant ends up unreachable, and the soft-warn alone would let that ship as a green deploy. Canary-verify catches it on the canary tenant first, but this guard is a fallback for canary-skip dispatches and same-batch races. Threshold is 50% of healthz_ok-snapshotted tenants — comfortably above the typical e2e-* teardown rate (5-10/hour, ~1 ephemeral tenant per batch) but below any plausible real-outage scenario. Mirrored across staging.yml + main.yml for shape parity. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/redeploy-tenants-on-main.yml | 10 ++++++++++ .github/workflows/redeploy-tenants-on-staging.yml | 12 ++++++++++++ 2 files changed, 22 insertions(+) diff --git a/.github/workflows/redeploy-tenants-on-main.yml b/.github/workflows/redeploy-tenants-on-main.yml index efacbe69..466c0249 100644 --- a/.github/workflows/redeploy-tenants-on-main.yml +++ b/.github/workflows/redeploy-tenants-on-main.yml @@ -306,6 +306,16 @@ jobs: if [ $UNREACHABLE_COUNT -gt 0 ]; then echo "::warning::$UNREACHABLE_COUNT tenant(s) unreachable post-redeploy. Likely benign teardown race — CP healthz monitor catches real outages." fi + + # Belt-and-suspenders sanity floor: same logic as the staging + # variant — if MORE than half the prod fleet is unreachable, + # this is a real outage, not a teardown race. Hard-fail. + TOTAL_VERIFIED=${#SLUGS[@]} + if [ $UNREACHABLE_COUNT -gt $((TOTAL_VERIFIED / 2)) ]; then + echo "::error::$UNREACHABLE_COUNT of $TOTAL_VERIFIED tenant(s) unreachable — exceeds 50% threshold. Likely real outage, not teardown race." + exit 1 + fi + if [ $STALE_COUNT -gt 0 ]; then echo "::error::$STALE_COUNT tenant(s) returned a stale SHA. ssm_status=Success was misleading — see job summary." exit 1 diff --git a/.github/workflows/redeploy-tenants-on-staging.yml b/.github/workflows/redeploy-tenants-on-staging.yml index 125f25c1..381d0a65 100644 --- a/.github/workflows/redeploy-tenants-on-staging.yml +++ b/.github/workflows/redeploy-tenants-on-staging.yml @@ -283,6 +283,18 @@ jobs: if [ $UNREACHABLE_COUNT -gt 0 ]; then echo "::warning::$UNREACHABLE_COUNT staging tenant(s) unreachable post-redeploy. Likely benign teardown race — CP healthz monitor catches real outages." fi + + # Belt-and-suspenders sanity floor: if MORE than half the fleet is + # unreachable, this isn't a teardown race — it's a real outage + # (e.g. the new image crashes on startup). Hard-fail. Canary-verify + # would catch this on the canary tenant first; this guard is a + # fallback for canary-skip dispatches and same-batch races. + TOTAL_VERIFIED=${#SLUGS[@]} + if [ $UNREACHABLE_COUNT -gt $((TOTAL_VERIFIED / 2)) ]; then + echo "::error::$UNREACHABLE_COUNT of $TOTAL_VERIFIED staging tenant(s) unreachable — exceeds 50% threshold. Likely real outage, not teardown race." + exit 1 + fi + if [ $STALE_COUNT -gt 0 ]; then echo "::error::$STALE_COUNT staging tenant(s) returned a stale SHA. ssm_status=Success was misleading — see job summary." exit 1 From 9b909c4459f8724bc9ab53866eae419a784ba9b5 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Thu, 30 Apr 2026 11:40:31 -0700 Subject: [PATCH 2/2] fix(ci): gate 50%-floor on TOTAL_VERIFIED >= 4 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Self-review of #2403 caught a regression: with a 1-tenant fleet (the exact case the original #2402 fix targeted), the new floor would re-introduce the flake. Trace: TOTAL=1, UNREACHABLE=1, $((1/2))=0 if 1 -gt 0 → TRUE → exit 1 The 50%-rule only meaningfully distinguishes "real outage" from "teardown race" when the fleet is large enough that "half down" is statistically meaningful. With 1-3 tenants, canary-verify is the actual gate (it runs against the canary first and aborts the rollout if the canary fails to come up). Gate the floor on TOTAL_VERIFIED >= 4. Truth table: TOTAL UNREACHABLE RESULT 1 1 soft-warn (original e2e flake case) 4 2 soft-warn (exactly half) 4 3 hard-fail (75% — real outage) 10 6 hard-fail (60% — real outage) Mirrored across staging.yml + main.yml. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../workflows/redeploy-tenants-on-main.yml | 9 +++++---- .../workflows/redeploy-tenants-on-staging.yml | 19 +++++++++++++------ 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/.github/workflows/redeploy-tenants-on-main.yml b/.github/workflows/redeploy-tenants-on-main.yml index 466c0249..46743347 100644 --- a/.github/workflows/redeploy-tenants-on-main.yml +++ b/.github/workflows/redeploy-tenants-on-main.yml @@ -308,11 +308,12 @@ jobs: fi # Belt-and-suspenders sanity floor: same logic as the staging - # variant — if MORE than half the prod fleet is unreachable, - # this is a real outage, not a teardown race. Hard-fail. + # variant — see that file's comment for the full rationale. + # Floor only applies when fleet >= 4; below that, canary-verify + # is the actual gate. TOTAL_VERIFIED=${#SLUGS[@]} - if [ $UNREACHABLE_COUNT -gt $((TOTAL_VERIFIED / 2)) ]; then - echo "::error::$UNREACHABLE_COUNT of $TOTAL_VERIFIED tenant(s) unreachable — exceeds 50% threshold. Likely real outage, not teardown race." + if [ $TOTAL_VERIFIED -ge 4 ] && [ $UNREACHABLE_COUNT -gt $((TOTAL_VERIFIED / 2)) ]; then + echo "::error::$UNREACHABLE_COUNT of $TOTAL_VERIFIED tenant(s) unreachable — exceeds 50% threshold on a fleet large enough that this signals a real outage, not teardown race." exit 1 fi diff --git a/.github/workflows/redeploy-tenants-on-staging.yml b/.github/workflows/redeploy-tenants-on-staging.yml index 381d0a65..7f191e8d 100644 --- a/.github/workflows/redeploy-tenants-on-staging.yml +++ b/.github/workflows/redeploy-tenants-on-staging.yml @@ -285,13 +285,20 @@ jobs: fi # Belt-and-suspenders sanity floor: if MORE than half the fleet is - # unreachable, this isn't a teardown race — it's a real outage - # (e.g. the new image crashes on startup). Hard-fail. Canary-verify - # would catch this on the canary tenant first; this guard is a - # fallback for canary-skip dispatches and same-batch races. + # unreachable AND the fleet is large enough that "half down" is + # statistically meaningful, this is a real outage (e.g. new image + # crashes on startup), not a teardown race. Hard-fail. + # + # Floor only applies when TOTAL_VERIFIED >= 4 — below that, the + # canary-verify step is the actual gate for "all tenants down" + # detection (it runs against the canary first and aborts the + # rollout if the canary fails to come up). Without the >=4 gate, + # a 1-tenant fleet (e.g. a single ephemeral e2e-* tenant on a + # quiet staging push) would re-flake on the exact teardown-race + # condition #2402 fixed: 1 of 1 unreachable = 100% > 50% → fail. TOTAL_VERIFIED=${#SLUGS[@]} - if [ $UNREACHABLE_COUNT -gt $((TOTAL_VERIFIED / 2)) ]; then - echo "::error::$UNREACHABLE_COUNT of $TOTAL_VERIFIED staging tenant(s) unreachable — exceeds 50% threshold. Likely real outage, not teardown race." + if [ $TOTAL_VERIFIED -ge 4 ] && [ $UNREACHABLE_COUNT -gt $((TOTAL_VERIFIED / 2)) ]; then + echo "::error::$UNREACHABLE_COUNT of $TOTAL_VERIFIED staging tenant(s) unreachable — exceeds 50% threshold on a fleet large enough that this signals a real outage, not teardown race." exit 1 fi