From ec39fecda295adbf07641212b2419e3c3d411eca Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Thu, 30 Apr 2026 11:35:56 -0700 Subject: [PATCH] fix(ci): hard-fail when >50% of fleet unreachable post-redeploy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Belt-and-suspenders sanity floor on top of the unreachable-soft-warn introduced earlier in this PR. Addresses the residual gap noted in review: if a new image crashes on startup, every tenant ends up unreachable, and the soft-warn alone would let that ship as a green deploy. Canary-verify catches it on the canary tenant first, but this guard is a fallback for canary-skip dispatches and same-batch races. Threshold is 50% of healthz_ok-snapshotted tenants — comfortably above the typical e2e-* teardown rate (5-10/hour, ~1 ephemeral tenant per batch) but below any plausible real-outage scenario. Mirrored across staging.yml + main.yml for shape parity. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/redeploy-tenants-on-main.yml | 10 ++++++++++ .github/workflows/redeploy-tenants-on-staging.yml | 12 ++++++++++++ 2 files changed, 22 insertions(+) diff --git a/.github/workflows/redeploy-tenants-on-main.yml b/.github/workflows/redeploy-tenants-on-main.yml index efacbe69..466c0249 100644 --- a/.github/workflows/redeploy-tenants-on-main.yml +++ b/.github/workflows/redeploy-tenants-on-main.yml @@ -306,6 +306,16 @@ jobs: if [ $UNREACHABLE_COUNT -gt 0 ]; then echo "::warning::$UNREACHABLE_COUNT tenant(s) unreachable post-redeploy. Likely benign teardown race — CP healthz monitor catches real outages." fi + + # Belt-and-suspenders sanity floor: same logic as the staging + # variant — if MORE than half the prod fleet is unreachable, + # this is a real outage, not a teardown race. Hard-fail. + TOTAL_VERIFIED=${#SLUGS[@]} + if [ $UNREACHABLE_COUNT -gt $((TOTAL_VERIFIED / 2)) ]; then + echo "::error::$UNREACHABLE_COUNT of $TOTAL_VERIFIED tenant(s) unreachable — exceeds 50% threshold. Likely real outage, not teardown race." + exit 1 + fi + if [ $STALE_COUNT -gt 0 ]; then echo "::error::$STALE_COUNT tenant(s) returned a stale SHA. ssm_status=Success was misleading — see job summary." exit 1 diff --git a/.github/workflows/redeploy-tenants-on-staging.yml b/.github/workflows/redeploy-tenants-on-staging.yml index 125f25c1..381d0a65 100644 --- a/.github/workflows/redeploy-tenants-on-staging.yml +++ b/.github/workflows/redeploy-tenants-on-staging.yml @@ -283,6 +283,18 @@ jobs: if [ $UNREACHABLE_COUNT -gt 0 ]; then echo "::warning::$UNREACHABLE_COUNT staging tenant(s) unreachable post-redeploy. Likely benign teardown race — CP healthz monitor catches real outages." fi + + # Belt-and-suspenders sanity floor: if MORE than half the fleet is + # unreachable, this isn't a teardown race — it's a real outage + # (e.g. the new image crashes on startup). Hard-fail. Canary-verify + # would catch this on the canary tenant first; this guard is a + # fallback for canary-skip dispatches and same-batch races. + TOTAL_VERIFIED=${#SLUGS[@]} + if [ $UNREACHABLE_COUNT -gt $((TOTAL_VERIFIED / 2)) ]; then + echo "::error::$UNREACHABLE_COUNT of $TOTAL_VERIFIED staging tenant(s) unreachable — exceeds 50% threshold. Likely real outage, not teardown race." + exit 1 + fi + if [ $STALE_COUNT -gt 0 ]; then echo "::error::$STALE_COUNT staging tenant(s) returned a stale SHA. ssm_status=Success was misleading — see job summary." exit 1