From 596e797dca895789e0dc7f5cab305f4782fc23c0 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Sun, 3 May 2026 04:28:29 -0700 Subject: [PATCH] ci(deploy): broaden ephemeral-prefix matchers to cover rt-e2e-* MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The redeploy-tenants-on-staging soft-warn filter and the sweep-stale-e2e-orgs janitor both hardcoded `^e2e-` to identify ephemeral test tenants. Runtime-test harness fixtures (RFC #2251) mint slugs prefixed with `rt-e2e-`, which neither matcher recognized. Concrete impact observed today: - Two `rt-e2e-v{5,6}-*` tenants left orphaned 8h on staging (sweep-stale-e2e-orgs ignored them). - On the next staging redeploy their phantom EC2s returned `InvalidInstanceId: Instances not in a valid state for account` from SSM SendCommand → CP returned HTTP 500 + ok=false. - The redeploy soft-warn missed them too, so the workflow went red, which broke the auto-promote-staging chain feeding the canvas warm-paper rollout to prod. Fix: switch both matchers to recognize the alternation `^(e2e-|rt-e2e-)`. Long-lived prefixes (demo-prep, dryrun-*, dryrun2-*) remain non-ephemeral and continue to hard-fail. Comment documents the source-of-truth list and the cross-file invariant. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../workflows/redeploy-tenants-on-staging.yml | 40 +++++++++++-------- .github/workflows/sweep-stale-e2e-orgs.yml | 14 +++++-- 2 files changed, 34 insertions(+), 20 deletions(-) diff --git a/.github/workflows/redeploy-tenants-on-staging.yml b/.github/workflows/redeploy-tenants-on-staging.yml index caaeb56e..97392172 100644 --- a/.github/workflows/redeploy-tenants-on-staging.yml +++ b/.github/workflows/redeploy-tenants-on-staging.yml @@ -176,35 +176,41 @@ jobs: # # CP returns HTTP 500 + ok=false whenever ANY tenant in the # fleet failed SSM or healthz. In practice the recurring source - # of these is ephemeral e2e-* tenants (saas/canvas/ext) being - # torn down by their parent E2E run mid-redeploy: the EC2 dies → - # SSM exit=2 or healthz timeout → CP marks the fleet failed → - # this workflow goes red even though every operator-facing - # tenant rolled fine. + # of these is ephemeral test tenants being torn down by their + # parent E2E run mid-redeploy: the EC2 dies → SSM exit=2 or + # healthz timeout → CP marks the fleet failed → this workflow + # goes red even though every operator-facing tenant rolled fine. # - # Filter: if HTTP=500/ok=false AND every failed slug matches - # ^e2e-, treat as soft-warn and let the verify step downstream - # handle the unreachable-vs-stale distinction (it already knows - # the difference per #2402). Any non-e2e-* failure or a non-500 - # HTTP response remains a hard failure. + # Ephemeral slug prefixes (kept in sync with sweep-stale-e2e-orgs.yml + # — see that file for the source-of-truth list and rationale): + # - e2e-* — canvas/saas/ext E2E suites + # - rt-e2e-* — runtime-test harness fixtures (RFC #2251) + # Long-lived prefixes that are NOT ephemeral and MUST hard-fail: + # demo-prep, dryrun-*, dryrun2-*, plus all human tenant slugs. + # + # Filter: if HTTP=500/ok=false AND every failed slug matches an + # ephemeral prefix, treat as soft-warn and let the verify step + # downstream handle unreachable-vs-stale (#2402). Any non-ephemeral + # failure or a non-500 HTTP response remains a hard failure. OK=$(jq -r '.ok // "false"' "$HTTP_RESPONSE") FAILED_SLUGS=$(jq -r ' .results[]? | select((.healthz_ok != true) or (.ssm_status != "Success")) | .slug' "$HTTP_RESPONSE" 2>/dev/null || true) - NON_E2E_FAILED=$(printf '%s\n' "$FAILED_SLUGS" | grep -v '^$' | grep -v '^e2e-' || true) + EPHEMERAL_PREFIX_RE='^(e2e-|rt-e2e-)' + NON_EPHEMERAL_FAILED=$(printf '%s\n' "$FAILED_SLUGS" | grep -v '^$' | grep -Ev "$EPHEMERAL_PREFIX_RE" || true) if [ "$HTTP_CODE" = "200" ] && [ "$OK" = "true" ]; then : # happy path — fall through to verification - elif [ "$HTTP_CODE" = "500" ] && [ -z "$NON_E2E_FAILED" ] && [ -n "$FAILED_SLUGS" ]; then - COUNT=$(printf '%s\n' "$FAILED_SLUGS" | grep -c '^e2e-' || true) - echo "::warning::redeploy-fleet returned HTTP 500 but every failed tenant ($COUNT) is e2e-* ephemeral — treating as teardown race, soft-warning." + elif [ "$HTTP_CODE" = "500" ] && [ -z "$NON_EPHEMERAL_FAILED" ] && [ -n "$FAILED_SLUGS" ]; then + COUNT=$(printf '%s\n' "$FAILED_SLUGS" | grep -Ec "$EPHEMERAL_PREFIX_RE" || true) + echo "::warning::redeploy-fleet returned HTTP 500 but every failed tenant ($COUNT) is ephemeral (e2e-*/rt-e2e-*) — treating as teardown race, soft-warning." printf '%s\n' "$FAILED_SLUGS" | sed 's/^/::warning:: failed: /' elif [ "$HTTP_CODE" != "200" ]; then echo "::error::redeploy-fleet returned HTTP $HTTP_CODE" - if [ -n "$NON_E2E_FAILED" ]; then - echo "::error::non-e2e tenant(s) failed:" - printf '%s\n' "$NON_E2E_FAILED" | sed 's/^/::error:: /' + if [ -n "$NON_EPHEMERAL_FAILED" ]; then + echo "::error::non-ephemeral tenant(s) failed:" + printf '%s\n' "$NON_EPHEMERAL_FAILED" | sed 's/^/::error:: /' fi exit 1 else diff --git a/.github/workflows/sweep-stale-e2e-orgs.yml b/.github/workflows/sweep-stale-e2e-orgs.yml index 6913cba2..5a0dce30 100644 --- a/.github/workflows/sweep-stale-e2e-orgs.yml +++ b/.github/workflows/sweep-stale-e2e-orgs.yml @@ -87,20 +87,28 @@ jobs: > orgs.json # Filter: - # 1. slug starts with 'e2e-' (covers e2e-, e2e-canary-, - # e2e-canvas-* — all variants the test scripts mint) + # 1. slug starts with one of the ephemeral test prefixes: + # - 'e2e-' — covers e2e-canary-, e2e-canvas-*, etc. + # - 'rt-e2e-' — runtime-test harness fixtures (RFC #2251); + # missing this prefix left two such tenants + # orphaned 8h on staging (2026-05-03), then + # hard-failed redeploy-tenants-on-staging + # and broke the staging→main auto-promote + # chain. Kept in sync with the EPHEMERAL_PREFIX_RE + # regex in redeploy-tenants-on-staging.yml. # 2. created_at is older than MAX_AGE_MINUTES ago # Output one slug per line to a file the next step reads. python3 > stale_slugs.txt <<'PY' import json, os from datetime import datetime, timezone, timedelta + EPHEMERAL_PREFIXES = ("e2e-", "rt-e2e-") with open("orgs.json") as f: data = json.load(f) max_age = int(os.environ["MAX_AGE_MINUTES"]) cutoff = datetime.now(timezone.utc) - timedelta(minutes=max_age) for o in data.get("orgs", []): slug = o.get("slug", "") - if not slug.startswith("e2e-"): + if not slug.startswith(EPHEMERAL_PREFIXES): continue created = o.get("created_at") if not created: