diff --git a/.github/workflows/redeploy-tenants-on-staging.yml b/.github/workflows/redeploy-tenants-on-staging.yml
index 7f191e8d..caaeb56e 100644
--- a/.github/workflows/redeploy-tenants-on-staging.yml
+++ b/.github/workflows/redeploy-tenants-on-staging.yml
@@ -172,12 +172,47 @@ jobs:
             jq -r '.results[]? | "| \(.slug) | \(.phase) | \(.ssm_status // "-") | \(.ssm_exit_code) | \(.healthz_ok) | \(.error // "-") |"' "$HTTP_RESPONSE" || true
           } >> "$GITHUB_STEP_SUMMARY"
 
-          if [ "$HTTP_CODE" != "200" ]; then
+          # Distinguish "real fleet failure" from "E2E teardown race".
+          #
+          # CP returns HTTP 500 + ok=false whenever ANY tenant in the
+          # fleet failed SSM or healthz. In practice the recurring source
+          # of these is ephemeral e2e-* tenants (saas/canvas/ext) being
+          # torn down by their parent E2E run mid-redeploy: the EC2 dies →
+          # SSM exit=2 or healthz timeout → CP marks the fleet failed →
+          # this workflow goes red even though every operator-facing
+          # tenant rolled fine.
+          #
+          # Filter: if HTTP=500/ok=false AND every failed slug matches
+          # ^e2e-, treat as soft-warn and let the verify step downstream
+          # handle the unreachable-vs-stale distinction (it already knows
+          # the difference per #2402). Any non-e2e-* failure or a non-500
+          # HTTP response remains a hard failure.
+          # Parse leniently: a non-JSON body (e.g. a proxy error page) must
+          # not abort the step here under errexit, or the HTTP-status error
+          # below would never be printed. Unparsable bodies count as ok=false.
+          OK=$(jq -r '.ok // "false"' "$HTTP_RESPONSE" 2>/dev/null || echo "false")
+          FAILED_SLUGS=$(jq -r '
+            .results[]?
+            | select((.healthz_ok != true) or (.ssm_status != "Success"))
+            | .slug' "$HTTP_RESPONSE" 2>/dev/null || true)
+          NON_E2E_FAILED=$(printf '%s\n' "$FAILED_SLUGS" | grep -v '^$' | grep -v '^e2e-' || true)
+
+          if [ "$HTTP_CODE" = "200" ] && [ "$OK" = "true" ]; then
+            : # happy path — fall through to verification
+          elif [ "$HTTP_CODE" = "500" ] && [ -z "$NON_E2E_FAILED" ] && [ -n "$FAILED_SLUGS" ]; then
+            COUNT=$(printf '%s\n' "$FAILED_SLUGS" | grep -c '^e2e-' || true)
+            echo "::warning::redeploy-fleet returned HTTP 500 but every failed tenant ($COUNT) is e2e-* ephemeral — treating as teardown race, soft-warning."
+            printf '%s\n' "$FAILED_SLUGS" | sed 's/^/::warning:: failed: /'
+          elif [ "$HTTP_CODE" != "200" ]; then
             echo "::error::redeploy-fleet returned HTTP $HTTP_CODE"
+            if [ -n "$NON_E2E_FAILED" ]; then
+              echo "::error::non-e2e tenant(s) failed:"
+              printf '%s\n' "$NON_E2E_FAILED" | sed 's/^/::error:: /'
+            fi
             exit 1
-          fi
-          OK=$(jq -r '.ok' "$HTTP_RESPONSE")
-          if [ "$OK" != "true" ]; then
+          else
+            # HTTP=200 but ok=false (shouldn't happen with current CP
+            # but keep the gate for completeness).
             echo "::error::redeploy-fleet reported ok=false (see summary for which tenant halted the rollout)"
             exit 1
           fi