forked from molecule-ai/molecule-core
Merge pull request #2511 from Molecule-AI/fix/redeploy-tolerate-e2e-teardown-race
fix(redeploy-staging): tolerate e2e-* teardown race in fleet HTTP 500
This commit is contained in:
commit
3d7b4b70ff
@@ -172,12 +172,44 @@ jobs:
|
||||
jq -r '.results[]? | "| \(.slug) | \(.phase) | \(.ssm_status // "-") | \(.ssm_exit_code) | \(.healthz_ok) | \(.error // "-") |"' "$HTTP_RESPONSE" || true
|
||||
} >> "$GITHUB_STEP_SUMMARY"
|
||||
|
||||
if [ "$HTTP_CODE" != "200" ]; then
|
||||
# Distinguish "real fleet failure" from "E2E teardown race".
|
||||
#
|
||||
# CP returns HTTP 500 + ok=false whenever ANY tenant in the
|
||||
# fleet failed SSM or healthz. In practice the recurring source
|
||||
# of these is ephemeral e2e-* tenants (saas/canvas/ext) being
|
||||
# torn down by their parent E2E run mid-redeploy: the EC2 dies →
|
||||
# SSM exit=2 or healthz timeout → CP marks the fleet failed →
|
||||
# this workflow goes red even though every operator-facing
|
||||
# tenant rolled fine.
|
||||
#
|
||||
# Filter: if HTTP=500/ok=false AND every failed slug matches
|
||||
# ^e2e-, treat as soft-warn and let the verify step downstream
|
||||
# handle the unreachable-vs-stale distinction (it already knows
|
||||
# the difference per #2402). Any non-e2e-* failure or a non-500
|
||||
# HTTP response remains a hard failure.
|
||||
OK=$(jq -r '.ok // "false"' "$HTTP_RESPONSE")
|
||||
FAILED_SLUGS=$(jq -r '
|
||||
.results[]?
|
||||
| select((.healthz_ok != true) or (.ssm_status != "Success"))
|
||||
| .slug' "$HTTP_RESPONSE" 2>/dev/null || true)
|
||||
NON_E2E_FAILED=$(printf '%s\n' "$FAILED_SLUGS" | grep -v '^$' | grep -v '^e2e-' || true)
|
||||
|
||||
if [ "$HTTP_CODE" = "200" ] && [ "$OK" = "true" ]; then
|
||||
: # happy path — fall through to verification
|
||||
elif [ "$HTTP_CODE" = "500" ] && [ -z "$NON_E2E_FAILED" ] && [ -n "$FAILED_SLUGS" ]; then
|
||||
COUNT=$(printf '%s\n' "$FAILED_SLUGS" | grep -c '^e2e-' || true)
|
||||
echo "::warning::redeploy-fleet returned HTTP 500 but every failed tenant ($COUNT) is e2e-* ephemeral — treating as teardown race, soft-warning."
|
||||
printf '%s\n' "$FAILED_SLUGS" | sed 's/^/::warning:: failed: /'
|
||||
elif [ "$HTTP_CODE" != "200" ]; then
|
||||
echo "::error::redeploy-fleet returned HTTP $HTTP_CODE"
|
||||
if [ -n "$NON_E2E_FAILED" ]; then
|
||||
echo "::error::non-e2e tenant(s) failed:"
|
||||
printf '%s\n' "$NON_E2E_FAILED" | sed 's/^/::error:: /'
|
||||
fi
|
||||
exit 1
|
||||
fi
|
||||
OK=$(jq -r '.ok' "$HTTP_RESPONSE")
|
||||
if [ "$OK" != "true" ]; then
|
||||
else
|
||||
# HTTP=200 but ok=false (shouldn't happen with current CP
|
||||
# but keep the gate for completeness).
|
||||
echo "::error::redeploy-fleet reported ok=false (see summary for which tenant halted the rollout)"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
Loading…
Reference in New Issue
Block a user