fix(e2e-sanity): normalize unexpected curl exit codes in cleanup trap (#2159)

When E2E_INTENTIONAL_FAILURE=1 poisons the tenant token, step 5/11's
`tenant_call POST /workspaces` curl exits 22 (HTTP error under
--fail-with-body). `set -e` propagates rc=22 directly, but the
script's documented contract emits only {0,1,2,3,4}, and the sanity
workflow's case statement only matches those. rc=22 falls through
to "Unexpected rc — investigate harness" and opens a false-positive
priority-high "safety net broken" issue (#2159, weekly run on
2026-04-27).

The trap now captures $? at entry (must be the first statement
before any command clobbers it) and at the end normalizes any
non-contract code to 1 (generic failure). Leak detection continues
to exit 4 directly, so its semantics are preserved.

Adds tests/e2e/test_harness_rc_normalization.sh — a self-contained
regression test that builds a stub harness with the same trap
pattern, triggers controlled exit codes, and asserts the
normalization. Covers the 5 contracted codes + curl-22 (the bug) +
3 representative network-failure codes + sigsegv-139.

Verification:
  - 10/10 regression tests pass
  - shellcheck clean on both modified files
  - production teardown path unchanged for legitimate {1,2,3,4}
    failures and the leak-detection exit 4

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Hongming Wang 2026-04-27 02:55:44 -07:00
parent e99f937630
commit 99fb61bb8c
2 changed files with 108 additions and 1 deletions

View File

@ -0,0 +1,88 @@
#!/usr/bin/env bash
# Regression test for #2159: when test_staging_full_saas.sh exits via
# `set -e` propagating an exit code outside the documented contract
# {0,1,2,3,4}, the cleanup trap must normalize it to 1.
#
# Pre-fix: a poisoned-token curl in step 5/11 exited with curl rc=22
# (HTTP error under --fail-with-body). The sanity workflow's case
# statement only matched {0,1,4}, fell through to the "investigate
# harness" branch, and opened a false-positive priority-high issue
# (#2159, 2026-04-27).
#
# This test exercises the exact normalization pattern in isolation
# so a future refactor that drops or weakens the pattern fails CI.
set -uo pipefail # NOT -e — we want to inspect non-zero rc explicitly
PASS=0
FAIL=0
# Build a stub harness with the same trap pattern as the production
# script. Source it in a subshell, trigger an exit with a controlled
# rc, and assert the observed final rc.
run_stub() {
local trigger_rc="$1"
local stub
stub=$(mktemp)
cat > "$stub" <<EOF
#!/usr/bin/env bash
set -euo pipefail
CLEANUP_DONE=0
cleanup_org() {
local entry_rc=\$?
if [ "\$CLEANUP_DONE" = "1" ]; then return 0; fi
CLEANUP_DONE=1
case "\$entry_rc" in
0|1|2|3|4) ;;
*) exit 1 ;;
esac
}
trap cleanup_org EXIT INT TERM
exit $trigger_rc
EOF
chmod +x "$stub"
local observed
bash "$stub"; observed=$?
rm -f "$stub"
echo "$observed"
}
assert_rc() {
local label="$1" trigger="$2" expected="$3"
local observed
observed=$(run_stub "$trigger")
if [ "$observed" = "$expected" ]; then
echo "$label: trigger=$trigger expected=$expected observed=$observed"
PASS=$((PASS+1))
else
echo "$label: trigger=$trigger expected=$expected OBSERVED=$observed" >&2
FAIL=$((FAIL+1))
fi
}
echo "Test: cleanup_org exit-code normalization"
echo " Contract: only {0,1,2,3,4} pass through; anything else maps to 1"
echo
# Contracted codes pass through unchanged.
assert_rc "happy path" 0 0
assert_rc "fail() generic" 1 1
assert_rc "missing env" 2 2
assert_rc "provisioning timeout" 3 3
assert_rc "leak detected (cleanup exits 4)" 4 4
# The bug: rc=22 from curl --fail-with-body must normalize to 1.
assert_rc "curl HTTP error (rc=22, the bug)" 22 1
# Other realistic curl failure codes (network, SSL, etc.) must also
# normalize. Pinning a few representative values so a future regex
# refactor that loses the wildcard is caught.
assert_rc "curl couldn't resolve host (rc=6)" 6 1
assert_rc "curl SSL error (rc=35)" 35 1
assert_rc "curl operation timeout (rc=28)" 28 1
# Edge: very high rc (from a SIGSEGV-killed child or similar).
assert_rc "high rc (139, sigsegv)" 139 1
echo
echo "passed=$PASS failed=$FAIL"
[ "$FAIL" = "0" ]

View File

@ -72,7 +72,12 @@ CURL_COMMON=(-sS --fail-with-body --max-time 30)
# ─── cleanup trap ───────────────────────────────────────────────────────
CLEANUP_DONE=0
cleanup_org() {
[ "$CLEANUP_DONE" = "1" ] && return 0
# Capture upstream exit code IMMEDIATELY — must be the first statement
# in the trap, before any command (including the CLEANUP_DONE check)
# that would clobber $?.
local entry_rc=$?
if [ "$CLEANUP_DONE" = "1" ]; then return 0; fi
CLEANUP_DONE=1
if [ "${E2E_KEEP_ORG:-0}" = "1" ]; then
@ -99,6 +104,20 @@ cleanup_org() {
exit 4
fi
ok "Teardown clean — no orphan resources for $SLUG"
# Normalize unexpected upstream exit codes to 1 (generic failure). The
# script's documented contract (header "Exit codes" section) only emits
# {0, 1, 2, 3, 4}, but `set -e` propagates the raw exit code of the
# failing command — e.g. curl exits 22 on HTTP error under
# --fail-with-body. Without this normalization, the
# E2E_INTENTIONAL_FAILURE sanity workflow (e2e-staging-sanity.yml)
# gets rc=22 from the poisoned-token curl, falls through its
# case statement, and opens a false-positive priority-high
# "safety net broken" issue (#2159, 2026-04-27).
case "$entry_rc" in
0|1|2|3|4) ;; # contracted codes — let bash use entry_rc
*) exit 1 ;; # anything else is a generic failure
esac
}
trap cleanup_org EXIT INT TERM