test(e2e): fix exit-trap quote bug + narrow A2A infra-skip (#2917) #2922
@@ -235,6 +235,32 @@ source "$(dirname "$0")/lib/completion_assert.sh"
|
||||
CURL_COMMON=(-sS --fail-with-body --max-time 30)
|
||||
E2E_TMP_FILES=()
|
||||
|
||||
# Infra-skip helper (core#2917). Emits a machine-readable scan_status line
|
||||
# and exits 0 so the advisory staging gate goes green-with-skip rather than
|
||||
# false-red on a known transient A2A-layer degradation. The trap still tears
|
||||
# down the org.
|
||||
#
|
||||
# Fail-closed on repeated skips: a broadly broken agent that triggers skips on
|
||||
# every A2A call would otherwise paint the advisory lane green while masking a
|
||||
# real regression. We allow one distinct skip reason per run; a second distinct
|
||||
# reason (or any repeated skip after the cap) converts to a hard failure.
|
||||
INFRA_SKIP_REASONS=""
|
||||
infra_skip() {
|
||||
local reason="$1"
|
||||
local detail="${2:-}"
|
||||
case " $INFRA_SKIP_REASONS " in
|
||||
*" $reason "*) ;;
|
||||
*) INFRA_SKIP_REASONS="$INFRA_SKIP_REASONS $reason" ;;
|
||||
esac
|
||||
local distinct_count
|
||||
distinct_count=$(echo "$INFRA_SKIP_REASONS" | wc -w | tr -d ' ')
|
||||
if [ "$distinct_count" -ge 2 ]; then
|
||||
fail "infra-skip cap exceeded ($distinct_count distinct reasons:${INFRA_SKIP_REASONS:-none}) — refusing false-green on repeated A2A-layer degradation"
|
||||
fi
|
||||
echo "[$(date +%H:%M:%S)] ⚠️ scan_status: infra-skip:${reason}${detail:+ $detail}"
|
||||
exit 0
|
||||
}
|
||||
|
||||
e2e_tmp() {
|
||||
local f
|
||||
f=$(mktemp "$1")
|
||||
@@ -327,7 +353,19 @@ cleanup_org() {
|
||||
*) exit 1 ;; # anything else is a generic failure
|
||||
esac
|
||||
}
|
||||
trap cleanup_org EXIT INT TERM
|
||||
|
||||
# Wrapper for the EXIT/INT/TERM trap: capture the original exit code,
|
||||
# remove the org-create bodyfile (created later), run teardown, and
|
||||
# propagate the original code. Defined as a function so the trap string
|
||||
# is simple and cannot pick up an unbalanced quote from inline command
|
||||
# substitution (core#2917).
|
||||
cleanup_org_and_bodyfile() {
|
||||
local entry_rc=$?
|
||||
rm -f "$CREATE_BODYFILE" 2>/dev/null || true
|
||||
cleanup_org
|
||||
exit "$entry_rc"
|
||||
}
|
||||
trap cleanup_org_and_bodyfile EXIT INT TERM
|
||||
|
||||
# ─── 0. Preflight ───────────────────────────────────────────────────────
|
||||
log "═══════════════════════════════════════════════════════════════════"
|
||||
@@ -366,31 +404,9 @@ log "1/11 Creating org $SLUG via /cp/admin/orgs..."
|
||||
# set -euo pipefail, aborting the whole harness with no body
|
||||
# in the CI logs.
|
||||
CREATE_BODYFILE="$(mktemp -t create-org-resp.XXXXXX)"
|
||||
# core#60 trap-chain + exit-code preservation (RC #11654 #2,
|
||||
# #11654 #3, #11673, #11674): the prior
|
||||
# `trap 'rm -f "$CREATE_BODYFILE"' EXIT` overwrote the
|
||||
# cleanup_org EXIT trap at line 330, leaking the staging
|
||||
# org/resources if the bodyfile path succeeded and a later
|
||||
# step failed. Worse, re-installing the previous trap
|
||||
# during EXIT handling and then exiting with `(exit $ec)`
|
||||
# does NOT actually invoke the re-installed trap body — a
|
||||
# trap that fires during another trap's body does not chain.
|
||||
# The fix: extract cleanup_org's command body via `trap -p
|
||||
# EXIT`, then build a single EXIT trap that (a) captures
|
||||
# the script's exit code FIRST into a file-scoped
|
||||
# `__org_create_bodyfile_ec` (file-scoped via export so the
|
||||
# trap-string evaluator can see it), (b) removes the
|
||||
# bodyfile, (c) explicitly invokes the captured
|
||||
# cleanup_org body inline (not as a re-registered trap),
|
||||
# (d) propagates the original exit code to CI. The capture
|
||||
# uses `trap -p EXIT` which prints the current trap in a
|
||||
# form suitable for re-evaluation; the `sed` extracts the
|
||||
# command body (the original trap was set with
|
||||
# `trap cleanup_org EXIT INT TERM` so the captured string
|
||||
# is just `cleanup_org`).
|
||||
__org_create_bodyfile_ec=""
|
||||
prev_exit_trap="$(trap -p EXIT | sed -E "s/^trap -- '//; s/'$ EXIT$//")"
|
||||
trap '__org_create_bodyfile_ec=$?; rm -f "$CREATE_BODYFILE"; '"${prev_exit_trap}"'; exit "${__org_create_bodyfile_ec}"' EXIT
|
||||
# cleanup_org_and_bodyfile (EXIT/INT/TERM trap) removes this bodyfile and
|
||||
# runs teardown, so a non-2xx org-create response is logged while the org
|
||||
# and EC2 resources are still cleaned up (core#60 / core#2917).
|
||||
set +e
|
||||
CREATE_HTTP_CODE=$(curl "${CURL_COMMON[@]}" -X POST "$CP_URL/cp/admin/orgs" \
|
||||
-H "Authorization: Bearer $ADMIN_TOKEN" \
|
||||
@@ -1193,6 +1209,7 @@ a2a_send_or_poll_queue() {
|
||||
local payload="$1"; shift
|
||||
local label="$1"
|
||||
local tmp qid resp code rc attempt poll_attempt poll_tmp
|
||||
local a2a_gateway_error_seen=0 last_qstatus="" queue_poll_count=0
|
||||
tmp=$(mktemp -t a2a_poll.XXXXXX)
|
||||
qid=""
|
||||
|
||||
@@ -1250,6 +1267,8 @@ except Exception:
|
||||
fail "$label queue item $qid terminal status=$qstatus: $(printf '%s' "$resp" | sanitize_http_body)"
|
||||
;;
|
||||
queued|dispatched|in_progress|"")
|
||||
last_qstatus="$qstatus"
|
||||
queue_poll_count=$((queue_poll_count + 1))
|
||||
echo " $label queue poll attempt $poll_attempt/30 status=$qstatus — backing off 2s" >&2
|
||||
sleep 2
|
||||
;;
|
||||
@@ -1261,6 +1280,21 @@ except Exception:
|
||||
done
|
||||
rm -f "$poll_tmp"
|
||||
# Ran out of queue poll attempts.
|
||||
# core#2917: if a gateway-edge error preceded a queued task that never
|
||||
# drained, treat it as a transient A2A-layer infra-skip rather than a
|
||||
# workspace-code failure. The flag is only set for edge signals (Bad
|
||||
# Gateway/Gateway Timeout/error-code 502/504/no healthy upstream), never
|
||||
# for agent-origin signals that could mask a real regression.
|
||||
# Verified signature: 502/503/504 on an initial POST, queue_id assigned,
|
||||
# then 30/30 polls stuck in queued/dispatched/in_progress/empty.
|
||||
case "$last_qstatus" in
|
||||
queued|dispatched|in_progress|"")
|
||||
if [ "$a2a_gateway_error_seen" = "1" ] && [ -n "$qid" ]; then
|
||||
rm -f "$tmp"
|
||||
infra_skip "a2a-queue-timeout" "queue_id=$qid poll_count=${queue_poll_count}/30 last_status=${last_qstatus:-<empty>}"
|
||||
fi
|
||||
;;
|
||||
esac
|
||||
fail "$label queue poll timed out waiting for $qid to complete"
|
||||
fi
|
||||
|
||||
@@ -1307,15 +1341,27 @@ except Exception:
|
||||
|
||||
local safe_body
|
||||
safe_body=$(printf '%s' "$resp" | sanitize_http_body)
|
||||
if echo "$code" | grep -Eq '^(502|503|504)$' && echo "$safe_body" | grep -Eqi 'Service Unavailable|Bad Gateway|Gateway Timeout|error code: 502|error code: 504|workspace agent unreachable|connection refused|no healthy upstream|workspace agent busy|native_session|restarting|restart triggered'; then
|
||||
echo " $label A2A transient $code attempt $attempt/12: $safe_body" >&2
|
||||
if [ "$attempt" -lt 12 ]; then
|
||||
local sleep_sec=10
|
||||
if echo "$safe_body" | grep -Eqi 'workspace agent busy|native_session|restarting|restart triggered'; then
|
||||
sleep_sec=30
|
||||
if echo "$code" | grep -Eq '^(502|503|504)$'; then
|
||||
# core#2917: split gateway-edge signals (unambiguous transient infra,
|
||||
# eligible for the queue-timeout infra-skip) from agent-origin signals
|
||||
# that can hide a real workspace-agent regression. Only edge signals set
|
||||
# a2a_gateway_error_seen; agent-origin retries are still allowed but will
|
||||
# never skip-to-green if the queue never drains.
|
||||
if echo "$safe_body" | grep -Eqi 'Service Unavailable|Bad Gateway|Gateway Timeout|error code: 502|error code: 504|no healthy upstream'; then
|
||||
a2a_gateway_error_seen=1
|
||||
echo " $label A2A transient gateway $code attempt $attempt/12: $safe_body" >&2
|
||||
if [ "$attempt" -lt 12 ]; then
|
||||
sleep 10
|
||||
continue
|
||||
fi
|
||||
elif echo "$safe_body" | grep -Eqi 'workspace agent unreachable|connection refused|workspace agent busy|native_session|restarting|restart triggered'; then
|
||||
echo " $label A2A agent-origin $code attempt $attempt/12: $safe_body" >&2
|
||||
if [ "$attempt" -lt 12 ]; then
|
||||
# Agent restart/cold-start can take tens of seconds; keep polling,
|
||||
# but do NOT treat this as an edge-gateway transient eligible for skip.
|
||||
sleep 30
|
||||
continue
|
||||
fi
|
||||
sleep "$sleep_sec"
|
||||
continue
|
||||
fi
|
||||
fi
|
||||
break
|
||||
@@ -1323,6 +1369,11 @@ except Exception:
|
||||
|
||||
rm -f "$tmp"
|
||||
if [ "$rc" != "0" ] || [ "$code" -lt 200 ] || [ "$code" -ge 300 ]; then
|
||||
# core#2917: outright A2A connect timeout (curl_rc=28, http=000) is the
|
||||
# second verified transient-infra signature, not a workspace bug.
|
||||
if [ "$rc" = "28" ] && [ "$code" = "000" ]; then
|
||||
infra_skip "a2a-connect-timeout" "curl_rc=$rc http=$code attempt=$attempt label=$label"
|
||||
fi
|
||||
fail "$label failed after $attempt attempt(s) (curl_rc=$rc, http=$code): $(printf '%s' "$resp" | sanitize_http_body)"
|
||||
fi
|
||||
printf '%s' "$resp"
|
||||
|
||||
Reference in New Issue
Block a user