From fff480c6e2e98b88c1aaa5bdcd68bbe70c8cccd0 Mon Sep 17 00:00:00 2001
From: "Molecule AI Dev Engineer A (Kimi)"
Date: Sun, 14 Jun 2026 18:49:09 +0000
Subject: [PATCH] fix(harness): count __SKIP__/__XFAIL__ replays as skips, not passes

The harness replay runner treated any replay that exited 0 as PASS,
including canary-smoke-a2a-pong.sh which exits 0 immediately with an
__XFAIL__ marker (blocked on #2863). That falsely inflated the pass
count and masked the fact that the replay exercised nothing.

Changes:
- Capture each replay's stdout and stderr and detect __SKIP__ /
  __XFAIL__ markers.
- Count marked replays as SKIP, not PASS.
- Update the summary line to report passed/failed/skipped.
- Default SKIP_COUNT to 0 wherever it is read, so the counter works
  even when it has not been initialised before the loop (this patch
  adds no initialisation, and an unset read aborts under `set -u`).
- Switch canary-smoke-a2a-pong.sh to the __SKIP__ marker so the runner
  classifies it correctly (the xfail reason and #2863 reference stay in
  the human-readable output).

No replay semantics changed; the runner now honestly reports xfails as
skips instead of false-greens.
---
NOTE(review): leading whitespace inside the hunks below was reconstructed
as 2-space indentation; confirm it against the target files before
applying. The index hashes predate the SKIP_COUNT default edit.

 .../harness/replays/canary-smoke-a2a-pong.sh |  2 +-
 tests/harness/run-all-replays.sh             | 19 ++++++++++++++-----
 2 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/tests/harness/replays/canary-smoke-a2a-pong.sh b/tests/harness/replays/canary-smoke-a2a-pong.sh
index 9be9fde5..2d665c66 100755
--- a/tests/harness/replays/canary-smoke-a2a-pong.sh
+++ b/tests/harness/replays/canary-smoke-a2a-pong.sh
@@ -17,7 +17,7 @@
 # signal that the 2-genuine review needs. Tracking the work in the linked
 # issue lets us burn down the xfails as separate PRs land.
 # ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
-echo "[replay] __XFAIL__:#2863:CP-stub 401 on workspace start (30s provisioning stall)"
+echo "[replay] __SKIP__:#2863:CP-stub 401 on workspace start (30s provisioning stall)"
 exit 0
 
 set -euo pipefail
diff --git a/tests/harness/run-all-replays.sh b/tests/harness/run-all-replays.sh
index 092158c3..ea79247c 100755
--- a/tests/harness/run-all-replays.sh
+++ b/tests/harness/run-all-replays.sh
@@ -64,10 +64,18 @@ for replay in "${REPLAYS[@]}"; do
   name=$(basename "$replay" .sh)
   echo ""
   echo "[run-all] ━━━ $name ━━━"
-  if bash "$replay"; then
-    # Replays signal "skip" by exiting 0 with a __SKIP__ marker in stdout —
-    # but we capture that as a pass here since the script exited 0. The
-    # skip is documented in the script's own output. CI uses pass/fail.
+  out=$(mktemp)
+  rc=0
+  bash "$replay" >"$out" 2>&1 || rc=$?
+  # Stream the replay output so logs remain useful.
+  cat "$out"
+  if [ "$rc" -eq 0 ] && grep -qE '^\[replay\] __(SKIP|XFAIL)__' "$out"; then
+    # Replays signal "skip" by exiting 0 with a __SKIP__ or __XFAIL__
+    # marker in stdout. Count them as skips, not passes, so the harness
+    # gate doesn't false-green on xfails that test nothing.
+    SKIP_COUNT=$(( ${SKIP_COUNT:-0} + 1 ))
+    echo "[run-all] SKIP: $name"
+  elif [ "$rc" -eq 0 ]; then
     PASS_COUNT=$((PASS_COUNT + 1))
     echo "[run-all] PASS: $name"
   else
@@ -75,11 +83,12 @@ for replay in "${REPLAYS[@]}"; do
     FAILED_NAMES+=("$name")
     echo "[run-all] FAIL: $name"
   fi
+  rm -f "$out"
 done
 
 echo ""
 echo "[run-all] ============================="
-echo "[run-all] Replay summary: ${PASS_COUNT} passed, ${FAIL_COUNT} failed (of ${#REPLAYS[@]} total)"
+echo "[run-all] Replay summary: ${PASS_COUNT} passed, ${FAIL_COUNT} failed, ${SKIP_COUNT:-0} skipped (of ${#REPLAYS[@]} total)"
 if [ ${FAIL_COUNT} -gt 0 ]; then
   echo "[run-all] Failed:"
   for name in "${FAILED_NAMES[@]}"; do
-- 
2.52.0