From fff480c6e2e98b88c1aaa5bdcd68bbe70c8cccd0 Mon Sep 17 00:00:00 2001
From: "Molecule AI Dev Engineer A (Kimi)"
 <dev-engineer-a-kimi@agents.moleculesai.app>
Date: Sun, 14 Jun 2026 18:49:09 +0000
Subject: [PATCH] fix(harness): count __SKIP__/__XFAIL__ replays as skips, not
 passes

The harness replay runner treated any replay that exited 0 as PASS,
including canary-smoke-a2a-pong.sh which exits 0 immediately with an
__XFAIL__ marker (blocked on #2863). That falsely inflated the pass
count and masked the fact that the replay exercised nothing.

Changes:
- Capture each replay's output (stdout and stderr) and detect __SKIP__ /
  __XFAIL__ markers.
- Count marked replays as SKIP, not PASS.
- Update the summary line to report passed/failed/skipped.
- Switch canary-smoke-a2a-pong.sh to the __SKIP__ marker so the runner
  classifies it correctly (the xfail reason and #2863 reference stay
  in the human-readable output).

No replay semantics changed; the runner now honestly reports xfails as
skips instead of false-greens.
---
 .../harness/replays/canary-smoke-a2a-pong.sh  |  2 +-
 tests/harness/run-all-replays.sh              | 19 ++++++++++++++-----
 2 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/tests/harness/replays/canary-smoke-a2a-pong.sh b/tests/harness/replays/canary-smoke-a2a-pong.sh
index 9be9fde5..2d665c66 100755
--- a/tests/harness/replays/canary-smoke-a2a-pong.sh
+++ b/tests/harness/replays/canary-smoke-a2a-pong.sh
@@ -17,7 +17,7 @@
 # signal that the 2-genuine review needs. Tracking the work in the linked
 # issue lets us burn down the xfails as separate PRs land.
 # ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
-echo "[replay] __XFAIL__:#2863:CP-stub 401 on workspace start (30s provisioning stall)"
+echo "[replay] __SKIP__:#2863:CP-stub 401 on workspace start (30s provisioning stall)"
 exit 0
 
 set -euo pipefail
diff --git a/tests/harness/run-all-replays.sh b/tests/harness/run-all-replays.sh
index 092158c3..ea79247c 100755
--- a/tests/harness/run-all-replays.sh
+++ b/tests/harness/run-all-replays.sh
@@ -64,10 +64,18 @@ for replay in "${REPLAYS[@]}"; do
     name=$(basename "$replay" .sh)
     echo ""
     echo "[run-all] ━━━ $name ━━━"
-    if bash "$replay"; then
-        # Replays signal "skip" by exiting 0 with a __SKIP__ marker in stdout —
-        # but we capture that as a pass here since the script exited 0. The
-        # skip is documented in the script's own output. CI uses pass/fail.
+    out=$(mktemp)
+    rc=0
+    bash "$replay" >"$out" 2>&1 || rc=$?
+    # Print the captured output once the replay finishes so logs remain useful.
+    cat "$out"
+    if [ "$rc" -eq 0 ] && grep -qE '^\[replay\] __(SKIP|XFAIL)__' "$out"; then
+        # Replays signal "skip" by exiting 0 with a __SKIP__ or __XFAIL__
+        # marker in stdout. Count them as skips, not passes, so the harness
+        # gate doesn't false-green on xfails that test nothing.
+        SKIP_COUNT=$((SKIP_COUNT + 1))
+        echo "[run-all] SKIP: $name"
+    elif [ "$rc" -eq 0 ]; then
         PASS_COUNT=$((PASS_COUNT + 1))
         echo "[run-all] PASS: $name"
     else
@@ -75,11 +83,12 @@ for replay in "${REPLAYS[@]}"; do
         FAILED_NAMES+=("$name")
         echo "[run-all] FAIL: $name"
     fi
+    rm -f "$out"
 done
 
 echo ""
 echo "[run-all] ============================="
-echo "[run-all] Replay summary: ${PASS_COUNT} passed, ${FAIL_COUNT} failed (of ${#REPLAYS[@]} total)"
+echo "[run-all] Replay summary: ${PASS_COUNT} passed, ${FAIL_COUNT} failed, ${SKIP_COUNT} skipped (of ${#REPLAYS[@]} total)"
 if [ ${FAIL_COUNT} -gt 0 ]; then
     echo "[run-all] Failed:"
     for name in "${FAILED_NAMES[@]}"; do
-- 
2.52.0