From 6e5b5c4142952559c2e7930a86ae6b20b2ea4a90 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Tue, 28 Apr 2026 21:00:01 -0700 Subject: [PATCH] fix(harness): cleanup_failed event + drop misleading exit_code capture MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Self-review follow-ups on #2257: - Drop `local exit_code=$?` from cleanup(). `trap`-handler return values are ignored, so capturing $? only misled a future reader into thinking exit-code preservation was happening. - Replace silenced `>/dev/null 2>&1` DELETE with `-w '%{http_code}'` capture. ADMIN_TOKEN expiring mid-run was the realistic failure mode here — previously we swallowed it under the silenced redirect, leaving workspaces leaked with no signal. Now a 401/403/5xx surfaces as a `cleanup_failed` JSON event with a remediation hint pointing at cleanup-rogue-workspaces.sh; 404 is treated as success (the post-condition — workspace absent — holds). Co-Authored-By: Claude Opus 4.7 (1M context) --- scripts/measure-coordinator-task-bounds.sh | 23 +++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/scripts/measure-coordinator-task-bounds.sh b/scripts/measure-coordinator-task-bounds.sh index 0a67733d..732f2ce7 100755 --- a/scripts/measure-coordinator-task-bounds.sh +++ b/scripts/measure-coordinator-task-bounds.sh @@ -153,18 +153,31 @@ PM_ID="" CHILD_ID="" cleanup() { - local exit_code=$? + # `trap` ignores function return values, so don't capture/return $? + # — that would only mislead a future reader. Disable -e inside cleanup + # so a single curl failure doesn't abort the loop and leave the other + # workspace orphaned. set +e if [ "$KEEP_WORKSPACES" = "1" ]; then emit "cleanup_skipped" "{\"reason\":\"KEEP_WORKSPACES=1\",\"pm_id\":\"$PM_ID\",\"child_id\":\"$CHILD_ID\"}" - return $exit_code + return fi for id in "$CHILD_ID" "$PM_ID"; do [ -z "$id" ] && continue - api -X DELETE "$PLATFORM/workspaces/$id" >/dev/null 2>&1 - emit "cleanup_deleted" "{\"workspace_id\":\"$id\"}" + # Capture HTTP status separately from response body so a 401/403/5xx + # surfaces as a `cleanup_failed` event instead of a silent leak. The + # operator can then re-run cleanup-rogue-workspaces.sh with fresh + # credentials. ADMIN_TOKEN expiry mid-run is the realistic failure + # mode here; without this we'd swallow it under `>/dev/null 2>&1`. + code=$(api -o /dev/null -w '%{http_code}' -X DELETE "$PLATFORM/workspaces/$id" 2>/dev/null || echo "curl_err") + if [ "$code" = "200" ] || [ "$code" = "204" ] || [ "$code" = "404" ]; then + # 404 = already gone (race with a concurrent operator). Treat as + # success since the post-condition (workspace absent) holds. + emit "cleanup_deleted" "{\"workspace_id\":\"$id\",\"http_code\":\"$code\"}" + else + emit "cleanup_failed" "{\"workspace_id\":\"$id\",\"http_code\":\"$code\",\"hint\":\"workspace may be leaked — re-run cleanup-rogue-workspaces.sh\"}" + fi done - return $exit_code } trap cleanup EXIT INT TERM