fix(harness): cleanup_failed event + drop misleading exit_code capture

Self-review follow-ups on #2257:

- Drop `local exit_code=$?` from cleanup(). `trap`-handler return values
  are ignored, so capturing $? only misled a future reader into thinking
  exit-code preservation was happening.

- Replace silenced `>/dev/null 2>&1` DELETE with `-w '%{http_code}'`
  capture. ADMIN_TOKEN expiring mid-run was the realistic failure mode
  here — previously we swallowed it under the silenced redirect, leaving
  workspaces leaked with no signal. Now a 401/403/5xx surfaces as a
  `cleanup_failed` JSON event with a remediation hint pointing at
  cleanup-rogue-workspaces.sh; 404 is treated as success (the
  post-condition — workspace absent — holds).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Hongming Wang 2026-04-28 21:00:01 -07:00
parent ada0e1ddd0
commit 6e5b5c4142

View File

@ -153,18 +153,31 @@ PM_ID=""
CHILD_ID=""
cleanup() {
local exit_code=$?
# `trap` ignores function return values, so don't capture/return $?
# — that would only mislead a future reader. Disable -e inside cleanup
# so a single curl failure doesn't abort the loop and leave the other
# workspace orphaned.
set +e
if [ "$KEEP_WORKSPACES" = "1" ]; then
emit "cleanup_skipped" "{\"reason\":\"KEEP_WORKSPACES=1\",\"pm_id\":\"$PM_ID\",\"child_id\":\"$CHILD_ID\"}"
return $exit_code
return
fi
for id in "$CHILD_ID" "$PM_ID"; do
[ -z "$id" ] && continue
api -X DELETE "$PLATFORM/workspaces/$id" >/dev/null 2>&1
emit "cleanup_deleted" "{\"workspace_id\":\"$id\"}"
# Capture HTTP status separately from response body so a 401/403/5xx
# surfaces as a `cleanup_failed` event instead of a silent leak. The
# operator can then re-run cleanup-rogue-workspaces.sh with fresh
# credentials. ADMIN_TOKEN expiry mid-run is the realistic failure
# mode here; without this we'd swallow it under `>/dev/null 2>&1`.
code=$(api -o /dev/null -w '%{http_code}' -X DELETE "$PLATFORM/workspaces/$id" 2>/dev/null || echo "curl_err")
if [ "$code" = "200" ] || [ "$code" = "204" ] || [ "$code" = "404" ]; then
# 404 = already gone (race with a concurrent operator). Treat as
# success since the post-condition (workspace absent) holds.
emit "cleanup_deleted" "{\"workspace_id\":\"$id\",\"http_code\":\"$code\"}"
else
emit "cleanup_failed" "{\"workspace_id\":\"$id\",\"http_code\":\"$code\",\"hint\":\"workspace may be leaked — re-run cleanup-rogue-workspaces.sh\"}"
fi
done
return $exit_code
}
trap cleanup EXIT INT TERM