fix(ci): status-reaper infra-failure→red — observability hardening #2370
@@ -1098,18 +1098,18 @@ def main() -> int:
|
||||
try:
|
||||
return process_once(dry_run=args.dry_run)
|
||||
except ApiError as exc:
|
||||
# API errors (401/403/404/500) are transient for a queue tick —
|
||||
# log and exit 0 so the workflow is not marked failed and the next
|
||||
# tick can retry. Returning non-zero would permanently fail the
|
||||
# workflow run, blocking future ticks.
|
||||
# FAIL-CLOSED: API errors are not "transient success" — they mean
|
||||
# the queue could not evaluate merge state. Returning 0 hides
|
||||
# persistent infra issues (auth drift, endpoint outages) from
|
||||
# operators. Return 1 so the cron job surfaces red and paging fires.
|
||||
sys.stderr.write(f"::error::queue API error: {exc}\n")
|
||||
return 0
|
||||
return 1
|
||||
except urllib.error.URLError as exc:
|
||||
sys.stderr.write(f"::error::queue network error: {exc}\n")
|
||||
return 0
|
||||
return 1
|
||||
except TimeoutError as exc:
|
||||
sys.stderr.write(f"::error::queue timeout: {exc}\n")
|
||||
return 0
|
||||
return 1
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -689,8 +689,8 @@ def reap_branch(
|
||||
shas = list_recent_commit_shas(branch, limit)
|
||||
except ApiError as e:
|
||||
print(
|
||||
"::warning::status-reaper skipped this tick because the "
|
||||
f"commit list could not be read after retries: {e}"
|
||||
"::error::status-reaper cannot run: commit-list API failed "
|
||||
f"after retries: {e}"
|
||||
)
|
||||
return {
|
||||
"scanned_shas": 0,
|
||||
@@ -704,6 +704,7 @@ def reap_branch(
|
||||
"compensated_cancelled_push": 0,
|
||||
"preserved_pr_without_push_success": 0,
|
||||
"compensated_per_sha": {},
|
||||
"sha_api_errors": 0,
|
||||
"skipped": True,
|
||||
"skip_reason": "commit-list-api-error",
|
||||
}
|
||||
@@ -720,6 +721,7 @@ def reap_branch(
|
||||
"compensated_cancelled_push": 0,
|
||||
"preserved_pr_without_push_success": 0,
|
||||
"compensated_per_sha": {},
|
||||
"sha_api_errors": 0,
|
||||
}
|
||||
|
||||
for sha in shas:
|
||||
@@ -731,8 +733,9 @@ def reap_branch(
|
||||
try:
|
||||
combined = get_combined_status(sha)
|
||||
except ApiError as e:
|
||||
aggregate["sha_api_errors"] += 1
|
||||
print(
|
||||
f"::warning::get_combined_status({sha[:10]}) failed; "
|
||||
f"::error::get_combined_status({sha[:10]}) failed; "
|
||||
f"skipping this SHA: {e}"
|
||||
)
|
||||
continue
|
||||
@@ -819,6 +822,14 @@ def main() -> int:
|
||||
sort_keys=True,
|
||||
)
|
||||
)
|
||||
# Observability: infra-failure → red. If the commit list could not be
|
||||
# read or any per-SHA status fetch failed, the tick is incomplete and
|
||||
# must be observable as a failure (non-zero exit) so the cron bot or
|
||||
# runner surfaces alerts.
|
||||
if counters.get("skipped"):
|
||||
return 1
|
||||
if counters.get("sha_api_errors", 0) > 0:
|
||||
return 1
|
||||
return 0
|
||||
|
||||
|
||||
|
||||
@@ -1050,12 +1050,13 @@ def test_reap_continues_on_per_sha_apierror(sr_module, monkeypatch, capsys):
|
||||
|
||||
|
||||
def test_main_soft_skips_when_commit_listing_times_out(sr_module, monkeypatch, capsys):
|
||||
"""A transient outage while listing recent commits should not paint main red.
|
||||
"""A transient outage while listing recent commits fails the tick visibly.
|
||||
|
||||
Per-SHA status read failures are already isolated inside `reap_branch`.
|
||||
The real 2026-05-14 failure was earlier: `/commits?sha=main&limit=30`
|
||||
timed out after all retries, aborting the tick. The next 5-minute tick can
|
||||
retry safely, so `main()` should emit an observable warning and return 0.
|
||||
retry safely, but the tick itself must be observable as red (exit 1 + error
|
||||
annotation) so the cron bot alerts on persistent infra issues.
|
||||
"""
|
||||
|
||||
monkeypatch.setattr(sr_module, "scan_workflows", lambda _: {"workflow-without-push": False})
|
||||
@@ -1068,9 +1069,9 @@ def test_main_soft_skips_when_commit_listing_times_out(sr_module, monkeypatch, c
|
||||
monkeypatch.setattr(sr_module, "list_recent_commit_shas", fake_list_recent_commit_shas)
|
||||
monkeypatch.setattr(sys, "argv", ["status-reaper.py"])
|
||||
|
||||
assert sr_module.main() == 0
|
||||
assert sr_module.main() == 1
|
||||
captured = capsys.readouterr()
|
||||
assert "::warning::status-reaper skipped this tick" in captured.out
|
||||
assert "::error::status-reaper cannot run" in captured.out
|
||||
assert '"skipped": true' in captured.out
|
||||
assert '"skip_reason": "commit-list-api-error"' in captured.out
|
||||
|
||||
|
||||
Reference in New Issue
Block a user