fix(ci): status-reaper infra-failure→red — observability hardening #2370

Merged
devops-engineer merged 2 commits from fix/status-reaper-observability into main 2026-06-06 18:17:02 +00:00
3 changed files with 26 additions and 14 deletions
+7 -7
View File
@@ -1098,18 +1098,18 @@ def main() -> int:
try:
return process_once(dry_run=args.dry_run)
except ApiError as exc:
# API errors (401/403/404/500) are transient for a queue tick —
# log and exit 0 so the workflow is not marked failed and the next
# tick can retry. Returning non-zero would permanently fail the
# workflow run, blocking future ticks.
# FAIL-CLOSED: API errors are not "transient success" — they mean
# the queue could not evaluate merge state. Returning 0 hides
# persistent infra issues (auth drift, endpoint outages) from
# operators. Return 1 so the cron job surfaces red and paging fires.
sys.stderr.write(f"::error::queue API error: {exc}\n")
return 0
return 1
except urllib.error.URLError as exc:
sys.stderr.write(f"::error::queue network error: {exc}\n")
return 0
return 1
except TimeoutError as exc:
sys.stderr.write(f"::error::queue timeout: {exc}\n")
return 0
return 1
if __name__ == "__main__":
+14 -3
View File
@@ -689,8 +689,8 @@ def reap_branch(
shas = list_recent_commit_shas(branch, limit)
except ApiError as e:
print(
"::warning::status-reaper skipped this tick because the "
f"commit list could not be read after retries: {e}"
"::error::status-reaper cannot run: commit-list API failed "
f"after retries: {e}"
)
return {
"scanned_shas": 0,
@@ -704,6 +704,7 @@ def reap_branch(
"compensated_cancelled_push": 0,
"preserved_pr_without_push_success": 0,
"compensated_per_sha": {},
"sha_api_errors": 0,
"skipped": True,
"skip_reason": "commit-list-api-error",
}
@@ -720,6 +721,7 @@ def reap_branch(
"compensated_cancelled_push": 0,
"preserved_pr_without_push_success": 0,
"compensated_per_sha": {},
"sha_api_errors": 0,
}
for sha in shas:
@@ -731,8 +733,9 @@ def reap_branch(
try:
combined = get_combined_status(sha)
except ApiError as e:
aggregate["sha_api_errors"] += 1
print(
f"::warning::get_combined_status({sha[:10]}) failed; "
f"::error::get_combined_status({sha[:10]}) failed; "
f"skipping this SHA: {e}"
)
continue
@@ -819,6 +822,14 @@ def main() -> int:
sort_keys=True,
)
)
# Observability: infra-failure → red. If the commit list could not be
# read or any per-SHA status fetch failed, the tick is incomplete and
# must be observable as a failure (non-zero exit) so the cron bot or
# runner surfaces an alert.
if counters.get("skipped"):
return 1
if counters.get("sha_api_errors", 0) > 0:
return 1
return 0
+5 -4
View File
@@ -1050,12 +1050,13 @@ def test_reap_continues_on_per_sha_apierror(sr_module, monkeypatch, capsys):
def test_main_soft_skips_when_commit_listing_times_out(sr_module, monkeypatch, capsys):
"""A transient outage while listing recent commits should not paint main red.
"""A transient outage while listing recent commits fails the tick visibly.
Per-SHA status read failures are already isolated inside `reap_branch`.
The real 2026-05-14 failure was earlier: `/commits?sha=main&limit=30`
timed out after all retries, aborting the tick. The next 5-minute tick can
retry safely, so `main()` should emit an observable warning and return 0.
retry safely, but the tick itself must be observable as red (exit 1 + error
annotation) so the cron bot alerts on persistent infra issues.
"""
monkeypatch.setattr(sr_module, "scan_workflows", lambda _: {"workflow-without-push": False})
@@ -1068,9 +1069,9 @@ def test_main_soft_skips_when_commit_listing_times_out(sr_module, monkeypatch, c
monkeypatch.setattr(sr_module, "list_recent_commit_shas", fake_list_recent_commit_shas)
monkeypatch.setattr(sys, "argv", ["status-reaper.py"])
assert sr_module.main() == 0
assert sr_module.main() == 1
captured = capsys.readouterr()
assert "::warning::status-reaper skipped this tick" in captured.out
assert "::error::status-reaper cannot run" in captured.out
assert '"skipped": true' in captured.out
assert '"skip_reason": "commit-list-api-error"' in captured.out