diff --git a/.gitea/scripts/gitea-merge-queue.py b/.gitea/scripts/gitea-merge-queue.py index 29b561a60..c78d4577e 100644 --- a/.gitea/scripts/gitea-merge-queue.py +++ b/.gitea/scripts/gitea-merge-queue.py @@ -1098,18 +1098,18 @@ def main() -> int: try: return process_once(dry_run=args.dry_run) except ApiError as exc: - # API errors (401/403/404/500) are transient for a queue tick — - # log and exit 0 so the workflow is not marked failed and the next - # tick can retry. Returning non-zero would permanently fail the - # workflow run, blocking future ticks. + # FAIL-CLOSED: API errors are not "transient success" — they mean + # the queue could not evaluate merge state. Returning 0 hides + # persistent infra issues (auth drift, endpoint outages) from + # operators. Return 1 so the cron job surfaces red and paging fires. sys.stderr.write(f"::error::queue API error: {exc}\n") - return 0 + return 1 except urllib.error.URLError as exc: sys.stderr.write(f"::error::queue network error: {exc}\n") - return 0 + return 1 except TimeoutError as exc: sys.stderr.write(f"::error::queue timeout: {exc}\n") - return 0 + return 1 if __name__ == "__main__": diff --git a/.gitea/scripts/status-reaper.py b/.gitea/scripts/status-reaper.py index 5bf4c7d5f..21fac46ee 100644 --- a/.gitea/scripts/status-reaper.py +++ b/.gitea/scripts/status-reaper.py @@ -689,8 +689,8 @@ def reap_branch( shas = list_recent_commit_shas(branch, limit) except ApiError as e: print( - "::warning::status-reaper skipped this tick because the " - f"commit list could not be read after retries: {e}" + "::error::status-reaper cannot run: commit-list API failed " + f"after retries: {e}" ) return { "scanned_shas": 0, @@ -704,6 +704,7 @@ def reap_branch( "compensated_cancelled_push": 0, "preserved_pr_without_push_success": 0, "compensated_per_sha": {}, + "sha_api_errors": 0, "skipped": True, "skip_reason": "commit-list-api-error", } @@ -720,6 +721,7 @@ def reap_branch( "compensated_cancelled_push": 0, 
"preserved_pr_without_push_success": 0, "compensated_per_sha": {}, + "sha_api_errors": 0, } for sha in shas: @@ -731,8 +733,9 @@ def reap_branch( try: combined = get_combined_status(sha) except ApiError as e: + aggregate["sha_api_errors"] += 1 print( - f"::warning::get_combined_status({sha[:10]}) failed; " + f"::error::get_combined_status({sha[:10]}) failed; " f"skipping this SHA: {e}" ) continue @@ -819,6 +822,14 @@ def main() -> int: sort_keys=True, ) ) + # Observability: infra-failure → red. If the commit list could not be + # read or any per-SHA status fetch failed, the tick is incomplete and + # must be observable as a failure (non-zero exit) so the cron bot or + # runner surfaces alerts. + if counters.get("skipped"): + return 1 + if counters.get("sha_api_errors", 0) > 0: + return 1 return 0 diff --git a/tests/test_status_reaper.py b/tests/test_status_reaper.py index 7449545e4..331911fe4 100644 --- a/tests/test_status_reaper.py +++ b/tests/test_status_reaper.py @@ -1050,12 +1050,13 @@ def test_reap_continues_on_per_sha_apierror(sr_module, monkeypatch, capsys): def test_main_soft_skips_when_commit_listing_times_out(sr_module, monkeypatch, capsys): - """A transient outage while listing recent commits should not paint main red. + """A transient outage while listing recent commits fails the tick visibly. Per-SHA status read failures are already isolated inside `reap_branch`. The real 2026-05-14 failure was earlier: `/commits?sha=main&limit=30` timed out after all retries, aborting the tick. The next 5-minute tick can - retry safely, so `main()` should emit an observable warning and return 0. + retry safely, but the tick itself must be observable as red (exit 1 + error + annotation) so the cron bot alerts on persistent infra issues.
""" monkeypatch.setattr(sr_module, "scan_workflows", lambda _: {"workflow-without-push": False}) @@ -1068,9 +1069,9 @@ def test_main_soft_skips_when_commit_listing_times_out(sr_module, monkeypatch, c monkeypatch.setattr(sr_module, "list_recent_commit_shas", fake_list_recent_commit_shas) monkeypatch.setattr(sys, "argv", ["status-reaper.py"]) - assert sr_module.main() == 0 + assert sr_module.main() == 1 captured = capsys.readouterr() - assert "::warning::status-reaper skipped this tick" in captured.out + assert "::error::status-reaper cannot run" in captured.out assert '"skipped": true' in captured.out assert '"skip_reason": "commit-list-api-error"' in captured.out