From 116697c576a3398ad33403355a5284e6c6a24f45 Mon Sep 17 00:00:00 2001 From: "Molecule AI Dev Engineer A (Kimi)" Date: Sat, 6 Jun 2026 17:27:24 +0000 Subject: [PATCH 1/2] =?UTF-8?q?fix(ci):=20status-reaper=20infra-failure?= =?UTF-8?q?=E2=86=92red=20=E2=80=94=20observability=20hardening?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - commit-list API failure: ::warning:: → ::error:: + return 1 - per-SHA get_combined_status failure: ::warning:: → ::error:: + tracked in sha_api_errors counter - main() returns 1 when skipped=True or sha_api_errors > 0 so cron bot surfaces persistent infra issues as red failures Diff-proof: 49/49 status-reaper tests pass. Refs: internal#219 §1, PR#2367 pair --- .gitea/scripts/status-reaper.py | 17 ++++++++++++++--- tests/test_status_reaper.py | 9 +++++---- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/.gitea/scripts/status-reaper.py b/.gitea/scripts/status-reaper.py index 5bf4c7d5f..21fac46ee 100644 --- a/.gitea/scripts/status-reaper.py +++ b/.gitea/scripts/status-reaper.py @@ -689,8 +689,8 @@ def reap_branch( shas = list_recent_commit_shas(branch, limit) except ApiError as e: print( - "::warning::status-reaper skipped this tick because the " - f"commit list could not be read after retries: {e}" + "::error::status-reaper cannot run: commit-list API failed " + f"after retries: {e}" ) return { "scanned_shas": 0, @@ -704,6 +704,7 @@ def reap_branch( "compensated_cancelled_push": 0, "preserved_pr_without_push_success": 0, "compensated_per_sha": {}, + "sha_api_errors": 0, "skipped": True, "skip_reason": "commit-list-api-error", } @@ -720,6 +721,7 @@ def reap_branch( "compensated_cancelled_push": 0, "preserved_pr_without_push_success": 0, "compensated_per_sha": {}, + "sha_api_errors": 0, } for sha in shas: @@ -731,8 +733,9 @@ def reap_branch( try: combined = get_combined_status(sha) except ApiError as e: + aggregate["sha_api_errors"] += 1 print( - f"::warning::get_combined_status({sha[:10]}) failed; " + f"::error::get_combined_status({sha[:10]}) failed; " f"skipping this SHA: {e}" ) continue @@ -819,6 +822,14 @@ def main() -> int: sort_keys=True, ) ) + # Observability: infra-failure → red. If the commit list could not be + # read or any per-SHA status fetch failed, the tick is incomplete and + # must be observable as a failure (non-zero exit) so the cron bot or + # runner surface alerts. + if counters.get("skipped"): + return 1 + if counters.get("sha_api_errors", 0) > 0: + return 1 return 0 diff --git a/tests/test_status_reaper.py b/tests/test_status_reaper.py index 7449545e4..331911fe4 100644 --- a/tests/test_status_reaper.py +++ b/tests/test_status_reaper.py @@ -1050,12 +1050,13 @@ def test_reap_continues_on_per_sha_apierror(sr_module, monkeypatch, capsys): def test_main_soft_skips_when_commit_listing_times_out(sr_module, monkeypatch, capsys): - """A transient outage while listing recent commits should not paint main red. + """A transient outage while listing recent commits fails the tick visibly. Per-SHA status read failures are already isolated inside `reap_branch`. The real 2026-05-14 failure was earlier: `/commits?sha=main&limit=30` timed out after all retries, aborting the tick. The next 5-minute tick can - retry safely, so `main()` should emit an observable warning and return 0. + retry safely, but the tick itself must be observable as red (exit 1 + error + annotation) so the cron bot alerts on persistent infra issues. """ monkeypatch.setattr(sr_module, "scan_workflows", lambda _: {"workflow-without-push": False}) @@ -1068,9 +1069,9 @@ def test_main_soft_skips_when_commit_listing_times_out(sr_module, monkeypatch, c monkeypatch.setattr(sr_module, "list_recent_commit_shas", fake_list_recent_commit_shas) monkeypatch.setattr(sys, "argv", ["status-reaper.py"]) - assert sr_module.main() == 0 + assert sr_module.main() == 1 captured = capsys.readouterr() - assert "::warning::status-reaper skipped this tick" in captured.out + assert "::error::status-reaper cannot run" in captured.out assert '"skipped": true' in captured.out assert '"skip_reason": "commit-list-api-error"' in captured.out -- 2.52.0 From db39d519dc2fb35473ab9f554edbb6eba5334f92 Mon Sep 17 00:00:00 2001 From: "Molecule AI Dev Engineer A (Kimi)" Date: Sat, 6 Jun 2026 17:47:46 +0000 Subject: [PATCH 2/2] fix(merge-queue): queue API/network/timeout errors now return 1 (#2370 RC) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per CR2 RC: the status-reaper observability fix was complete, but the merge-queue exception handlers in main() still returned 0 on ApiError, URLError, and TimeoutError. This hid persistent infra issues from operators — the cron stayed green while the queue could not evaluate merge state. Now all three handlers return 1 so the cron job surfaces red and operators are paged to investigate. Diff-proof: 52/52 gitea-merge-queue tests pass. Refs: core#2370, CR2 RC. --- .gitea/scripts/gitea-merge-queue.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.gitea/scripts/gitea-merge-queue.py b/.gitea/scripts/gitea-merge-queue.py index 29b561a60..c78d4577e 100644 --- a/.gitea/scripts/gitea-merge-queue.py +++ b/.gitea/scripts/gitea-merge-queue.py @@ -1098,18 +1098,18 @@ def main() -> int: try: return process_once(dry_run=args.dry_run) except ApiError as exc: - # API errors (401/403/404/500) are transient for a queue tick — - # log and exit 0 so the workflow is not marked failed and the next - # tick can retry. Returning non-zero would permanently fail the - # workflow run, blocking future ticks. + # FAIL-CLOSED: API errors are not "transient success" — they mean + # the queue could not evaluate merge state. Returning 0 hides + # persistent infra issues (auth drift, endpoint outages) from + # operators. Return 1 so the cron job surfaces red and paging fires. sys.stderr.write(f"::error::queue API error: {exc}\n") - return 0 + return 1 except urllib.error.URLError as exc: sys.stderr.write(f"::error::queue network error: {exc}\n") - return 0 + return 1 except TimeoutError as exc: sys.stderr.write(f"::error::queue timeout: {exc}\n") - return 0 + return 1 if __name__ == "__main__": -- 2.52.0