From cbf7123016d37512da1a5f6b2be42d204e3a5b06 Mon Sep 17 00:00:00 2001 From: core-be Date: Wed, 13 May 2026 13:08:47 -0700 Subject: [PATCH] fix(ci): retry transient status-reaper API reads --- .gitea/scripts/status-reaper.py | 62 +++++++++++++++++++++++++++++++-- tests/test_status_reaper.py | 28 +++++++++++++++ 2 files changed, 87 insertions(+), 3 deletions(-) diff --git a/.gitea/scripts/status-reaper.py b/.gitea/scripts/status-reaper.py index 9833e7b4..fd2ec6c5 100644 --- a/.gitea/scripts/status-reaper.py +++ b/.gitea/scripts/status-reaper.py @@ -92,6 +92,7 @@ import argparse import json import os import sys +import time import urllib.error import urllib.parse import urllib.request @@ -189,6 +190,11 @@ def api( except urllib.error.HTTPError as e: raw = e.read() status = e.code + except (TimeoutError, urllib.error.URLError, OSError) as e: + raise ApiError( + f"{method} {path} -> transport error: " + f"{type(e).__name__}: {e}" + ) from e if not (200 <= status < 300): snippet = raw[:500].decode("utf-8", errors="replace") if raw else "" @@ -206,6 +212,56 @@ def api( return status, {"_raw": raw.decode("utf-8", errors="replace")} +def api_with_retries( + method: str, + path: str, + *, + body: dict | None = None, + query: dict[str, str] | None = None, + expect_json: bool = True, + attempts: int = 3, +) -> tuple[int, Any]: + """Retry idempotent Gitea reads before failing the tick. + + `status-reaper` runs during merge bursts, when Gitea can be slow to + answer the large commit/status reads this script needs. Retrying GETs + turns transient read timeouts into a delayed sweep instead of another + red status that requires reaping. POSTs intentionally do not retry here: + callers that mutate state must opt in explicitly. + """ + if method != "GET": + return api( + method, + path, + body=body, + query=query, + expect_json=expect_json, + ) + + last_error: ApiError | None = None + for attempt in range(1, attempts + 1): + try: + return api( + method, + path, + body=body, + query=query, + expect_json=expect_json, + ) + except ApiError as e: + last_error = e + if attempt == attempts: + break + print( + f"::warning::{method} {path} failed attempt " + f"{attempt}/{attempts}; retrying: {e}" + ) + time.sleep(min(2 ** (attempt - 1), 5)) + + assert last_error is not None + raise last_error + + # -------------------------------------------------------------------------- # Workflow scan + classification # -------------------------------------------------------------------------- @@ -323,7 +379,7 @@ def scan_workflows(workflows_dir: str) -> dict[str, bool]: # -------------------------------------------------------------------------- def get_head_sha(branch: str) -> str: """HEAD SHA of `branch`. Raises ApiError on non-2xx.""" - _, body = api("GET", f"/repos/{OWNER}/{NAME}/branches/{branch}") + _, body = api_with_retries("GET", f"/repos/{OWNER}/{NAME}/branches/{branch}") if not isinstance(body, dict): raise ApiError(f"branch {branch} response not a JSON object") commit = body.get("commit") @@ -348,7 +404,7 @@ def get_combined_status(sha: str) -> dict: } Raises ApiError on non-2xx. """ - _, body = api("GET", f"/repos/{OWNER}/{NAME}/commits/{sha}/status") + _, body = api_with_retries("GET", f"/repos/{OWNER}/{NAME}/commits/{sha}/status") if not isinstance(body, dict): raise ApiError(f"status for {sha} response not a JSON object") return body @@ -546,7 +602,7 @@ def list_recent_commit_shas(branch: str, limit: int) -> list[str]: a transient 5xx on ONE commit's status is best-effort; losing the commit list itself means we don't even know which commits to try.) """ - _, body = api( + _, body = api_with_retries( "GET", f"/repos/{OWNER}/{NAME}/commits", query={"sha": branch, "limit": str(limit)}, diff --git a/tests/test_status_reaper.py b/tests/test_status_reaper.py index 81327487..c520b9b9 100644 --- a/tests/test_status_reaper.py +++ b/tests/test_status_reaper.py @@ -1009,3 +1009,31 @@ def test_reap_continues_on_per_sha_apierror(sr_module, monkeypatch, capsys): captured = capsys.readouterr() assert "::warning::" in captured.out or "::notice::" in captured.out assert SHA_A[:10] in captured.out + + +def test_list_recent_commit_shas_retries_transient_apierror(sr_module, monkeypatch, capsys): + """The initial /commits listing is load-bearing for the whole reaper + tick. A transient Gitea read timeout should retry before failing the + tick, otherwise status-reaper strands the exact reds it is meant to + compensate. + """ + calls: list[tuple[str, str]] = [] + + def fake_api(method, path, *, body=None, query=None, expect_json=True): + calls.append((method, path)) + if len(calls) == 1: + raise sr_module.ApiError( + "GET /repos/owner/repo/commits -> transport error: TimeoutError" + ) + return (200, [{"sha": SHA_A}, {"sha": SHA_B}]) + + monkeypatch.setattr(sr_module, "api", fake_api) + monkeypatch.setattr(sr_module.time, "sleep", lambda _: None) + + assert sr_module.list_recent_commit_shas("main", 30) == [SHA_A, SHA_B] + assert calls == [ + ("GET", "/repos/owner/repo/commits"), + ("GET", "/repos/owner/repo/commits"), + ] + captured = capsys.readouterr() + assert "::warning::GET /repos/owner/repo/commits failed attempt 1/3" in captured.out -- 2.45.2