fix(ci): retry transient status-reaper API reads #888

Closed
hongming wants to merge 1 commit from fix/status-reaper-api-timeout-retry-20260513130514 into main
2 changed files with 87 additions and 3 deletions

View File

@ -92,6 +92,7 @@ import argparse
import json
import os
import sys
import time
import urllib.error
import urllib.parse
import urllib.request
@ -189,6 +190,11 @@ def api(
except urllib.error.HTTPError as e:
raw = e.read()
status = e.code
except (TimeoutError, urllib.error.URLError, OSError) as e:
raise ApiError(
f"{method} {path} -> transport error: "
f"{type(e).__name__}: {e}"
) from e
if not (200 <= status < 300):
snippet = raw[:500].decode("utf-8", errors="replace") if raw else ""
@ -206,6 +212,56 @@ def api(
return status, {"_raw": raw.decode("utf-8", errors="replace")}
def api_with_retries(
    method: str,
    path: str,
    *,
    body: dict | None = None,
    query: dict[str, str] | None = None,
    expect_json: bool = True,
    attempts: int = 3,
) -> tuple[int, Any]:
    """Retry idempotent Gitea reads before failing the tick.

    `status-reaper` runs during merge bursts, when Gitea can be slow to
    answer the large commit/status reads this script needs. Retrying GETs
    turns transient read timeouts into a delayed sweep instead of another
    red status that requires reaping. POSTs intentionally do not retry here:
    callers that mutate state must opt in explicitly.

    Args:
        method: HTTP method. Anything other than "GET" is forwarded to
            `api()` exactly once, with no retry.
        path: API path, forwarded to `api()`.
        body: Optional JSON body, forwarded to `api()`.
        query: Optional query-string parameters, forwarded to `api()`.
        expect_json: Forwarded to `api()`.
        attempts: Total number of tries for GET requests; must be >= 1.

    Returns:
        The `(status, parsed_body)` tuple returned by `api()`.

    Raises:
        ValueError: If `attempts` < 1 for a GET request.
        ApiError: Re-raised from the final failed attempt.
    """
    if method != "GET":
        # Mutations are not assumed idempotent: never retry them here.
        return api(
            method,
            path,
            body=body,
            query=query,
            expect_json=expect_json,
        )
    if attempts < 1:
        # Explicit validation instead of the old trailing `assert`, which
        # `python -O` would strip (leaving `raise None` -> TypeError).
        raise ValueError(f"attempts must be >= 1, got {attempts}")
    for attempt in range(1, attempts + 1):
        try:
            return api(
                method,
                path,
                body=body,
                query=query,
                expect_json=expect_json,
            )
        except ApiError as e:
            if attempt == attempts:
                # Retry budget exhausted: surface the last failure with
                # its original traceback intact.
                raise
            print(
                f"::warning::{method} {path} failed attempt "
                f"{attempt}/{attempts}; retrying: {e}"
            )
            # Exponential backoff capped at 5s: 1s, 2s, 4s, 5s, ...
            time.sleep(min(2 ** (attempt - 1), 5))
    raise AssertionError("unreachable: loop always returns or raises")
# --------------------------------------------------------------------------
# Workflow scan + classification
# --------------------------------------------------------------------------
@ -323,7 +379,7 @@ def scan_workflows(workflows_dir: str) -> dict[str, bool]:
# --------------------------------------------------------------------------
def get_head_sha(branch: str) -> str:
"""HEAD SHA of `branch`. Raises ApiError on non-2xx."""
_, body = api("GET", f"/repos/{OWNER}/{NAME}/branches/{branch}")
_, body = api_with_retries("GET", f"/repos/{OWNER}/{NAME}/branches/{branch}")
if not isinstance(body, dict):
raise ApiError(f"branch {branch} response not a JSON object")
commit = body.get("commit")
@ -348,7 +404,7 @@ def get_combined_status(sha: str) -> dict:
}
Raises ApiError on non-2xx.
"""
_, body = api("GET", f"/repos/{OWNER}/{NAME}/commits/{sha}/status")
_, body = api_with_retries("GET", f"/repos/{OWNER}/{NAME}/commits/{sha}/status")
if not isinstance(body, dict):
raise ApiError(f"status for {sha} response not a JSON object")
return body
@ -546,7 +602,7 @@ def list_recent_commit_shas(branch: str, limit: int) -> list[str]:
a transient 5xx on ONE commit's status is best-effort; losing the
commit list itself means we don't even know which commits to try.)
"""
_, body = api(
_, body = api_with_retries(
"GET",
f"/repos/{OWNER}/{NAME}/commits",
query={"sha": branch, "limit": str(limit)},

View File

@ -1009,3 +1009,31 @@ def test_reap_continues_on_per_sha_apierror(sr_module, monkeypatch, capsys):
captured = capsys.readouterr()
assert "::warning::" in captured.out or "::notice::" in captured.out
assert SHA_A[:10] in captured.out
def test_list_recent_commit_shas_retries_transient_apierror(sr_module, monkeypatch, capsys):
    """The initial /commits listing is load-bearing for the whole reaper
    tick. A transient Gitea read timeout should retry before failing the
    tick, otherwise status-reaper strands the exact reds it is meant to
    compensate.
    """
    seen: list[tuple[str, str]] = []

    def flaky_api(method, path, *, body=None, query=None, expect_json=True):
        # First call simulates a transport-level timeout; subsequent calls
        # return a normal two-commit listing.
        seen.append((method, path))
        if len(seen) > 1:
            return (200, [{"sha": SHA_A}, {"sha": SHA_B}])
        raise sr_module.ApiError(
            "GET /repos/owner/repo/commits -> transport error: TimeoutError"
        )

    monkeypatch.setattr(sr_module, "api", flaky_api)
    monkeypatch.setattr(sr_module.time, "sleep", lambda _: None)

    shas = sr_module.list_recent_commit_shas("main", 30)
    assert shas == [SHA_A, SHA_B]

    commits_call = ("GET", "/repos/owner/repo/commits")
    assert seen == [commits_call, commits_call]

    out = capsys.readouterr().out
    assert "::warning::GET /repos/owner/repo/commits failed attempt 1/3" in out