fix(ci): paginate /statuses to exhaustion in verify-by-state readers (#2440-family) #2469

Merged
devops-engineer merged 2 commits from fix/status-reader-paginate-to-exhaustion into main 2026-06-09 03:14:15 +00:00
4 changed files with 341 additions and 35 deletions
+56 -19
View File
@@ -116,28 +116,65 @@ fi
# 3. Status-check state at the PR HEAD (where checks ran). The merge
# commit doesn't get its own checks; we evaluate the PR's last
# commit, which is what branch protection compared against.
# Fail-closed: verify HTTP 200. A 401/403/404 means the status is
# unreadable — we must NOT treat that as "no statuses" and skip checks.
STATUS_TMP=$(mktemp)
STATUS_HTTP=$(curl -sS -o "$STATUS_TMP" -w '%{http_code}' -H "$AUTH" \
"${API}/repos/${OWNER}/${NAME}/commits/${HEAD_SHA}/status")
STATUS=$(cat "$STATUS_TMP")
rm -f "$STATUS_TMP"
if [ "$STATUS_HTTP" != "200" ]; then
echo "::error::GET /commits/${HEAD_SHA}/status returned HTTP ${STATUS_HTTP} — cannot evaluate required checks."
exit 1
fi
# FAIL-CLOSED: a 200 status response missing the 'statuses' array, or with
# 'statuses' set to a non-array type (null/string/object), must NOT be treated
# as "no checks" — that would silently declare all checks green.
if ! echo "$STATUS" | jq -e '(.statuses | type) == "array"' >/dev/null; then
echo "::error::GET /commits/${HEAD_SHA}/status returned HTTP 200 but 'statuses' is missing or not an array — cannot evaluate required checks."
exit 1
fi
#
# Pagination (status-pagination RCA, #2440-family): the combined
# /commits/{sha}/status endpoint caps its embedded `statuses` array at the
# Gitea default page size (~30). On a high-churn PR an older-but-still-current
# required-context SUCCESS row is pushed PAST that cap, so reading the combined
# view would record that context as `missing` and emit a FALSE-POSITIVE
# force-merge. We instead page through the dedicated /commits/{sha}/statuses
# list to EXHAUSTION (until a short/empty page), accumulating every row.
#
# Fail-closed is preserved end to end: any non-200 page, or a page whose body
# is not a JSON array, aborts with exit 1 (we never treat an unreadable/partial
# page as "no checks"). A genuinely-absent required context appears on NO page,
# so CHECK_STATE has no entry for it → `${...:-missing}` below keeps it
# `missing` → it is still counted as not-green. No fail-open path is added.
PER_PAGE=100
page=1
ALL_STATUSES_TMP=$(mktemp)
printf '[]' > "$ALL_STATUSES_TMP" # accumulator: a single JSON array of rows
while :; do
STATUS_TMP=$(mktemp)
STATUS_HTTP=$(curl -sS -o "$STATUS_TMP" -w '%{http_code}' -H "$AUTH" \
"${API}/repos/${OWNER}/${NAME}/commits/${HEAD_SHA}/statuses?page=${page}&limit=${PER_PAGE}")
PAGE_BODY=$(cat "$STATUS_TMP")
rm -f "$STATUS_TMP"
if [ "$STATUS_HTTP" != "200" ]; then
rm -f "$ALL_STATUSES_TMP"
echo "::error::GET /commits/${HEAD_SHA}/statuses?page=${page} returned HTTP ${STATUS_HTTP} — cannot evaluate required checks."
exit 1
fi
# FAIL-CLOSED: the /statuses endpoint returns a bare JSON array. A non-array
# body (null/object/string) means the response is malformed — we must NOT
# treat that as "no checks", which would silently declare all checks green.
if ! echo "$PAGE_BODY" | jq -e 'type == "array"' >/dev/null 2>&1; then
rm -f "$ALL_STATUSES_TMP"
echo "::error::GET /commits/${HEAD_SHA}/statuses?page=${page} returned HTTP 200 but body is not a JSON array — cannot evaluate required checks."
exit 1
fi
PAGE_COUNT=$(echo "$PAGE_BODY" | jq 'length')
# Append this page's rows to the accumulator (insertion order is preserved
# but NOT relied upon — the collapse below selects max-by-id per context).
COMBINED=$(jq -s '.[0] + .[1]' "$ALL_STATUSES_TMP" <(echo "$PAGE_BODY"))
printf '%s' "$COMBINED" > "$ALL_STATUSES_TMP"
# Short page (fewer than PER_PAGE rows) ⇒ last page ⇒ stop.
if [ "$PAGE_COUNT" -lt "$PER_PAGE" ]; then
break
fi
page=$((page + 1))
done
STATUS=$(cat "$ALL_STATUSES_TMP")
rm -f "$ALL_STATUSES_TMP"
declare -A CHECK_STATE
# Gitea's /commits/{sha}/statuses is roughly newest-first but NOT strictly
# monotonic by id (observed first ids 157,155,156,… — local inversions from
# re-runs and page boundaries), so neither first- nor last-occurrence reliably
# yields the current row. Select the MAX-id row per context explicitly
# (order-independent), matching prod-auto-deploy.py's latest_status_for_context.
while IFS=$'\t' read -r ctx state; do
[ -n "$ctx" ] && CHECK_STATE[$ctx]="$state"
done < <(echo "$STATUS" | jq -r '.statuses | .[] | "\(.context)\t\(.status)"')
done < <(echo "$STATUS" | jq -r 'group_by(.context) | map(max_by(.id)) | .[] | "\(.context)\t\(.status)"')
# 4. For each required check, was it green at merge? YAML block scalars
# (`|`) leave a trailing newline; skip blank/whitespace-only lines.
+77 -11
View File
@@ -95,17 +95,27 @@ def build_plan(env: dict[str, str]) -> dict:
def latest_status_for_context(statuses: list[dict], context: str) -> dict | None:
"""Return the first matching status.
"""Return the NEWEST status row for ``context`` (highest ``id``).
Gitea's combined-status response is newest-first in practice. The merge
queue relies on the same contract; keeping the selector explicit makes
stale duplicate contexts easy to test.
This must work for BOTH orderings Gitea exposes: the combined
``/status`` view is newest-first, but the exhaustively-paginated
``/statuses`` list (see ``fetch_all_statuses``) is ascending id order
(oldest-first). Selecting by max ``id`` collapses duplicate context rows
to the current one regardless of input order, so a stale earlier run can
never shadow the latest result. Rows without an ``id`` are treated as
oldest (id -1) so a well-formed newer row always wins.
"""
newest: dict | None = None
newest_id = -1
for status in statuses:
if status.get("context") == context:
return status
return None
if status.get("context") != context:
continue
raw_id = status.get("id")
sid = raw_id if isinstance(raw_id, int) else -1
if newest is None or sid >= newest_id:
newest = status
newest_id = sid
return newest
def ci_context_state(statuses: list[dict], context: str) -> str:
@@ -351,6 +361,55 @@ def _api_json(url: str, token: str) -> dict:
raise RuntimeError(f"GET {url} -> HTTP {exc.code}: {body}") from exc
def _api_json_list(url: str, token: str) -> list:
"""GET a Gitea list endpoint and return the JSON array.
Like ``_api_json`` but asserts the body is a list. Fail-closed: a non-list
body (or HTTP error) raises so the caller never mistakes an unreadable page
for "no more statuses" and silently truncates the required-context scan.
"""
req = urllib.request.Request(url, headers={"Authorization": f"token {token}"})
try:
with urllib.request.urlopen(req, timeout=20) as resp:
body = json.loads(resp.read())
except urllib.error.HTTPError as exc:
detail = exc.read().decode("utf-8", errors="replace")[:500]
raise RuntimeError(f"GET {url} -> HTTP {exc.code}: {detail}") from exc
if not isinstance(body, list):
raise RuntimeError(f"GET {url} -> expected JSON array, got {type(body).__name__}")
return body
def fetch_all_statuses(host: str, repo: str, sha: str, token: str, page_size: int = 100) -> list[dict]:
"""Return EVERY commit-status row for ``sha``, paginating to exhaustion.
The combined ``/commits/{sha}/status`` endpoint caps its embedded
``statuses`` array at the Gitea default page size (~30). On a high-churn
commit, an older-but-still-current required-context SUCCESS row is pushed
PAST that cap, so a reader of the combined view sees the required context
as ``missing`` and either blocks (force-merge audit) or waits forever
(this deploy gate). We instead walk ``/commits/{sha}/statuses`` page by
page until a short/empty page, accumulating ALL rows.
Fail-closed: any page that errors or is not a list raises (see
``_api_json_list``) — we never degrade to a partial list and call a deploy
green. A genuinely-absent required context simply never appears on ANY
page, so the caller's ``ci_context_state`` still reports ``missing`` and
the gate stays closed.
"""
base = f"https://{host}/api/v1/repos/{repo}/commits/{sha}/statuses"
results: list[dict] = []
page = 1
while True:
page_url = f"{base}?page={page}&limit={page_size}"
rows = _api_json_list(page_url, token)
results.extend(r for r in rows if isinstance(r, dict))
if len(rows) < page_size:
break
page += 1
return results
def _api_json_optional(url: str, token: str) -> tuple[int, dict | None]:
req = urllib.request.Request(url, headers={"Authorization": f"token {token}"})
try:
@@ -472,12 +531,19 @@ def wait_for_ci_context(env: dict[str, str]) -> str:
if not token:
raise ValueError("GITEA_TOKEN is required to wait for CI status")
url = f"https://{host}/api/v1/repos/{repo}/commits/{sha}/status"
deadline = time.time() + timeout
last_states: dict[str, str] = {}
while time.time() <= deadline:
body = _api_json(url, token)
statuses = body.get("statuses") or []
# Read the FULL, exhaustively-paginated /statuses list — NOT the
# combined /status view, whose embedded `statuses` array is capped at
# the Gitea page size (~30). On a high-churn commit a required-context
# SUCCESS row lands past that cap and the combined view would report
# it `missing`, so this gate would wait until timeout and refuse a
# legitimate prod deploy. Fetching every page closes that hole.
# Fail-closed is preserved: a genuinely-absent required context is on
# NO page, so ci_context_state() still returns "missing" → never
# satisfied → the deploy stays blocked.
statuses = fetch_all_statuses(host, repo, sha, token)
states = {context: ci_context_state(statuses, context) for context in contexts}
for context, state in states.items():
if state != last_states.get(context):
@@ -115,5 +115,79 @@ T16=$(validate_required_checks_json "main" '{"main":"CI / all-required"}')
[ "$T16" = "false" ] || fail "T16: string branch entry should fail"
pass "T16: string branch entry fails"
# ---------------------------------------------------------------------------
# T17+ — /statuses pagination (status-pagination RCA, #2440-family).
# The reader now pages /commits/{sha}/statuses to exhaustion instead of reading
# the capped combined /status view. These lock the page-accumulation,
# newest-wins collapse, short-page stop, and fail-closed contracts.
# ---------------------------------------------------------------------------
# Page-body type validator used per page (bare array, not an object).
validate_page_is_array() { jq -e 'type == "array"' >/dev/null 2>&1 && echo true || echo false; }
# newest-wins collapse: mirror the script's max-by-id jq (order-independent).
collapse_newest_per_context() {
declare -A CS
while IFS=$'\t' read -r ctx state; do
[ -n "$ctx" ] && CS[$ctx]="$state"
done < <(jq -r 'group_by(.context) | map(max_by(.id)) | .[] | "\(.context)\t\(.status)"')
state="${CS[CI / all-required (push)]:-missing}"
echo "$state"
}
# T17 — a bare JSON array page passes the per-page array check.
T17=$(echo '[{"context":"c1","status":"success"}]' | validate_page_is_array)
[ "$T17" = "true" ] || fail "T17: bare array page should pass array check"
pass "T17: bare array page passes array check"
# T18 — a non-array page (object) fails the per-page array check → fail-closed.
T18=$(echo '{"statuses":[]}' | validate_page_is_array)
[ "$T18" = "false" ] || fail "T18: object page should fail array check (fail-closed)"
pass "T18: object page fails array check (fail-closed)"
# T19 — required SUCCESS on PAGE 2 is FOUND after accumulation (not missing).
# page1: 100 noise rows (older ids); page2: the required-context success.
PAGE1=$(jq -nc '[range(0;100) | {id:., context:("noise-\(.) (push)"), status:"pending"}]')
PAGE2='[{"id":200,"context":"CI / all-required (push)","status":"success"}]'
# Accumulation matching the script: two-arg `jq -s '.[0] + .[1]'` over the
# running accumulator and the new page.
ACCUM=$(jq -s '.[0] + .[1]' <(echo "$PAGE1") <(echo "$PAGE2"))
LEN=$(echo "$ACCUM" | jq 'length')
[ "$LEN" = "101" ] || fail "T19: accumulated length should be 101, got $LEN"
RESULT=$(echo "$ACCUM" | collapse_newest_per_context)
[ "$RESULT" = "success" ] || fail "T19: required success on page2 must be FOUND, got '$RESULT'"
pass "T19: required success on page2 is found after pagination"
# T20 — genuinely-absent required context across all pages stays 'missing'
# → fail-closed (counted as not-green, flags the force-merge).
ABSENT=$(jq -nc '[range(0;100) | {id:., context:("noise-\(.) (push)"), status:"success"}]')
RESULT2=$(echo "$ABSENT" | collapse_newest_per_context)
[ "$RESULT2" = "missing" ] || fail "T20: absent required context must stay 'missing', got '$RESULT2'"
pass "T20: genuinely-absent required context stays missing (fail-closed)"
# T21 — non-monotonic order: newest id (157, neither first nor last in list)
# a NEWER success row (oldest-first append → last overwrite wins).
DUP='[{"id":155,"context":"CI / all-required (push)","status":"pending"},
{"id":157,"context":"CI / all-required (push)","status":"success"},
{"id":125,"context":"CI / all-required (push)","status":"failure"}]'
RESULT3=$(echo "$DUP" | collapse_newest_per_context)
[ "$RESULT3" = "success" ] || fail "T21: newest (success) must win over older (failure), got '$RESULT3'"
pass "T21: newest row per context wins after pagination collapse"
# T22 — short-page stop condition: a page with fewer than PER_PAGE rows ends
# the loop. Emulate the numeric comparison the script uses.
PER_PAGE=100
PAGE_COUNT=$(echo "$PAGE2" | jq 'length') # 1 row
if [ "$PAGE_COUNT" -lt "$PER_PAGE" ]; then SHORT=stop; else SHORT=continue; fi
[ "$SHORT" = "stop" ] || fail "T22: short page should stop pagination"
pass "T22: short page stops pagination loop"
# T23 — a full page (== PER_PAGE) continues the loop.
FULL=$(jq -nc '[range(0;100) | {id:., context:"x", status:"success"}]')
FULL_COUNT=$(echo "$FULL" | jq 'length')
if [ "$FULL_COUNT" -lt "$PER_PAGE" ]; then CONT=stop; else CONT=continue; fi
[ "$CONT" = "continue" ] || fail "T23: full page should continue pagination"
pass "T23: full page continues pagination loop"
echo
echo "ALL AUDIT-FORCE-MERGE CHECKS PASSED"
+134 -5
View File
@@ -105,16 +105,25 @@ def test_build_plan_disable_flag_short_circuits_before_credentials():
assert plan["disabled_reason"] == "PROD_AUTO_DEPLOY_DISABLED=true"
def test_latest_status_for_context_uses_first_matching_status():
def test_latest_status_for_context_picks_newest_by_id_regardless_of_order():
# The exhaustively-paginated /statuses list is ascending id order
# (oldest-first), the opposite of the combined /status view. The selector
# must collapse duplicate context rows to the NEWEST (max id) so a stale
# earlier run never shadows the current result, whichever way they arrive.
statuses = [
{"context": "CI / all-required (push)", "status": "pending"},
{"context": "CI / all-required (pull_request)", "status": "success"},
{"context": "CI / all-required (push)", "status": "success"},
{"id": 10, "context": "CI / all-required (push)", "status": "pending"},
{"id": 11, "context": "CI / all-required (pull_request)", "status": "success"},
{"id": 12, "context": "CI / all-required (push)", "status": "success"},
]
latest = prod.latest_status_for_context(statuses, "CI / all-required (push)")
assert latest == {"context": "CI / all-required (push)", "status": "pending"}
assert latest == {"id": 12, "context": "CI / all-required (push)", "status": "success"}
# Same rows shuffled (newest-first, as the combined view would deliver)
# must still resolve to the same newest row.
latest_rev = prod.latest_status_for_context(list(reversed(statuses)), "CI / all-required (push)")
assert latest_rev == {"id": 12, "context": "CI / all-required (push)", "status": "success"}
def test_ci_context_state_handles_missing_and_gitea_status_key():
@@ -612,3 +621,123 @@ def test_superseded_by_none_for_latest_job_so_it_still_rolls(monkeypatch):
)
is None
)
# ---------------------------------------------------------------------------
# /statuses pagination — required-context SUCCESS on page 2+ must be FOUND,
# genuinely-absent context must STILL fail-closed (no fail-open).
# Regression for the single-page-status bug (#2440-family, pagination RCA):
# the combined /status view caps `statuses` at ~30, so on a high-churn commit
# the still-current required-context row is pushed past page 1 and the reader
# falsely reports it `missing`.
# ---------------------------------------------------------------------------
def _paged_statuses_stub(pages):
"""Return a fake _api_json_list that serves `pages` keyed by ?page=N."""
def fake(url, _token):
# url looks like .../statuses?page=N&limit=100
page = 1
for part in url.split("?", 1)[-1].split("&"):
if part.startswith("page="):
page = int(part.split("=", 1)[1])
return pages.get(page, [])
return fake
def test_fetch_all_statuses_finds_required_success_on_page_two(monkeypatch):
# Page 1 is a full 100 rows of unrelated/older churn; the required-context
# SUCCESS only appears on page 2. A single-page reader would miss it.
page1 = [
{"id": i, "context": f"noise-{i} (push)", "status": "pending"}
for i in range(100)
]
page2 = [
{"id": 200, "context": "CI / all-required (push)", "status": "success"},
{"id": 201, "context": "Secret scan / Scan diff for credential-shaped strings (push)",
"status": "success"},
]
monkeypatch.setattr(prod, "_api_json_list", _paged_statuses_stub({1: page1, 2: page2}))
rows = prod.fetch_all_statuses("git.moleculesai.app", "molecule-ai/molecule-core", "a" * 40, "tok")
# Must have walked to page 2 and accumulated every row.
assert len(rows) == 102
assert prod.ci_context_state(rows, "CI / all-required (push)") == "success"
assert (
prod.ci_context_state(
rows, "Secret scan / Scan diff for credential-shaped strings (push)"
)
== "success"
)
def test_fetch_all_statuses_genuinely_absent_context_stays_missing(monkeypatch):
# The required context is on NO page → fail-closed: ci_context_state must
# report "missing", which context_is_satisfied() rejects → gate stays shut.
page1 = [
{"id": i, "context": f"noise-{i} (push)", "status": "success"}
for i in range(100)
]
page2 = [{"id": 200, "context": "some-other (push)", "status": "success"}]
monkeypatch.setattr(prod, "_api_json_list", _paged_statuses_stub({1: page1, 2: page2}))
rows = prod.fetch_all_statuses("git.moleculesai.app", "molecule-ai/molecule-core", "b" * 40, "tok")
state = prod.ci_context_state(rows, "CI / all-required (push)")
assert state == "missing"
assert prod.context_is_satisfied(state) is False
def test_fetch_all_statuses_fail_closed_on_page_error(monkeypatch):
# A page that raises (unreadable) must propagate, never silently truncate
# the scan and let the caller treat a partial list as complete.
def boom(url, _token):
if "page=2" in url:
raise RuntimeError("GET .../statuses?page=2 -> HTTP 502: bad gateway")
return [{"id": i, "context": f"n-{i}", "status": "success"} for i in range(100)]
monkeypatch.setattr(prod, "_api_json_list", boom)
try:
prod.fetch_all_statuses("h", "r", "c" * 40, "tok")
except RuntimeError as exc:
assert "502" in str(exc)
else:
raise AssertionError("expected page-2 error to propagate (fail-closed)")
def test_wait_for_ci_context_succeeds_when_required_status_is_past_page_one(monkeypatch):
# End-to-end: the gate reads the EXHAUSTIVE list, so a required SUCCESS that
# only exists past page 1 lets the deploy proceed instead of timing out.
full = [
{"id": i, "context": f"noise-{i} (push)", "status": "success"}
for i in range(100)
] + [
{"id": 500, "context": "CI / all-required (push)", "status": "success"},
{"id": 501, "context": "Secret scan / Scan diff for credential-shaped strings (push)",
"status": "success"},
]
monkeypatch.setattr(prod, "fetch_all_statuses", lambda *a, **k: full)
result = prod.wait_for_ci_context(
{"GITHUB_SHA": "d" * 40, "GITEA_TOKEN": "tok", "CI_STATUS_TIMEOUT_SECONDS": "30"}
)
assert result == "success"
def test_wait_for_ci_context_times_out_fail_closed_when_required_absent(monkeypatch):
# Genuinely-absent required context across all pages → never satisfied →
# the gate times out rather than green-lighting the deploy (no fail-open).
present_but_irrelevant = [
{"id": 500, "context": "some-other (push)", "status": "success"},
]
monkeypatch.setattr(prod, "fetch_all_statuses", lambda *a, **k: present_but_irrelevant)
# Zero timeout + 0 interval → single poll then TimeoutError.
try:
prod.wait_for_ci_context(
{
"GITHUB_SHA": "e" * 40,
"GITEA_TOKEN": "tok",
"CI_STATUS_TIMEOUT_SECONDS": "1",
"CI_STATUS_POLL_INTERVAL_SECONDS": "1",
}
)
except TimeoutError as exc:
assert "missing" in str(exc)
else:
raise AssertionError("expected fail-closed TimeoutError, not a satisfied gate")