feat(ci): status-reaper rev2 sweeps last 10 main commits (closes stranded-status gap) #633

Merged
claude-ceo-assistant merged 2 commits from infra/status-reaper-rev2-sweep-recent-commits into main 2026-05-12 01:47:57 +00:00
2 changed files with 351 additions and 20 deletions

View File

@@ -19,18 +19,34 @@ What this script does, per `.gitea/workflows/status-reaper.yml` invocation:
downstream Gitea uses ` / ` as the workflow/job separator).
Classify each by whether `on:` contains a `push:` trigger.
-2. GET combined status for HEAD of WATCH_BRANCH.
+2. List the last N (=10) commits on WATCH_BRANCH via
+   GET /repos/{o}/{r}/commits?sha={branch}&limit={N}. rev2 sweeps
+   N commits per tick instead of HEAD only: schedule workflows
+   post `failure` to whatever SHA was HEAD when they COMPLETED, so
+   by the next */5 tick main has often moved forward and the red
+   gets stranded on a stale commit (Phase 1+2 evidence: rev1 saw
+   `compensated:0` every tick across ~6 cycles).
-3. For each per-context status entry where:
-       state == "failure" AND context.endswith(" (push)")
-   Parse context as `<workflow_name> / <job_name> (push)`. Look up
-   workflow_name in the trigger map:
-   - missing → log ::notice:: and skip (conservative).
-   - has_push_trigger=True → preserve (would mask real signal).
-   - has_push_trigger=False → POST a compensating
-     `state=success` status to /statuses/{sha} with the same
-     context (Gitea de-dups by context) and a description that
-     documents the workaround + this script's path.
+3. For EACH SHA in the list:
+   - GET combined commit status. Per-SHA error isolation
+     (refinement #7): if this call raises ApiError or any 5xx,
+     LOG `::warning::` + continue to the next SHA. Different from
+     the single-HEAD pre-rev2 path, where fail-loud was correct;
+     the sweep is best-effort across historical commits, so one
+     transient blip on a stale SHA must not strand reds on the
+     OTHER stale SHAs.
+   - If combined.state == "success": skip. Cost optimization
+     (refinement #2); the common case (most commits are green).
+   - Otherwise iterate per-context entries. For each entry where:
+         state == "failure" AND context.endswith(" (push)")
+     Parse context as `<workflow_name> / <job_name> (push)`.
+     Look up workflow_name in the trigger map:
+     - missing → log ::notice:: and skip (conservative).
+     - has_push_trigger=True → preserve (real defect signal).
+     - has_push_trigger=False → POST a compensating
+       `state=success` status to /statuses/{sha} with the same
+       context (Gitea de-dups by context) and a description
+       documenting the workaround + this script's path.
4. Exit 0. Re-running is idempotent: Gitea's commit-status table
   stores the LATEST state-per-context, so the success POST sticks.
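A minimal sketch of the step-3 context parsing above (hypothetical helper for illustration, not part of this diff; it only assumes the `<workflow_name> / <job_name> (push)` shape and the ` / ` separator noted in step 1):

```python
# Hypothetical illustration of the step-3 parsing rule; not part of
# the PR. Assumes Gitea's `<workflow_name> / <job_name> (push)` shape.
def parse_push_context(context: str) -> tuple[str, str] | None:
    """Return (workflow_name, job_name), or None if unparseable."""
    if not context.endswith(" (push)"):
        return None  # non-push suffix: never a compensation candidate
    stem = context[: -len(" (push)")]
    workflow_name, sep, job_name = stem.partition(" / ")
    if not sep or not workflow_name or not job_name:
        return None  # unparseable: counted and preserved, never POSTed
    return workflow_name, job_name

assert parse_push_context("drift / drift (push)") == ("drift", "drift")
assert parse_push_context("drift / drift (pull_request)") is None
```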
@@ -401,21 +417,29 @@ def reap(
sha: str,
*,
dry_run: bool = False,
-) -> dict[str, int]:
+) -> dict[str, Any]:
"""Walk `combined.statuses[]` and compensate where appropriate.
+Per-SHA worker. The multi-SHA orchestrator (`reap_branch`) calls
+this once per stale main commit each tick.
Returns counters for observability:
{compensated, preserved_real_push, preserved_unknown,
preserved_non_failure, preserved_non_push_suffix,
-     preserved_unparseable}
+     preserved_unparseable,
+     compensated_contexts: [<context>, ...]}
+`compensated_contexts` is rev2-added so `reap_branch` can build
+`compensated_per_sha` without re-deriving it from the POST stream.
"""
-counters = {
+counters: dict[str, Any] = {
"compensated": 0,
"preserved_real_push": 0,
"preserved_unknown": 0,
"preserved_non_failure": 0,
"preserved_non_push_suffix": 0,
"preserved_unparseable": 0,
"compensated_contexts": [],
}
statuses = combined.get("statuses") or []
@@ -464,10 +488,136 @@ def reap(
sha, context, s.get("target_url"), dry_run=dry_run
)
counters["compensated"] += 1
counters["compensated_contexts"].append(context)
return counters
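For orientation, a hedged example of the per-SHA counters `reap()` now returns (values illustrative, not from a real tick):

```python
# Illustrative reap() return for one stale SHA (hypothetical values):
# one schedule-only red compensated, every other context preserved.
example_counters = {
    "compensated": 1,
    "preserved_real_push": 0,
    "preserved_unknown": 1,     # workflow missing from the trigger map
    "preserved_non_failure": 4,
    "preserved_non_push_suffix": 2,
    "preserved_unparseable": 0,
    "compensated_contexts": ["drift / drift (push)"],
}
```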
# --------------------------------------------------------------------------
# rev2: multi-SHA sweep over the last N commits on WATCH_BRANCH
# --------------------------------------------------------------------------
# How many main commits to sweep per tick. Sized to cover a burst-merge
# window where multiple PRs land in the 5-min interval between reaper
# ticks. Older reds falling off the window is acceptable — they were
# already stale enough that the schedule-run that posted them has long
# since been overwritten by a real push trigger. See
# `reference_post_suspension_pipeline` for the merge-cadence baseline.
DEFAULT_SWEEP_LIMIT = 10
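Back-of-envelope for that sizing, under assumed cadence numbers (the real baseline lives in the reference above):

```python
# Hypothetical sizing sketch; the cadence figures below are assumptions,
# not measurements from this PR. The window must cover every commit that
# can land on main between two consecutive reaper ticks.
tick_interval_min = 5        # the reaper's */5 schedule
assumed_burst_merges = 8     # assumed worst-case merges per tick window
sweep_limit = 10             # DEFAULT_SWEEP_LIMIT above
assert assumed_burst_merges < sweep_limit
```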
def list_recent_commit_shas(branch: str, limit: int) -> list[str]:
"""List the most recent `limit` commit SHAs on `branch`, newest
first.
Wraps GET /repos/{o}/{r}/commits?sha={branch}&limit={limit}. Gitea
1.22.6 returns a JSON list of commit objects each with a `sha` key
(verified via vendor-truth probe 2026-05-11 against
git.moleculesai.app `feedback_smoke_test_vendor_truth_not_shape_match`).
Raises ApiError on non-2xx OR on unexpected response shape. This is
a HARD halt: without the commit list the sweep can't proceed. (The
per-SHA error isolation downstream is a different concern: tolerating
a transient 5xx on ONE commit's status is best-effort; losing the
commit list itself means we don't even know which commits to try.)
"""
_, body = api(
"GET",
f"/repos/{OWNER}/{NAME}/commits",
query={"sha": branch, "limit": str(limit)},
)
if not isinstance(body, list):
raise ApiError(
f"commits listing for {branch} not a JSON array "
f"(got {type(body).__name__})"
)
shas: list[str] = []
for entry in body:
if not isinstance(entry, dict):
continue
sha = entry.get("sha")
if isinstance(sha, str) and len(sha) >= 7:
shas.append(sha)
if not shas:
raise ApiError(
f"commits listing for {branch} returned no usable SHAs"
)
return shas
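Hedged usage sketch for the helper above (in this PR it is only called from `reap_branch()` below):

```python
# Hypothetical standalone usage; newest-first ordering per the API, so
# index 0 is HEAD and the tail is the stale sweep window.
shas = list_recent_commit_shas("main", DEFAULT_SWEEP_LIMIT)
head_sha, stale_shas = shas[0], shas[1:]  # HEAD plus up to 9 stale SHAs
```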
def reap_branch(
workflow_trigger_map: dict[str, bool],
branch: str,
*,
limit: int = DEFAULT_SWEEP_LIMIT,
dry_run: bool = False,
) -> dict[str, Any]:
"""Sweep the last `limit` commits on `branch`, applying `reap()`
to each (with per-SHA error isolation).
Returns aggregated counters PLUS rev2 observability fields:
- scanned_shas: how many SHAs we actually iterated
- compensated_per_sha: {<sha_full>: [<context>, ...]}; only
  SHAs that actually got at least one compensation are included
"""
shas = list_recent_commit_shas(branch, limit)
aggregate: dict[str, Any] = {
"scanned_shas": 0,
"compensated": 0,
"preserved_real_push": 0,
"preserved_unknown": 0,
"preserved_non_failure": 0,
"preserved_non_push_suffix": 0,
"preserved_unparseable": 0,
"compensated_per_sha": {},
}
for sha in shas:
aggregate["scanned_shas"] += 1
# Per-SHA error isolation (refinement #7). One transient blip
# on a historical commit must NOT abort the whole tick — the
# OTHER stale SHAs may still hold strandable reds.
try:
combined = get_combined_status(sha)
except ApiError as e:
print(
f"::warning::get_combined_status({sha[:10]}) failed; "
f"skipping this SHA: {e}"
)
continue
# Cost optimization (refinement #2): the common case is a green
# commit. Skip the per-context loop entirely when combined is
# already success — saves a tight loop over ~20 statuses per SHA
# on green commits, the dominant majority.
if combined.get("state") == "success":
continue
per_sha = reap(
workflow_trigger_map, combined, sha, dry_run=dry_run
)
# Aggregate scalar counters.
for key in (
"compensated",
"preserved_real_push",
"preserved_unknown",
"preserved_non_failure",
"preserved_non_push_suffix",
"preserved_unparseable",
):
aggregate[key] += per_sha[key]
# Record per-SHA compensated contexts (only when non-empty —
# keep the summary readable when most SHAs are no-ops).
contexts = per_sha.get("compensated_contexts") or []
if contexts:
aggregate["compensated_per_sha"][sha] = list(contexts)
return aggregate
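A hedged sketch of the aggregate one tick might return (illustrative values; real `compensated_per_sha` keys are full 40-char SHAs):

```python
# Illustrative reap_branch() result for one tick (hypothetical values):
# 10 SHAs scanned, one stranded red compensated on a stale commit.
example_aggregate = {
    "scanned_shas": 10,
    "compensated": 1,
    "preserved_real_push": 1,   # a real push failure stays red
    "preserved_unknown": 0,
    "preserved_non_failure": 3,
    "preserved_non_push_suffix": 0,
    "preserved_unparseable": 0,
    "compensated_per_sha": {
        "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb": [
            "drift / drift (push)",
        ],
    },
}
```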
def main() -> int:
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
@@ -475,6 +625,15 @@ def main() -> int:
action="store_true",
help="Skip the compensating POST; print what would be done.",
)
parser.add_argument(
"--limit",
type=int,
default=DEFAULT_SWEEP_LIMIT,
help=(
"How many recent commits on WATCH_BRANCH to sweep per tick "
f"(default: {DEFAULT_SWEEP_LIMIT})."
),
)
args = parser.parse_args()
_require_runtime_env()
@@ -486,11 +645,11 @@
f"class-O candidates={sum(1 for v in workflow_trigger_map.values() if not v)}"
)
-sha = get_head_sha(WATCH_BRANCH)
-combined = get_combined_status(sha)
-counters = reap(
-    workflow_trigger_map, combined, sha, dry_run=args.dry_run
+counters = reap_branch(
+    workflow_trigger_map,
+    WATCH_BRANCH,
+    limit=args.limit,
+    dry_run=args.dry_run,
)
# Observability: print one JSON line summarising the tick. Loki
@@ -499,9 +658,9 @@
"status-reaper summary: "
+ json.dumps(
{
"sha": sha,
"branch": WATCH_BRANCH,
"dry_run": args.dry_run,
"limit": args.limit,
**counters,
},
sort_keys=True,
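For Loki-side grepping, a hedged example of the resulting one-line summary (hypothetical values; key order comes from `sort_keys=True`, and the SHA key is shortened here with "..." purely for readability):

```
status-reaper summary: {"branch": "main", "compensated": 1, "compensated_per_sha": {"bbbb...": ["drift / drift (push)"]}, "dry_run": false, "limit": 10, "preserved_non_failure": 3, "preserved_non_push_suffix": 0, "preserved_real_push": 1, "preserved_unknown": 0, "preserved_unparseable": 0, "scanned_shas": 10}
```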

View File

@@ -601,3 +601,175 @@ def test_scan_workflows_missing_dir_returns_empty(sr_module, tmp_path, capsys):
assert out == {}
captured = capsys.readouterr()
assert "::warning::workflows dir not found" in captured.out
# --------------------------------------------------------------------------
# rev2: multi-SHA sweep — `reap_branch()` walks last N main commits
# --------------------------------------------------------------------------
# Phase 1+2 evidence (orchestrator + hongming-pc2): rev1 sees `compensated:0`
# every tick because the schedule workflow posts `failure` to whatever SHA
# was HEAD when it COMPLETED. By the next */5 tick, main has often moved
# forward, so the single-HEAD reaper misses the stranded red. rev2 sweeps
# the last 10 commits each tick. See `reference_post_suspension_pipeline`
# and parent rev1 PR #618 for context.
SHA_A = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
SHA_B = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"
SHA_C = "cccccccccccccccccccccccccccccccccccccccc"
def test_reap_sweeps_n_shas_smoke(sr_module, monkeypatch):
"""rev2 contract: sweep last 10 (or N) main commits, GET combined
status for EACH. Smoke: with 3 stub SHAs, each is GET'd exactly once.
"""
gets: list[str] = []
posts: list[tuple[str, dict]] = []
def fake_api(method, path, *, body=None, query=None, expect_json=True):
if method == "GET" and path.endswith("/commits"):
# commits listing — return 3 fake commit objects
return (200, [{"sha": SHA_A}, {"sha": SHA_B}, {"sha": SHA_C}])
if method == "GET" and "/commits/" in path and path.endswith("/status"):
sha = path.split("/commits/")[1].split("/status")[0]
gets.append(sha)
# All combined=success → cost-optimization short-circuit
return (200, {"state": "success", "statuses": []})
if method == "POST":
posts.append((path, body))
return (201, {})
raise AssertionError(f"unexpected api call: {method} {path}")
monkeypatch.setattr(sr_module, "api", fake_api)
workflow_map = {"x": False}
counters = sr_module.reap_branch(
workflow_map, "main", limit=10, dry_run=False
)
# Each of the 3 SHAs returned by /commits should be GET'd once.
assert gets == [SHA_A, SHA_B, SHA_C]
# No POST (everything was combined=success).
assert posts == []
# Counters reflect what we saw.
assert counters["scanned_shas"] == 3
assert counters["compensated"] == 0
assert counters["compensated_per_sha"] == {}
def test_reap_skips_combined_success_shas(sr_module, monkeypatch):
"""rev2 cost-optimization (refinement #2): when combined==success for
a SHA, do NOT iterate per-context statuses; move on to next SHA.
Mock 2 SHAs with combined=success + 1 with combined=failure; only
the failure-SHA's statuses get the per-context loop applied.
"""
per_context_iterated_for: list[str] = []
posts: list[tuple[str, dict]] = []
failure_statuses = [
{
"context": "drift / drift (push)",
"state": "failure",
"target_url": "https://example.test/run/42",
}
]
def fake_api(method, path, *, body=None, query=None, expect_json=True):
if method == "GET" and path.endswith("/commits"):
return (200, [{"sha": SHA_A}, {"sha": SHA_B}, {"sha": SHA_C}])
if method == "GET" and "/commits/" in path and path.endswith("/status"):
sha = path.split("/commits/")[1].split("/status")[0]
if sha == SHA_B:
# Mark this SHA as the failure one — return per-context
# statuses that would compensate if iterated.
return (200, {"state": "failure", "statuses": failure_statuses})
# Others are combined=success — must short-circuit.
return (200, {"state": "success", "statuses": failure_statuses})
if method == "POST":
# If a POST hits a non-failure SHA, the short-circuit failed.
posts.append((path, body))
return (201, {})
raise AssertionError(f"unexpected api call: {method} {path}")
monkeypatch.setattr(sr_module, "api", fake_api)
# Workflow trigger map: `drift` is schedule-only (compensable).
workflow_map = {"drift": False}
counters = sr_module.reap_branch(
workflow_map, "main", limit=10, dry_run=False
)
# Only SHA_B (the combined=failure one) should be compensated.
assert counters["compensated"] == 1
assert counters["scanned_shas"] == 3
assert SHA_B in counters["compensated_per_sha"]
assert counters["compensated_per_sha"][SHA_B] == ["drift / drift (push)"]
# SHA_A and SHA_C must NOT appear in compensated_per_sha — their
# per-context loop was skipped via the combined=success short-circuit.
assert SHA_A not in counters["compensated_per_sha"]
assert SHA_C not in counters["compensated_per_sha"]
# Exactly one POST: the compensation on SHA_B.
assert len(posts) == 1
assert posts[0][0] == f"/repos/owner/repo/statuses/{SHA_B}"
def test_reap_continues_on_per_sha_apierror(sr_module, monkeypatch, capsys):
"""rev2 refinement #7 (MOST CRITICAL): a transient ApiError or HTTP-5xx
on get_combined_status(SHA_X) must NOT fail the whole tick. Log + skip
SHA_X, continue with SHA_Y.
Different from the single-HEAD path (where fail-loud is correct): the
sweep is best-effort across historical commits, so one transient blip
on a stale SHA should not strand reds on the OTHER stale SHAs.
"""
posts: list[tuple[str, dict]] = []
def fake_api(method, path, *, body=None, query=None, expect_json=True):
if method == "GET" and path.endswith("/commits"):
return (200, [{"sha": SHA_A}, {"sha": SHA_B}])
if method == "GET" and "/commits/" in path and path.endswith("/status"):
sha = path.split("/commits/")[1].split("/status")[0]
if sha == SHA_A:
raise sr_module.ApiError(
f"GET /repos/owner/repo/commits/{SHA_A}/status "
f"-> HTTP 502: bad gateway"
)
# SHA_B returns normally with a failure to compensate.
return (
200,
{
"state": "failure",
"statuses": [
{
"context": "drift / drift (push)",
"state": "failure",
}
],
},
)
if method == "POST":
posts.append((path, body))
return (201, {})
raise AssertionError(f"unexpected api call: {method} {path}")
monkeypatch.setattr(sr_module, "api", fake_api)
workflow_map = {"drift": False}
# Must NOT raise — per-SHA error isolation contract.
counters = sr_module.reap_branch(
workflow_map, "main", limit=10, dry_run=False
)
# SHA_A was logged + skipped. SHA_B processed normally.
assert counters["scanned_shas"] == 2
assert counters["compensated"] == 1
assert SHA_B in counters["compensated_per_sha"]
assert SHA_A not in counters["compensated_per_sha"]
# Compensation POST landed on SHA_B only.
assert len(posts) == 1
assert posts[0][0] == f"/repos/owner/repo/statuses/{SHA_B}"
# The ApiError must be logged so a human auditing tick output can see
# WHICH SHA blipped and WHY.
captured = capsys.readouterr()
assert "::warning::" in captured.out or "::notice::" in captured.out
assert SHA_A[:10] in captured.out