From 8c2f9a068cc3e5f1b1713e949a7bcbdebb302a1b Mon Sep 17 00:00:00 2001 From: "Molecule AI Dev Engineer A (Kimi)" Date: Tue, 26 May 2026 10:42:16 +0000 Subject: [PATCH] watchdog: fix stale-issue closeout + pagination + status semantics (mc#1789) - Exhaust pagination in list_open_red_issues() (was hardcoded to 1 page). Backlog can exceed 50 open issues; old code missed stale issues. - Add SCHEDULED_CONTEXT_PATTERNS + _is_scheduled_context() helper. Scheduled jobs (Staging SaaS smoke, Continuous synthetic E2E, main-red-watchdog, ci-arm64-advisory) run on their own cadence and should not block closeout when required CI is actually green. - Fix run_once() close logic for combined=pending + required-green: close stale issues when no non-scheduled context is failed or still pending. This addresses the "main red issues never auto-close" symptom reported in mc#1789. - Move _entry_state() to module level and use it consistently in is_red() and run_once(). Gitea 1.22.6 per-entry key is `status`, not `state`; pre-rev4 code only read `state` and always got None. - Add 19 regression tests covering pagination, _entry_state, _is_scheduled_context, is_red cancel-cascade filter, and run_once close-behavior for green / pending-scheduled-only / pending-required / failure paths. Co-Authored-By: Claude Opus 4.7 --- .gitea/scripts/main-red-watchdog.py | 122 ++++++-- .../scripts/tests/test_main_red_watchdog.py | 282 ++++++++++++++++++ 2 files changed, 377 insertions(+), 27 deletions(-) create mode 100644 .gitea/scripts/tests/test_main_red_watchdog.py diff --git a/.gitea/scripts/main-red-watchdog.py b/.gitea/scripts/main-red-watchdog.py index eb04e1f49..c91310308 100755 --- a/.gitea/scripts/main-red-watchdog.py +++ b/.gitea/scripts/main-red-watchdog.py @@ -90,6 +90,15 @@ API = f"https://{GITEA_HOST}/api/v1" if GITEA_HOST else "" # match by exact title without parsing. TITLE_PREFIX = "[main-red]" +# Contexts that are scheduled or non-required — their pending/failure +# state should not block stale-issue closeout (mc#1789). +SCHEDULED_CONTEXT_PATTERNS = ( + "Staging SaaS smoke", + "Continuous synthetic E2E", + "main-red-watchdog", + "ci-arm64-advisory", +) + # Settling window (seconds) between initial red detection and the # pre-file recheck. The recheck filters out the two largest false- # positive classes seen in mc#1597..1630 (task #394, 2026-05-21): @@ -265,6 +274,11 @@ def get_combined_status(sha: str) -> dict: return body +def _entry_state(s: dict) -> str: + """Per-entry status key in Gitea 1.22.6 is `status`; fall back to `state`.""" + return s.get("status") or s.get("state") or "" + + def is_red(status: dict) -> tuple[bool, list[dict]]: """Return (is_red, failed_statuses). @@ -312,9 +326,6 @@ def is_red(status: dict) -> tuple[bool, list[dict]]: # "no per-context entries were in a red state" fallback even when # the combined-state correctly flagged red. See # `feedback_smoke_test_vendor_truth_not_shape_match`. - def _entry_state(s: dict) -> str: - return s.get("status") or s.get("state") or "" - def _is_cancel_cascade(s: dict) -> bool: """status=3 entry per Gitea 1.22.6 description-string contract. Match exactly (after strip) — substring match would catch @@ -353,6 +364,15 @@ def title_for(sha: str) -> str: return f"{TITLE_PREFIX} {REPO}: {sha[:10]}" +def _is_scheduled_context(context: str) -> bool: + """Return True if `context` is a known scheduled/non-required job. + + These contexts run on a schedule and should not block stale-issue + closeout when main's required CI has recovered (mc#1789). + """ + return any(pattern.lower() in context.lower() for pattern in SCHEDULED_CONTEXT_PATTERNS) + + def list_open_red_issues() -> list[dict]: """All open issues whose title starts with `[main-red] {repo}: `. @@ -362,23 +382,34 @@ def list_open_red_issues() -> list[dict]: file-or-update path to POST a duplicate — exactly the regression class the helper-raises contract closes. - Gitea issue search returns at most 50/page; we only need open - `[main-red]` issues which are by design ≤ 1 at any time per repo, - so a single page is enough. + Pagination is exhausted (mc#1789). The old "by design ≤ 1" invariant + was false — backlog can exceed 50 open issues. """ - _, results = api( - "GET", - f"/repos/{OWNER}/{NAME}/issues", - query={"state": "open", "type": "issues", "limit": "50"}, - ) - if not isinstance(results, list): - raise ApiError( - f"issue search returned non-list body (got {type(results).__name__})" - ) prefix = f"{TITLE_PREFIX} {REPO}: " - return [i for i in results if isinstance(i, dict) + all_issues: list[dict] = [] + page = 1 + limit = 50 + while True: + _, results = api( + "GET", + f"/repos/{OWNER}/{NAME}/issues", + query={"state": "open", "type": "issues", "limit": str(limit), "page": str(page)}, + ) + if not isinstance(results, list): + raise ApiError( + f"issue search returned non-list body (got {type(results).__name__})" + ) + matched = [ + i for i in results + if isinstance(i, dict) and isinstance(i.get("title"), str) - and i["title"].startswith(prefix)] + and i["title"].startswith(prefix) + ] + all_issues.extend(matched) + if len(results) < limit: + break + page += 1 + return all_issues def find_open_issue_for_sha(sha: str) -> dict | None: @@ -745,23 +776,60 @@ def run_once(*, dry_run: bool = False) -> int: f"{len(failed)} failed context(s)") file_or_update_red(sha, failed, debug, dry_run=dry_run) else: - # Green (or pending — pending is treated as not-red so we don't - # spam during the post-merge CI window). Close any stale issues - # from earlier SHAs only when we're actually green; pending - # means CI hasn't finished and the prior issue might still be - # accurate. - if status.get("state") == "success": + # Green or pending-with-no-real-failures. Close stale issues + # from earlier SHAs when required CI has recovered. + # + # mc#1789: main often sits at combined `pending` because + # scheduled/non-required contexts (Staging SaaS smoke, + # Continuous synthetic E2E, main-red-watchdog itself, + # ci-arm64-advisory) are still running. We close stale issues + # as long as no *non-scheduled* context has failed and no + # *non-scheduled* context is still pending — i.e. required CI + # is effectively green. + # + # The success-only gate is preserved for the canonical green + # path; the extended check below only fires when combined is + # `pending` but all required work is done. + combined_state = status.get("state") + if combined_state == "success": + should_close = True + close_reason = "GREEN" + else: + statuses = status.get("statuses") or [] + non_scheduled_pending = [ + s for s in statuses + if isinstance(s, dict) + and (_entry_state(s) == "pending") + and not _is_scheduled_context(s.get("context", "")) + ] + non_scheduled_failed = [ + s for s in statuses + if isinstance(s, dict) + and (_entry_state(s) in {"failure", "error"}) + and not _is_scheduled_context(s.get("context", "")) + ] + # Cancel-cascade already filtered by is_red(); red=False + # here means no real failures. We additionally check that + # no non-scheduled context is still pending. + should_close = not non_scheduled_pending and not non_scheduled_failed + close_reason = "pending-but-required-green" + + if should_close: closed = close_open_red_issues_for_other_shas(sha, dry_run=dry_run) if closed: emit_loki_event( "main_returned_to_green", sha, [], ) - print(f"::notice::main is GREEN at {sha[:10]} on {WATCH_BRANCH} " - f"(closed {closed} stale issue(s))") + print( + f"::notice::main is {close_reason} at {sha[:10]} on {WATCH_BRANCH} " + f"(closed {closed} stale issue(s))" + ) else: - print(f"::notice::main is PENDING at {sha[:10]} on {WATCH_BRANCH} " - f"(combined state={status.get('state')!r}; no action)") + print( + f"::notice::main has pending-or-failed required CI at {sha[:10]} " + f"on {WATCH_BRANCH} (combined state={combined_state!r}; no action)" + ) return 0 diff --git a/.gitea/scripts/tests/test_main_red_watchdog.py b/.gitea/scripts/tests/test_main_red_watchdog.py new file mode 100644 index 000000000..7b03099de --- /dev/null +++ b/.gitea/scripts/tests/test_main_red_watchdog.py @@ -0,0 +1,282 @@ +import importlib.util +import sys +from pathlib import Path +from unittest.mock import patch, MagicMock + +SCRIPT = Path(__file__).resolve().parents[1] / "main-red-watchdog.py" +spec = importlib.util.spec_from_file_location("main_red_watchdog", SCRIPT) +wd = importlib.util.module_from_spec(spec) +sys.modules[spec.name] = wd +spec.loader.exec_module(wd) + +# Module-level constants are loaded from env at import time; set them +# explicitly so unit tests can import without the full env contract. +wd.GITEA_TOKEN = "fake-token" +wd.GITEA_HOST = "git.example.com" +wd.REPO = "molecule-ai/molecule-core" +wd.OWNER = "molecule-ai" +wd.NAME = "molecule-core" +wd.WATCH_BRANCH = "main" +wd.RED_LABEL = "tier:high" +wd.API = "https://git.example.com/api/v1" + + +# --------------------------------------------------------------------------- +# _is_scheduled_context +# --------------------------------------------------------------------------- + +def test_is_scheduled_context_matches_staging_saas_smoke(): + assert wd._is_scheduled_context("Staging SaaS smoke") is True + + +def test_is_scheduled_context_matches_case_insensitive(): + assert wd._is_scheduled_context("continuous synthetic e2e") is True + + +def test_is_scheduled_context_no_match_for_required_ci(): + assert wd._is_scheduled_context("CI / all-required") is False + + +# --------------------------------------------------------------------------- +# _entry_state +# --------------------------------------------------------------------------- + +def test_entry_state_prefers_status_over_state(): + """Gitea 1.22.6 per-entry key is `status`; `state` is fallback.""" + assert wd._entry_state({"status": "failure", "state": "success"}) == "failure" + + +def test_entry_state_falls_back_to_state(): + assert wd._entry_state({"state": "pending"}) == "pending" + + +def test_entry_state_empty_when_neither_key_present(): + assert wd._entry_state({"context": "foo"}) == "" + + +# --------------------------------------------------------------------------- +# is_red +# --------------------------------------------------------------------------- + +def test_is_red_combined_failure_no_statuses(): + """Combined failure with empty statuses[] still trips red.""" + red, failed = wd.is_red({"state": "failure", "statuses": []}) + assert red is True + assert failed == [] + + +def test_is_red_cancel_cascade_filtered(): + """status=3 (cancelled) mapped to failure string must be filtered.""" + status = { + "state": "failure", + "statuses": [ + {"context": "CI / build", "status": "failure", "description": "Has been cancelled"}, + ], + } + red, failed = wd.is_red(status) + assert red is False + assert failed == [] + + +def test_is_red_real_failure_not_filtered(): + """Real failures with different descriptions are kept.""" + status = { + "state": "failure", + "statuses": [ + {"context": "CI / build", "status": "failure", "description": "Failing after 12s"}, + ], + } + red, failed = wd.is_red(status) + assert red is True + assert len(failed) == 1 + assert failed[0]["context"] == "CI / build" + + +def test_is_red_uses_entry_state_not_top_level_state(): + """Regression: per-entry key is `status`, not `state`.""" + status = { + "state": "failure", + "statuses": [ + # Only `status` present; pre-rev4 code read `state` and got None + {"context": "CI / test", "status": "failure"}, + ], + } + red, failed = wd.is_red(status) + assert red is True + assert len(failed) == 1 + + +# --------------------------------------------------------------------------- +# list_open_red_issues — pagination (mc#1789) +# --------------------------------------------------------------------------- + +def test_list_open_red_issues_exhausts_pagination(): + """Backlog can exceed 50 issues; all pages must be fetched.""" + calls = [] + + def fake_api(method, path, **kwargs): + calls.append((method, path, kwargs)) + query = (kwargs.get("query") or {}) + page = int(query.get("page", "1")) + limit = int(query.get("limit", "50")) + # Page 1 returns full limit; page 2 returns partial → break + if page == 1: + return 200, [ + {"title": f"[main-red] molecule-ai/molecule-core: sha{i:04d}"} + for i in range(limit) + ] + if page == 2: + return 200, [ + {"title": "[main-red] molecule-ai/molecule-core: extra1"}, + {"title": "[main-red] molecule-ai/molecule-core: extra2"}, + {"title": " unrelated issue "}, # filtered out + ] + return 200, [] + + with patch.object(wd, "api", side_effect=fake_api): + issues = wd.list_open_red_issues() + + assert len(issues) == 52 # 50 + 2 matched + titles = {i["title"] for i in issues} + assert "[main-red] molecule-ai/molecule-core: extra1" in titles + assert "[main-red] molecule-ai/molecule-core: extra2" in titles + + +def test_list_open_red_issues_single_page(): + """When results < limit, loop breaks after first page.""" + def fake_api(method, path, **kwargs): + return 200, [ + {"title": "[main-red] molecule-ai/molecule-core: abc123"}, + ] + + with patch.object(wd, "api", side_effect=fake_api): + issues = wd.list_open_red_issues() + + assert len(issues) == 1 + + +# --------------------------------------------------------------------------- +# run_once — close logic (mc#1789) +# --------------------------------------------------------------------------- + +def test_run_once_green_closes_stale_issues(monkeypatch): + """Combined success → close stale issues.""" + monkeypatch.setattr(wd, "get_head_sha", lambda b: "abc123") + monkeypatch.setattr(wd, "get_combined_status", lambda s: {"state": "success", "statuses": []}) + monkeypatch.setattr(wd, "is_red", lambda s: (False, [])) + + closed = [] + + def capture_close(current_sha, *, dry_run=False, close_same_sha=False): + closed.append(current_sha) + return 1 + + monkeypatch.setattr(wd, "close_open_red_issues_for_other_shas", capture_close) + monkeypatch.setattr(wd, "emit_loki_event", lambda *a, **k: None) + + assert wd.run_once(dry_run=True) == 0 + assert closed == ["abc123"] + + +def test_run_once_pending_scheduled_only_closes_stale_issues(monkeypatch): + """Combined pending, but only scheduled contexts pending → close stale.""" + monkeypatch.setattr(wd, "get_head_sha", lambda b: "abc123") + monkeypatch.setattr( + wd, "get_combined_status", + lambda s: { + "state": "pending", + "statuses": [ + {"context": "CI / all-required", "status": "success"}, + {"context": "Staging SaaS smoke", "status": "pending"}, + ], + } + ) + monkeypatch.setattr(wd, "is_red", lambda s: (False, [])) + + closed = [] + + def capture_close(current_sha, *, dry_run=False, close_same_sha=False): + closed.append(current_sha) + return 1 + + monkeypatch.setattr(wd, "close_open_red_issues_for_other_shas", capture_close) + monkeypatch.setattr(wd, "emit_loki_event", lambda *a, **k: None) + + assert wd.run_once(dry_run=True) == 0 + assert closed == ["abc123"] + + +def test_run_once_pending_required_does_not_close(monkeypatch): + """Combined pending with a real required context still pending → no close.""" + monkeypatch.setattr(wd, "get_head_sha", lambda b: "abc123") + monkeypatch.setattr( + wd, "get_combined_status", + lambda s: { + "state": "pending", + "statuses": [ + {"context": "CI / all-required", "status": "pending"}, + {"context": "Staging SaaS smoke", "status": "success"}, + ], + } + ) + monkeypatch.setattr(wd, "is_red", lambda s: (False, [])) + + closed = [] + + def capture_close(current_sha, *, dry_run=False, close_same_sha=False): + closed.append(current_sha) + return 0 + + monkeypatch.setattr(wd, "close_open_red_issues_for_other_shas", capture_close) + monkeypatch.setattr(wd, "emit_loki_event", lambda *a, **k: None) + + assert wd.run_once(dry_run=True) == 0 + assert closed == [] + + +def test_run_once_failure_does_not_close(monkeypatch): + """Real failure in non-scheduled context → no close.""" + monkeypatch.setattr(wd, "get_head_sha", lambda b: "abc123") + monkeypatch.setattr( + wd, "get_combined_status", + lambda s: { + "state": "failure", + "statuses": [ + {"context": "CI / all-required", "status": "failure"}, + ], + } + ) + # is_red will return True, so we enter the red path, not the green close path + monkeypatch.setattr(wd, "is_red", lambda s: (True, s.get("statuses", []))) + monkeypatch.setattr(wd, "time", MagicMock(sleep=lambda x: None)) + monkeypatch.setattr(wd, "emit_loki_event", lambda *a, **k: None) + + filed = [] + + def capture_file(sha, failed, debug, *, dry_run=False): + filed.append(sha) + + monkeypatch.setattr(wd, "file_or_update_red", capture_file) + monkeypatch.setattr(wd, "close_open_red_issues_for_other_shas", lambda *a, **k: 0) + + assert wd.run_once(dry_run=True) == 0 + assert filed == ["abc123"] + + +# --------------------------------------------------------------------------- +# title_for / find_open_issue_for_sha +# --------------------------------------------------------------------------- + +def test_title_for_uses_short_sha(): + assert wd.title_for("abcdef123456") == "[main-red] molecule-ai/molecule-core: abcdef1234" + + +def test_find_open_issue_for_sha_matches_exact_title(monkeypatch): + fake_issue = {"title": "[main-red] molecule-ai/molecule-core: abc1234567", "number": 42} + monkeypatch.setattr(wd, "list_open_red_issues", lambda: [fake_issue]) + assert wd.find_open_issue_for_sha("abc1234567") == fake_issue + + +def test_find_open_issue_for_sha_returns_none_when_no_match(monkeypatch): + monkeypatch.setattr(wd, "list_open_red_issues", lambda: []) + assert wd.find_open_issue_for_sha("abc123") is None -- 2.52.0