From c272eeae94c563f41c69d4e3463ca36f1ed1ce6b Mon Sep 17 00:00:00 2001
From: "Molecule AI Dev Engineer A (Kimi)"
Date: Wed, 27 May 2026 11:06:06 +0000
Subject: [PATCH 1/2] watchdog: close stale [main-red] issues when contexts
 recover on red (mc#1789)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When main stays red across consecutive SHAs for *different* causes,
close_open_red_issues_for_other_shas never fires (it only runs when
main is green). This leaves stale issues open indefinitely — e.g. #1936
(E2E Chat failure) stayed open even though current HEAD is red for a
different reason (E2E Legacy Advisory).

Add close_stale_red_issues():
1. List all open [main-red] issues.
2. For each issue on an OLD SHA, query that SHA's commit status.
3. Compare the old failed contexts against current HEAD.
4. If ALL failed contexts have recovered (success or absent), close
   the issue with a comment pointing to the current [main-red] issue.
5. If the old SHA is itself now green, close it too.
6. Skip issues with combined-red-no-detail, or with a failed entry
   that has no context name (can't verify recovery).

Called from run_once() after file_or_update_red() on the red path.
Emits a main_red_stale_closed Loki event when issues are closed.

Co-Authored-By: Claude Opus 4.7
---
 Review notes (ignored by git-am): newlines and indentation in this
 patch were restored by hand, and close_stale_red_issues() was revised,
 so the post-image blob id on the `index` line is stale. If context
 indentation does not match the tree, apply with --ignore-whitespace.

 .gitea/scripts/main-red-watchdog.py | 158 ++++++++++++++++++++++++++++
 1 file changed, 158 insertions(+)

diff --git a/.gitea/scripts/main-red-watchdog.py b/.gitea/scripts/main-red-watchdog.py
index c91310308..bbab7cd84 100755
--- a/.gitea/scripts/main-red-watchdog.py
+++ b/.gitea/scripts/main-red-watchdog.py
@@ -605,6 +605,157 @@ def file_or_update_red(
         sys.stderr.write(f"::warning::label '{RED_LABEL}' not found on repo\n")
 
 
+def _close_red_issue(num: int, comment: str) -> None:
+    """Post `comment` on issue `num`, then mark that issue closed."""
+    api(
+        "POST",
+        f"/repos/{OWNER}/{NAME}/issues/{num}/comments",
+        body={"body": comment},
+    )
+    api(
+        "PATCH",
+        f"/repos/{OWNER}/{NAME}/issues/{num}",
+        body={"state": "closed"},
+    )
+
+
+def _recovered_contexts(old_failed: list, current_by_ctx: dict) -> list:
+    """Return the old failing context names if ALL of them recovered.
+
+    A context counts as recovered when current HEAD reports it as
+    `success`, or no longer reports it at all (workflow removed /
+    renamed). Returns [] when recovery cannot be established: there is
+    no per-context detail, an entry has no usable context name, or a
+    context is present on current HEAD in any state but `success`.
+    """
+    recovered = []
+    for entry in old_failed:
+        ctx = entry.get("context", "") if isinstance(entry, dict) else ""
+        if not isinstance(ctx, str) or not ctx:
+            # Nameless failure: nothing to compare against, so the issue
+            # must stay open rather than be closed unverified.
+            return []
+        match = current_by_ctx.get(ctx)
+        if match is not None and _entry_state(match) != "success":
+            return []
+        recovered.append(ctx)
+    return recovered
+
+
+def close_stale_red_issues(
+    current_sha: str,
+    current_status: dict,
+    *,
+    dry_run: bool = False,
+) -> int:
+    """Close open [main-red] issues whose specific failing contexts have
+    all recovered on `current_sha`, even though `main` is still red for
+    other reasons (mc#1789).
+
+    When main stays red across consecutive SHAs for *different* causes,
+    `close_open_red_issues_for_other_shas` never fires (it only runs when
+    main is green). This function prevents stale issues from accumulating
+    indefinitely by comparing per-context recovery across SHAs.
+
+    An issue is considered stale when every context that was in a failed
+    state on the issue's SHA is now either `success` on the current HEAD
+    or absent (workflow removed / renamed). Issues whose original SHA had
+    a combined-red-with-no-detail (empty statuses list), or a failed
+    entry without a context name, are skipped — we cannot verify recovery
+    without per-context data. An issue whose own SHA is no longer red is
+    closed as well.
+
+    Args:
+        current_sha: SHA of the current HEAD; issues are matched against
+            its first 10 characters.
+        current_status: combined-status payload for `current_sha`; its
+            `statuses` list supplies the per-context states.
+        dry_run: when true, only print what would be closed; no comment
+            is posted and no issue is closed.
+
+    Returns the number of issues closed (under `dry_run`, the number that
+    would have been closed).
+    """
+    open_red = list_open_red_issues()
+    if not open_red:
+        return 0
+
+    # Index current HEAD's statuses by context once; setdefault keeps the
+    # first entry per context, same as a front-to-back scan would find.
+    current_by_ctx: dict = {}
+    for cs in current_status.get("statuses") or []:
+        if isinstance(cs, dict) and isinstance(cs.get("context"), str):
+            current_by_ctx.setdefault(cs["context"], cs)
+
+    prefix = f"{TITLE_PREFIX} {REPO}: "
+    head_short = current_sha[:10]
+    closed = 0
+
+    for issue in open_red:
+        title = issue.get("title", "")
+        if not title.startswith(prefix):
+            continue
+        short_sha = title[len(prefix):]
+        num = issue.get("number")
+        if not short_sha or short_sha == head_short:
+            continue
+        if not isinstance(num, int):
+            continue
+
+        # Query status for the old SHA. Short SHA should resolve; if it
+        # doesn't (GC'd, force-pushed, ambiguous), skip conservatively.
+        try:
+            old_status = get_combined_status(short_sha)
+        except ApiError:
+            continue
+
+        old_red, old_failed = is_red(old_status)
+        if not old_red:
+            # Open issue for a now-green SHA — nothing left to verify.
+            comment = (
+                f"Commit `{short_sha}` is no longer red. Closing as the "
+                f"failure context has recovered or expired."
+            )
+            dry_msg = (
+                f"::notice::[dry-run] would close issue #{num} "
+                f"({title}) — old SHA is now green"
+            )
+            done_msg = (
+                f"::notice::Closed stale main-red issue #{num} "
+                f"(old SHA {short_sha} is now green)"
+            )
+        else:
+            recovered_ctxs = _recovered_contexts(
+                old_failed or [], current_by_ctx
+            )
+            if not recovered_ctxs:
+                continue
+            comment = (
+                f"The failing contexts from this SHA (`{short_sha}`) have "
+                f"recovered on current HEAD `{head_short}`: "
+                f"{', '.join(recovered_ctxs)}. "
+                f"Main is still red for other reasons; see the current "
+                f"`[main-red]` issue for `{head_short}`."
+            )
+            dry_msg = (
+                f"::notice::[dry-run] would close stale issue #{num} "
+                f"({title}) — contexts recovered"
+            )
+            done_msg = (
+                f"::notice::Closed stale main-red issue #{num} "
+                f"(contexts recovered at {head_short})"
+            )
+
+        if dry_run:
+            print(dry_msg)
+        else:
+            _close_red_issue(num, comment)
+            print(done_msg)
+        closed += 1
+
+    return closed
+
+
 def close_open_red_issues_for_other_shas(
     current_sha: str,
     *,
@@ -775,6 +926,13 @@ def run_once(*, dry_run: bool = False) -> int:
         print(f"::warning::main is RED at {sha[:10]} on {WATCH_BRANCH}: "
               f"{len(failed)} failed context(s)")
         file_or_update_red(sha, failed, debug, dry_run=dry_run)
+        stale_closed = close_stale_red_issues(sha, recheck_status, dry_run=dry_run)
+        if stale_closed:
+            emit_loki_event("main_red_stale_closed", sha, [])
+            print(
+                f"::notice::Closed {stale_closed} stale main-red issue(s) "
+                f"whose contexts recovered at {sha[:10]}"
+            )
     else:
         # Green or pending-with-no-real-failures. Close stale issues
         # from earlier SHAs when required CI has recovered.
-- 
2.52.0

From 5f0a772f675bde39c3af66489a5d321c468fdc48 Mon Sep 17 00:00:00 2001
From: "Molecule AI Dev Engineer A (Kimi)"
Date: Wed, 27 May 2026 11:50:09 +0000
Subject: [PATCH 2/2] main-red-watchdog: add missing close_stale_red_issues
 mock in test

test_run_once_failure_does_not_close was not monkeypatching the new
close_stale_red_issues function, causing it to hit the real api()
helper and fail with URLError in CI.

Co-Authored-By: Claude Opus 4.7 --- .gitea/scripts/tests/test_main_red_watchdog.py | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitea/scripts/tests/test_main_red_watchdog.py b/.gitea/scripts/tests/test_main_red_watchdog.py index 7b03099de..d728a5b5e 100644 --- a/.gitea/scripts/tests/test_main_red_watchdog.py +++ b/.gitea/scripts/tests/test_main_red_watchdog.py @@ -258,6 +258,7 @@ def test_run_once_failure_does_not_close(monkeypatch): monkeypatch.setattr(wd, "file_or_update_red", capture_file) monkeypatch.setattr(wd, "close_open_red_issues_for_other_shas", lambda *a, **k: 0) + monkeypatch.setattr(wd, "close_stale_red_issues", lambda *a, **k: 0) assert wd.run_once(dry_run=True) == 0 assert filed == ["abc123"] -- 2.52.0