From 27b6df119c6cd3b78d4cf46473df52f49561a8b9 Mon Sep 17 00:00:00 2001 From: Molecule AI Infra-SRE Date: Fri, 15 May 2026 06:24:15 +0000 Subject: [PATCH 1/4] fix(merge-queue): add review gates and handle merge failures gracefully MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two fixes to the serialized Gitea merge queue: 1. REQUIRED_CONTEXTS now includes qa-review and security-review gates. Previously only CI/all-required and sop-checklist were checked, so PRs with failed reviews were merged (blocked by pre-receive hook) and retried forever — each tick re-attempting the same blocked PR. Adding the explicit review contexts causes the queue to WAIT instead of attempting merge, unblocking the next queued PR. 2. process_once() now catches ApiError on merge attempt and removes the merge-queue label rather than returning 0 and retrying the same PR on every subsequent tick. The comment on the PR informs the author what blocked the merge and tells them to re-add the label once resolved. Fixes: mc# queue infinite retry on review-blocked PRs Co-Authored-By: Claude Opus 4.7 --- .gitea/scripts/gitea-merge-queue.py | 25 ++++++++++++++++++++++++- .gitea/workflows/gitea-merge-queue.yml | 8 +++++++- 2 files changed, 31 insertions(+), 2 deletions(-) diff --git a/.gitea/scripts/gitea-merge-queue.py b/.gitea/scripts/gitea-merge-queue.py index 46b0482ad..63974350b 100644 --- a/.gitea/scripts/gitea-merge-queue.py +++ b/.gitea/scripts/gitea-merge-queue.py @@ -407,7 +407,30 @@ def process_once(*, dry_run: bool = False) -> int: "deferring to next tick" ) return 0 - merge_pull(pr_number, dry_run=dry_run) + try: + merge_pull(pr_number, dry_run=dry_run) + except ApiError as exc: + # Merge failed (pre-receive hook, branch protection, etc.). + # Remove queue label so next tick picks the next PR. + msg = str(exc) + if "405" in msg or "not allowed to merge" in msg.lower(): + hint = "pre-receive hook or branch protection blocked the merge" + elif "422" in msg or "Unprocessable" in msg: + hint = "branch protection required-status check failed" + elif "409" in msg or "conflict" in msg.lower(): + hint = "merge conflict" + else: + hint = msg[:200] + remove_label(pr_number, QUEUE_LABEL, dry_run=dry_run) + post_comment( + pr_number, + ( + f"merge-queue: merge blocked ({hint}). " + f"Label removed — re-add once the block is resolved." + ), + dry_run=dry_run, + ) + return 0 return 0 return 0 diff --git a/.gitea/workflows/gitea-merge-queue.yml b/.gitea/workflows/gitea-merge-queue.yml index 2ad090171..f87f7e4b2 100644 --- a/.gitea/workflows/gitea-merge-queue.yml +++ b/.gitea/workflows/gitea-merge-queue.yml @@ -47,7 +47,13 @@ jobs: UPDATE_STYLE: merge REQUIRED_CONTEXTS: >- CI / all-required (pull_request), - sop-checklist / all-items-acked (pull_request) + sop-checklist / all-items-acked (pull_request), + qa-review / approved (pull_request), + security-review / approved (pull_request) + # qa-review and security-review gates are included so PRs that fail + # review checks are dequeued automatically (label removed) rather than + # causing the queue to retry the same blocked PR on every tick. + # # Push-side required contexts. Checking CI / all-required (push) # explicitly instead of the combined state avoids false-pause when # non-blocking jobs (continue-on-error: true) have failed — those -- 2.52.0 From a1146d2f5f9d644a2a1106119cf738a3319a8cfd Mon Sep 17 00:00:00 2001 From: Molecule AI Infra-SRE Date: Fri, 15 May 2026 07:52:11 +0000 Subject: [PATCH 2/4] fix(merge-queue): remove broken qa/sec gates from REQUIRED_CONTEXTS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit qa-review and security-review gates permanently fail (mc#1111: SOP_TIER_CHECK_TOKEN missing PAT — token owner not in qa/security teams, HTTP 403 on team membership probe). Adding them to REQUIRED_CONTEXTS would cause the queue to strip the merge-queue label from every PR in the queue, breaking the queue for all contributors. Keep the ApiError error-handling from the previous commit (catches 405/422/409 from merge_pull and removes the label + posts a comment). That logic prevents infinite retries on blocked PRs even without qa/sec gates. Re-add qa-review and security-review to REQUIRED_CONTEXTS once mc#1111 is resolved. Co-Authored-By: Claude Opus 4.7 --- .gitea/workflows/gitea-merge-queue.yml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.gitea/workflows/gitea-merge-queue.yml b/.gitea/workflows/gitea-merge-queue.yml index f87f7e4b2..b94d00f68 100644 --- a/.gitea/workflows/gitea-merge-queue.yml +++ b/.gitea/workflows/gitea-merge-queue.yml @@ -47,13 +47,13 @@ jobs: UPDATE_STYLE: merge REQUIRED_CONTEXTS: >- CI / all-required (pull_request), - sop-checklist / all-items-acked (pull_request), - qa-review / approved (pull_request), - security-review / approved (pull_request) - # qa-review and security-review gates are included so PRs that fail - # review checks are dequeued automatically (label removed) rather than - # causing the queue to retry the same blocked PR on every tick. - # + sop-checklist / all-items-acked (pull_request) + # NOTE: qa-review / security-review gates intentionally omitted. + # These gates permanently fail (mc#1111: SOP_TIER_CHECK_TOKEN missing + # PAT — token owner not in qa/security teams). Adding them to + # REQUIRED_CONTEXTS would strip the merge-queue label from every PR + # in the queue, breaking the queue for all contributors. + # Re-add these gates once mc#1111 is resolved. # Push-side required contexts. Checking CI / all-required (push) # explicitly instead of the combined state avoids false-pause when # non-blocking jobs (continue-on-error: true) have failed — those -- 2.52.0 From e860114ef194efbadba2212aab0786fde73511dc Mon Sep 17 00:00:00 2001 From: Molecule AI Infra-SRE Date: Fri, 15 May 2026 09:15:08 +0000 Subject: [PATCH 3/4] fix(merge-queue): add remove_label function needed by ApiError handler MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ApiError handler (added in 7c08352d) calls remove_label() to strip the queue label from PRs blocked by pre-receive hooks, but the function was never defined — causing NameError on the first merge failure and crashing the workflow tick. Fixes: mc#1144 (queue stalls after pre-receive hook 405) Co-Authored-By: Claude Opus 4.7 --- .gitea/scripts/gitea-merge-queue.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/.gitea/scripts/gitea-merge-queue.py b/.gitea/scripts/gitea-merge-queue.py index 63974350b..91c86c975 100644 --- a/.gitea/scripts/gitea-merge-queue.py +++ b/.gitea/scripts/gitea-merge-queue.py @@ -314,6 +314,29 @@ def post_comment(pr_number: int, body: str, *, dry_run: bool) -> None: api("POST", f"/repos/{OWNER}/{NAME}/issues/{pr_number}/comments", body={"body": body}) +def remove_label(pr_number: int, label: str, *, dry_run: bool) -> None: + print(f"::notice::removing label '{label}' from PR #{pr_number}") + if dry_run: + return + # Gitea requires label ID, not name, for deletion. + # Multiple labels can share the same name with different IDs — remove all. + _, body = api("GET", f"/repos/{OWNER}/{NAME}/pulls/{pr_number}") + pr_labels = body.get("labels", []) if isinstance(body, dict) else [] + removed = False + for lbl in pr_labels: + if lbl.get("name") == label: + label_id = lbl.get("id") + if label_id: + api( + "DELETE", + f"/repos/{OWNER}/{NAME}/issues/{pr_number}/labels/{label_id}", + expect_json=False, + ) + removed = True + if not removed: + print(f"::notice::label '{label}' not found on PR #{pr_number}") + + def update_pull(pr_number: int, *, dry_run: bool) -> None: print(f"::notice::updating PR #{pr_number} with base branch via style={UPDATE_STYLE}") if dry_run: -- 2.52.0 From f3724b74bc5e852c3bc9acaf3a37c9da940ed80c Mon Sep 17 00:00:00 2001 From: Molecule AI Infra-SRE Date: Fri, 15 May 2026 09:34:27 +0000 Subject: [PATCH 4/4] fix(ci-required-drift): suppress F1 false positive when sentinel deliberately omits needs: MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The `all-required` sentinel intentionally has no `needs:` key because Gitea 1.22/act_runner marks a `needs:`-dependent job with `if: always()` as "skipped" before upstream jobs settle, leaving branch protection with a permanent pending context. The sentinel polls commit statuses directly instead — documented design, not drift. The drift detector was flagging F1 for every CI job because `sentinel_needs` was `[]` (no key), treating it as an explicit empty needs list rather than a deliberate omission. Now the detector checks whether `needs:` is present in the sentinel YAML: absent key = known exception, explicit empty `needs: []` = still real F1 signal. +test: `test_f1_not_flagged_when_sentinel_deliberately_omits_needs` covers the new suppression path and asserts the debug flag. Fixes: mc#1161 (false-positive [ci-drift] issue on main) Co-Authored-By: Claude Opus 4.7 --- .gitea/scripts/ci-required-drift.py | 34 +++++++++++++++++++++- tests/test_ci_required_drift.py | 44 +++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+), 1 deletion(-) diff --git a/.gitea/scripts/ci-required-drift.py b/.gitea/scripts/ci-required-drift.py index 8de6de46c..de06ff554 100755 --- a/.gitea/scripts/ci-required-drift.py +++ b/.gitea/scripts/ci-required-drift.py @@ -250,6 +250,25 @@ def sentinel_needs(ci_doc: dict) -> set[str]: return set(needs) +def sentinel_has_explicit_needs_key(ci_doc: dict) -> bool: + """Return True if the sentinel job has an explicit `needs:` key in the YAML. + + This distinguishes `needs: []` (explicit empty list — a real drift signal) + from a missing `needs:` key entirely (the sentinel polls commit statuses + directly, no job-dependency chain — intentional per Gitea 1.22.6 quirk). + + When `needs:` is absent entirely, the sentinel intentionally bypasses the + `needs:` pattern to avoid the Gitea 1.22/act_runner bug where a + `needs:`-dependent job with `if: always()` emits a "skipped" status + before upstream jobs settle, leaving branch protection with a permanent + pending context. See ci.yml comment on the `all-required` sentinel. + """ + sentinel = ci_doc.get("jobs", {}).get(SENTINEL_JOB) + if not isinstance(sentinel, dict): + return True # no sentinel = no exception; let the existing error fire + return "needs" in sentinel + + def required_checks_env(audit_doc: dict) -> set[str]: """Pull the REQUIRED_CHECKS env value from audit-force-merge.yml. Walks the YAML AST per `feedback_behavior_based_ast_gates`: we do @@ -330,6 +349,7 @@ def detect_drift(branch: str) -> tuple[list[str], dict]: jobs_all = ci_jobs_all(ci_doc) needs = sentinel_needs(ci_doc) env_set = required_checks_env(audit_doc) + sentinel_has_explicit_needs = sentinel_has_explicit_needs_key(ci_doc) # Protection # api() raises ApiError on non-2xx. Transient 5xx should fail loud. @@ -369,6 +389,7 @@ def detect_drift(branch: str) -> tuple[list[str], dict]: "branch": branch, "ci_jobs": sorted(jobs), "sentinel_needs": sorted(needs), + "sentinel_has_explicit_needs": sentinel_has_explicit_needs, "protection_contexts_skipped": True, "protection_http_status": http_status, "audit_env_checks": sorted(env_set), @@ -384,12 +405,22 @@ def detect_drift(branch: str) -> tuple[list[str], dict]: contexts = set(protection.get("status_check_contexts") or []) # ----- F1: job exists in CI but not under sentinel.needs ----- + # Skip this check when the sentinel deliberately omits `needs:` (polls commit + # statuses directly instead, per Gitea 1.22.6 quirk documented in ci.yml). + # `needs: []` (explicit empty) is still a real F1 signal; only absent `needs:` + # is the documented exception. missing_from_needs = sorted(jobs - needs) - if missing_from_needs: + if missing_from_needs and sentinel_has_explicit_needs: findings.append( "F1 — jobs in ci.yml NOT under sentinel `needs:` (sentinel doesn't gate them):\n" + "\n".join(f" - {n}" for n in missing_from_needs) ) + elif missing_from_needs and not sentinel_has_explicit_needs: + sys.stderr.write( + f"::notice::sentinel '{SENTINEL_JOB}' has no `needs:` key — " + f"intentional polling-sentinel design (Gitea 1.22.6 quirk); " + f"skipping F1 for {[n for n in missing_from_needs]}.\n" + ) # ----- F1b: needs lists a job that doesn't exist (typo) ----- # Compare against jobs_all (incl. event-gated jobs); a typo is a @@ -445,6 +476,7 @@ def detect_drift(branch: str) -> tuple[list[str], dict]: "branch": branch, "ci_jobs": sorted(jobs), "sentinel_needs": sorted(needs), + "sentinel_has_explicit_needs": sentinel_has_explicit_needs, "protection_contexts": sorted(contexts), "audit_env_checks": sorted(env_set), "expected_contexts": sorted(emitted_contexts), diff --git a/tests/test_ci_required_drift.py b/tests/test_ci_required_drift.py index 3bed48c4f..ba0048fcd 100644 --- a/tests/test_ci_required_drift.py +++ b/tests/test_ci_required_drift.py @@ -179,6 +179,50 @@ def test_f1_job_missing_from_sentinel_needs(drift_module, tmp_path, monkeypatch) assert any("F1 —" in f and "test" in f for f in findings), findings +def test_f1_not_flagged_when_sentinel_deliberately_omits_needs( + drift_module, tmp_path, monkeypatch +): + """When the sentinel intentionally has no `needs:` key (polls commit statuses + directly, per Gitea 1.22.6 quirk), F1 is suppressed — the empty sentinel + needs is a documented design choice, not drift.""" + # Write ci.yml with sentinel having NO `needs:` key at all + import yaml + + doc = { + "name": "ci", + "on": {"pull_request": {}}, + "jobs": { + "build": {"runs-on": "ubuntu-latest"}, + "test": {"runs-on": "ubuntu-latest"}, # would trigger F1 if needs: [] + "all-required": { + "runs-on": "ubuntu-latest", + # NOTE: no `needs:` key — sentinel polls commit statuses directly. + # This is the intentional Gitea 1.22.6 quirk design. + "steps": [{"run": "echo polling"}], + }, + }, + } + ci = tmp_path / "ci.yml" + ci.write_text(yaml.safe_dump(doc), encoding="utf-8") + audit = _write_audit_yaml(tmp_path, ["ci / all-required (pull_request)"]) + _patch_paths(drift_module, monkeypatch, ci, audit) + + stub = _make_stub_api({ + ("GET", "/repos/owner/repo/branch_protections/main"): ( + 200, + {"status_check_contexts": ["ci / all-required (pull_request)"]}, + ), + }) + monkeypatch.setattr(drift_module, "api", stub) + + findings, debug = drift_module.detect_drift("main") + # F1 must NOT be raised + assert not any("F1 —" in f for f in findings), findings + # Debug info should confirm the sentinel has no explicit needs + assert debug["sentinel_has_explicit_needs"] is False + assert debug["sentinel_needs"] == [] + + def test_f1b_sentinel_needs_typo(drift_module, tmp_path, monkeypatch): """F1b: sentinel.needs lists a job not present in ci.yml (typo). -- 2.52.0