From 7fb66f473d22e33cb5c9234edb99d7ebe1536458 Mon Sep 17 00:00:00 2001 From: devops-engineer Date: Sat, 6 Jun 2026 01:23:48 -0700 Subject: [PATCH] fix(merge-queue): HOLD on persistent 409-conflict-on-update (HOL guard) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A queued PR whose branch-update hits a persistent HTTP 409 merge-conflict sat at the queue head and was retried every tick, never advancing to other ready PRs — head-of-line-blocking the whole autonomous merge queue. ~25 stale conflicted PRs clogged the queue this way. Treat a 409-conflict-on-update as a HOLD condition, parallel to the existing permission-error path (#2349): apply HOLD_LABEL and advance to the next queued PR. A merge-conflict is not transient — it needs a human/agent rebase — so hold-and-advance immediately. This is distinct from mergeable=None (Gitea still computing conflict state), which remains a transient WAIT with no hold. - New BranchUpdateConflictError (subclass of ApiError); update_pull re-raises on an explicit "-> HTTP 409" status token (matched precisely, NOT a bare "409" substring — the PR number/path can contain 409, e.g. /pulls/1409/update). - process_once update-branch catches it, HOLDs the PR, advances. Fail-closed: a held PR is skipped, never merged; it stays open with the hold label. - Extract shared hold_pr() helper; reuse it in the merge-permission path. Regression tests (per §SOP-22): 409-on-update -> PR held + queue advances to the next ready PR (does not stall); update_pull raises the conflict subclass on 409 but re-raises non-409 (e.g. 500) as plain ApiError; PR-number-in-path does not false-trigger. 26 existing tests stay green (31 total in this module). Fixes #2352 Co-Authored-By: Claude Opus 4.8 (1M context) --- .gitea/scripts/gitea-merge-queue.py | 106 +++++++++--- .../scripts/tests/test_gitea_merge_queue.py | 163 ++++++++++++++++++ 2 files changed, 248 insertions(+), 21 deletions(-) diff --git a/.gitea/scripts/gitea-merge-queue.py b/.gitea/scripts/gitea-merge-queue.py index 3aa0edf64..5b494d7bd 100644 --- a/.gitea/scripts/gitea-merge-queue.py +++ b/.gitea/scripts/gitea-merge-queue.py @@ -31,7 +31,11 @@ Authoritative gates (fail-closed): Head-of-line (HOL) safety: a permanent permission/4xx merge error (403/404/405) HOLDS the PR (applies HOLD_LABEL) so the queue advances to the -next PR instead of re-selecting the same wedged PR every tick. +next PR instead of re-selecting the same wedged PR every tick. Likewise, a +persistent branch-update conflict (the /update endpoint returns HTTP 409 +because the PR branch cannot be merged with main without manual rebase) HOLDS +the PR — a conflict will not self-resolve, so retrying it every tick would +HOL-block every ready PR behind it (issue #2352). Status-fetch is fail-closed: if the combined status for a sha cannot be fetched, the PR is skipped this tick (never treated as green). @@ -112,6 +116,20 @@ class MergePermissionError(ApiError): The queue should HOLD this PR and move to the next one.""" +class BranchUpdateConflictError(ApiError): + """Updating the PR branch with the base hit a merge-conflict (HTTP 409). + + A true merge-conflict is NOT transient: the branch cannot be auto-updated + until a human/agent rebases it. The queue should HOLD this PR (apply + HOLD_LABEL) and advance to the next candidate, exactly like the permission + path — otherwise the conflicted PR sits at the queue head and is retried + every tick forever, head-of-line-blocking every ready PR behind it. + + NOTE: distinct from mergeable=None, which is Gitea STILL COMPUTING conflict + state — that case is handled as a transient WAIT (no hold). This error is + only raised on an explicit 409 returned by the /update endpoint.""" + + class BranchProtectionUnavailable(ApiError): """Branch protection (the authoritative required-context source) could not be enumerated. The queue must HOLD rather than merge with an unverified @@ -584,12 +602,24 @@ def update_pull(pr_number: int, *, dry_run: bool) -> None: print(f"::notice::updating PR #{pr_number} with base branch via style={UPDATE_STYLE}") if dry_run: return - api( - "POST", - f"/repos/{OWNER}/{NAME}/pulls/{pr_number}/update", - query={"style": UPDATE_STYLE}, - expect_json=False, - ) + try: + api( + "POST", + f"/repos/{OWNER}/{NAME}/pulls/{pr_number}/update", + query={"style": UPDATE_STYLE}, + expect_json=False, + ) + except ApiError as exc: + # Gitea returns HTTP 409 when the base cannot be merged into the PR + # branch because of a real conflict. The queue cannot auto-resolve a + # conflict, so re-raise as BranchUpdateConflictError; process_once HOLDs + # the PR and advances (HOL guard) instead of retrying it forever. + # Match the HTTP STATUS token ("-> HTTP 409") specifically, not a bare + # "409" substring — the PR number or path can itself contain "409" + # (e.g. /pulls/1409/update) and must not be misread as a conflict. + if "-> HTTP 409" in str(exc): + raise BranchUpdateConflictError(str(exc)) from exc + raise # re-raise other ApiErrors unchanged def add_label_by_name(pr_number: int, label_name: str, *, dry_run: bool) -> None: @@ -618,6 +648,29 @@ def add_label_by_name(pr_number: int, label_name: str, *, dry_run: bool) -> None ) +def hold_pr(pr_number: int, hold_note: str, *, dry_run: bool) -> None: + """Apply HOLD_LABEL to a wedged PR so the queue advances past it. + + choose_next_queued_issue skips HOLD_LABEL-bearing PRs, so this is the HOL + guard: a PR the queue cannot make progress on (permanent permission error + or unresolvable branch-update conflict) is held and a human/agent fixes it, + rather than the queue re-selecting it every tick forever. If the label + cannot be applied we still post the explanatory comment so the wedge is at + least visible — but we never loop on the PR. + """ + try: + add_label_by_name(pr_number, HOLD_LABEL, dry_run=dry_run) + except ApiError as label_exc: + sys.stderr.write( + f"::error::could not apply HOLD_LABEL to PR #{pr_number}: {label_exc}\n" + ) + hold_note += ( + f"\n\n(NOTE: could not apply the hold label automatically: " + f"{label_exc}. Please add `{HOLD_LABEL}` manually.)" + ) + post_comment(pr_number, hold_note, dry_run=dry_run) + + def merge_pull(pr_number: int, *, dry_run: bool, force: bool = False) -> None: payload: dict[str, Any] = { "Do": "merge", @@ -737,7 +790,30 @@ def process_once(*, dry_run: bool = False) -> int: print(f"::notice::PR #{pr_number} decision={decision.action}: {decision.reason}") if decision.action == "update": - update_pull(pr_number, dry_run=dry_run) + try: + update_pull(pr_number, dry_run=dry_run) + except BranchUpdateConflictError as exc: + # The branch cannot be updated with main because of a real conflict + # (HTTP 409). This is the HOL fix for issue #2352: previously the + # 409 propagated to main() and the tick exited 0 with the PR still + # queued, so the NEXT tick re-selected the SAME conflicted PR and + # retried the failing update forever — head-of-line-blocking every + # ready PR behind it. A conflict will not self-resolve; it needs a + # human/agent rebase. So HOLD this PR (HOL guard) and advance to the + # next candidate. Fail-closed: a held PR is skipped, never merged. + sys.stderr.write( + f"::error::branch-update conflict for PR #{pr_number}: {exc}\n" + ) + hold_note = ( + "merge-queue: could not update this branch with " + f"`{WATCH_BRANCH}` — the update returned a merge conflict " + f"(HTTP 409) that the queue cannot auto-resolve ({exc}). " + f"Applied `{HOLD_LABEL}` to unblock the queue (HOL guard). " + f"Fix: rebase/merge `{WATCH_BRANCH}` into this branch and " + f"resolve the conflicts, then remove `{HOLD_LABEL}` to requeue." + ) + hold_pr(pr_number, hold_note, dry_run=dry_run) + return 0 post_comment( pr_number, ( @@ -773,19 +849,7 @@ def process_once(*, dry_run: bool = False) -> int: f"Fix: grant Can-merge to the queue token, then remove " f"`{HOLD_LABEL}` to requeue." ) - try: - add_label_by_name(pr_number, HOLD_LABEL, dry_run=dry_run) - except ApiError as label_exc: - # If we cannot even apply the hold label, fall back to a comment - # so the wedge is at least visible; do NOT loop on this PR. - sys.stderr.write( - f"::error::could not apply HOLD_LABEL to PR #{pr_number}: {label_exc}\n" - ) - hold_note += ( - f"\n\n(NOTE: could not apply the hold label automatically: " - f"{label_exc}. Please add `{HOLD_LABEL}` manually.)" - ) - post_comment(pr_number, hold_note, dry_run=dry_run) + hold_pr(pr_number, hold_note, dry_run=dry_run) return 0 return 0 return 0 diff --git a/.gitea/scripts/tests/test_gitea_merge_queue.py b/.gitea/scripts/tests/test_gitea_merge_queue.py index 440f90dfe..ccac256f6 100644 --- a/.gitea/scripts/tests/test_gitea_merge_queue.py +++ b/.gitea/scripts/tests/test_gitea_merge_queue.py @@ -539,3 +539,166 @@ def test_process_once_holds_tick_when_branch_protection_unavailable(monkeypatch) rc = mq.process_once(dry_run=False) assert rc == 0 assert merged["called"] is False + + +# -------------------------------------------------------------------------- +# Fix 4 (issue #2352): a persistent 409-conflict-on-update HOLDS the PR and +# the queue ADVANCES — it does not retry the conflicted PR forever (HOL). +# -------------------------------------------------------------------------- + +def test_BranchUpdateConflictError_inherits_from_ApiError(): + assert issubclass(mq.BranchUpdateConflictError, mq.ApiError) + + +def test_update_pull_raises_conflict_error_on_409(monkeypatch): + """A 409 from the /update endpoint becomes BranchUpdateConflictError so + process_once can HOLD-and-advance rather than letting it propagate as a + plain ApiError (which would leave the PR queued and re-selectable).""" + monkeypatch.setattr(mq, "OWNER", "molecule-ai") + monkeypatch.setattr(mq, "NAME", "molecule-core") + + def fake_api(method, path, **kwargs): + raise mq.ApiError("POST /pulls/1409/update -> HTTP 409: merge conflict") + monkeypatch.setattr(mq, "api", fake_api) + + import pytest + with pytest.raises(mq.BranchUpdateConflictError) as exc_info: + mq.update_pull(1409, dry_run=False) + assert "409" in str(exc_info.value) + + +def test_update_pull_reraises_non_409_errors(monkeypatch): + """Non-409 update failures (e.g. 500) are NOT conflicts; they must NOT be + swallowed as a hold — they re-raise as the original ApiError so the tick is + a transient no-op and the PR is retried next tick.""" + monkeypatch.setattr(mq, "OWNER", "molecule-ai") + monkeypatch.setattr(mq, "NAME", "molecule-core") + + def fake_api(method, path, **kwargs): + raise mq.ApiError("POST /pulls/1409/update -> HTTP 500: server error") + monkeypatch.setattr(mq, "api", fake_api) + + import pytest + with pytest.raises(mq.ApiError) as exc_info: + mq.update_pull(1409, dry_run=False) + # Re-raised as plain ApiError, NOT the conflict subclass. + assert not isinstance(exc_info.value, mq.BranchUpdateConflictError) + assert "500" in str(exc_info.value) + + +def _stale_pr_update_409_monkeypatch(monkeypatch, queued_issues, calls): + """Wire process_once so the selected PR needs an update (head does NOT + contain main) and the /update call returns a 409 conflict. Everything else + is green. Records merge attempts and the applied hold label in `calls`.""" + monkeypatch.setattr(mq, "OWNER", "molecule-ai") + monkeypatch.setattr(mq, "NAME", "molecule-core") + monkeypatch.setattr(mq, "WATCH_BRANCH", "main") + monkeypatch.setattr(mq, "QUEUE_LABEL", "merge-queue") + monkeypatch.setattr(mq, "HOLD_LABEL", "merge-queue-hold") + monkeypatch.setattr(mq, "REVIEWER_SET", REVIEWERS) + monkeypatch.setattr(mq, "get_branch_protection", lambda branch: mq.BranchProtection( + required_contexts=["CI / all-required (pull_request)"], + required_approvals=2, + block_on_rejected_reviews=True, + )) + main_sha = "b" * 40 + head_sha = "a" * 40 + monkeypatch.setattr(mq, "get_branch_head", lambda branch: main_sha) + + def fake_combined(sha): + ctx = "CI / all-required (push)" if sha == main_sha else "CI / all-required (pull_request)" + return {"state": "success", "statuses": [{"context": ctx, "status": "success"}]} + monkeypatch.setattr(mq, "get_combined_status", fake_combined) + + monkeypatch.setattr(mq, "list_queued_issues", lambda: queued_issues) + monkeypatch.setattr(mq, "get_pull", lambda n: { + "state": "open", "number": n, "mergeable": True, + "base": {"ref": "main", "repo_id": 1}, + "head": {"sha": head_sha, "repo_id": 1}, + "labels": [{"name": "merge-queue"}], + }) + # NOTE: commits do NOT contain main_sha → pr_has_current_base is False → + # decision.action == "update". + monkeypatch.setattr(mq, "get_pull_commits", lambda n: [{"sha": head_sha}]) + monkeypatch.setattr(mq, "get_pull_reviews", lambda n: [ + {"state": "APPROVED", "user": {"login": "agent-researcher"}, + "official": True, "stale": False, "dismissed": False, "commit_id": head_sha}, + {"state": "APPROVED", "user": {"login": "agent-reviewer-cr2"}, + "official": True, "stale": False, "dismissed": False, "commit_id": head_sha}, + ]) + + def fake_update(pr_number, *, dry_run): + calls["update_attempts"] += 1 + raise mq.BranchUpdateConflictError( + "POST /pulls/%d/update -> HTTP 409: merge conflict" % pr_number + ) + monkeypatch.setattr(mq, "update_pull", fake_update) + + def fake_merge(pr_number, *, dry_run, force=False): + calls["merge_attempts"] += 1 + monkeypatch.setattr(mq, "merge_pull", fake_merge) + + def fake_add_label(pr_number, label_name, *, dry_run): + calls["hold_label"] = (pr_number, label_name) + monkeypatch.setattr(mq, "add_label_by_name", fake_add_label) + monkeypatch.setattr(mq, "post_comment", lambda *a, **k: None) + + +def test_process_once_holds_pr_on_409_conflict_on_update(monkeypatch): + """The #2352 regression: a queued PR whose /update returns 409 must get the + HOLD_LABEL (so the queue advances) and must NOT be merged. Without the fix + the 409 propagated, the PR stayed queued, and the next tick re-selected the + SAME conflicted PR forever (head-of-line block).""" + calls = {"update_attempts": 0, "merge_attempts": 0, "hold_label": None} + _stale_pr_update_409_monkeypatch( + monkeypatch, + queued_issues=[ + {"number": 1409, "pull_request": {}, "labels": [{"name": "merge-queue"}], + "created_at": "2026-06-01T00:00:00Z"}, + ], + calls=calls, + ) + + rc = mq.process_once(dry_run=False) + + assert rc == 0 + assert calls["update_attempts"] == 1 + # Held, not merged — fail-closed. + assert calls["hold_label"] == (1409, "merge-queue-hold") + assert calls["merge_attempts"] == 0 + + +def test_queue_advances_past_held_conflicted_pr(monkeypatch): + """End-to-end HOL proof for #2352: PR #1409 (oldest) hits a 409-on-update + and is held; on the NEXT tick choose_next_queued_issue must SKIP the held + PR and select the next ready PR (#1500) instead of stalling on #1409.""" + calls = {"update_attempts": 0, "merge_attempts": 0, "hold_label": None} + conflicted = {"number": 1409, "pull_request": {}, + "labels": [{"name": "merge-queue"}], + "created_at": "2026-06-01T00:00:00Z"} + next_ready = {"number": 1500, "pull_request": {}, + "labels": [{"name": "merge-queue"}], + "created_at": "2026-06-02T00:00:00Z"} + _stale_pr_update_409_monkeypatch( + monkeypatch, + queued_issues=[conflicted, next_ready], + calls=calls, + ) + + # Tick 1: oldest (#1409) is selected, 409-on-update → held. + rc = mq.process_once(dry_run=False) + assert rc == 0 + assert calls["hold_label"] == (1409, "merge-queue-hold") + + # Simulate the label now present on #1409 (as the real hold would persist). + conflicted["labels"] = [{"name": "merge-queue"}, {"name": "merge-queue-hold"}] + + # Tick 2: the queue must ADVANCE — choose_next_queued_issue skips the held + # #1409 and selects the next ready candidate #1500, NOT re-select #1409. + selected = mq.choose_next_queued_issue( + [conflicted, next_ready], + queue_label="merge-queue", + hold_label="merge-queue-hold", + ) + assert selected is not None + assert selected["number"] == 1500 -- 2.52.0