fix(merge-queue): HOLD on persistent 409-conflict-on-update (HOL guard) (#2352) #2354
@@ -31,7 +31,11 @@ Authoritative gates (fail-closed):
|
||||
|
||||
Head-of-line (HOL) safety: a permanent permission/4xx merge error
|
||||
(403/404/405) HOLDS the PR (applies HOLD_LABEL) so the queue advances to the
|
||||
next PR instead of re-selecting the same wedged PR every tick.
|
||||
next PR instead of re-selecting the same wedged PR every tick. Likewise, a
|
||||
persistent branch-update conflict (the /update endpoint returns HTTP 409
|
||||
because the PR branch cannot be merged with main without manual rebase) HOLDS
|
||||
the PR — a conflict will not self-resolve, so retrying it every tick would
|
||||
HOL-block every ready PR behind it (issue #2352).
|
||||
|
||||
Status-fetch is fail-closed: if the combined status for a sha cannot be
|
||||
fetched, the PR is skipped this tick (never treated as green).
|
||||
@@ -112,6 +116,20 @@ class MergePermissionError(ApiError):
|
||||
The queue should HOLD this PR and move to the next one."""
|
||||
|
||||
|
||||
class BranchUpdateConflictError(ApiError):
|
||||
"""Updating the PR branch with the base hit a merge-conflict (HTTP 409).
|
||||
|
||||
A true merge-conflict is NOT transient: the branch cannot be auto-updated
|
||||
until a human/agent rebases it. The queue should HOLD this PR (apply
|
||||
HOLD_LABEL) and advance to the next candidate, exactly like the permission
|
||||
path — otherwise the conflicted PR sits at the queue head and is retried
|
||||
every tick forever, head-of-line-blocking every ready PR behind it.
|
||||
|
||||
NOTE: distinct from mergeable=None, which is Gitea STILL COMPUTING conflict
|
||||
state — that case is handled as a transient WAIT (no hold). This error is
|
||||
only raised on an explicit 409 returned by the /update endpoint."""
|
||||
|
||||
|
||||
class BranchProtectionUnavailable(ApiError):
|
||||
"""Branch protection (the authoritative required-context source) could not
|
||||
be enumerated. The queue must HOLD rather than merge with an unverified
|
||||
@@ -584,12 +602,24 @@ def update_pull(pr_number: int, *, dry_run: bool) -> None:
|
||||
print(f"::notice::updating PR #{pr_number} with base branch via style={UPDATE_STYLE}")
|
||||
if dry_run:
|
||||
return
|
||||
api(
|
||||
"POST",
|
||||
f"/repos/{OWNER}/{NAME}/pulls/{pr_number}/update",
|
||||
query={"style": UPDATE_STYLE},
|
||||
expect_json=False,
|
||||
)
|
||||
try:
|
||||
api(
|
||||
"POST",
|
||||
f"/repos/{OWNER}/{NAME}/pulls/{pr_number}/update",
|
||||
query={"style": UPDATE_STYLE},
|
||||
expect_json=False,
|
||||
)
|
||||
except ApiError as exc:
|
||||
# Gitea returns HTTP 409 when the base cannot be merged into the PR
|
||||
# branch because of a real conflict. The queue cannot auto-resolve a
|
||||
# conflict, so re-raise as BranchUpdateConflictError; process_once HOLDs
|
||||
# the PR and advances (HOL guard) instead of retrying it forever.
|
||||
# Match the HTTP STATUS token ("-> HTTP 409") specifically, not a bare
|
||||
# "409" substring — the PR number or path can itself contain "409"
|
||||
# (e.g. /pulls/1409/update) and must not be misread as a conflict.
|
||||
if "-> HTTP 409" in str(exc):
|
||||
raise BranchUpdateConflictError(str(exc)) from exc
|
||||
raise # re-raise other ApiErrors unchanged
|
||||
|
||||
|
||||
def add_label_by_name(pr_number: int, label_name: str, *, dry_run: bool) -> None:
|
||||
@@ -618,6 +648,29 @@ def add_label_by_name(pr_number: int, label_name: str, *, dry_run: bool) -> None
|
||||
)
|
||||
|
||||
|
||||
def hold_pr(pr_number: int, hold_note: str, *, dry_run: bool) -> None:
|
||||
"""Apply HOLD_LABEL to a wedged PR so the queue advances past it.
|
||||
|
||||
choose_next_queued_issue skips HOLD_LABEL-bearing PRs, so this is the HOL
|
||||
guard: a PR the queue cannot make progress on (permanent permission error
|
||||
or unresolvable branch-update conflict) is held and a human/agent fixes it,
|
||||
rather than the queue re-selecting it every tick forever. If the label
|
||||
cannot be applied we still post the explanatory comment so the wedge is at
|
||||
least visible — but we never loop on the PR.
|
||||
"""
|
||||
try:
|
||||
add_label_by_name(pr_number, HOLD_LABEL, dry_run=dry_run)
|
||||
except ApiError as label_exc:
|
||||
sys.stderr.write(
|
||||
f"::error::could not apply HOLD_LABEL to PR #{pr_number}: {label_exc}\n"
|
||||
)
|
||||
hold_note += (
|
||||
f"\n\n(NOTE: could not apply the hold label automatically: "
|
||||
f"{label_exc}. Please add `{HOLD_LABEL}` manually.)"
|
||||
)
|
||||
post_comment(pr_number, hold_note, dry_run=dry_run)
|
||||
|
||||
|
||||
def merge_pull(pr_number: int, *, dry_run: bool, force: bool = False) -> None:
|
||||
payload: dict[str, Any] = {
|
||||
"Do": "merge",
|
||||
@@ -737,7 +790,30 @@ def process_once(*, dry_run: bool = False) -> int:
|
||||
|
||||
print(f"::notice::PR #{pr_number} decision={decision.action}: {decision.reason}")
|
||||
if decision.action == "update":
|
||||
update_pull(pr_number, dry_run=dry_run)
|
||||
try:
|
||||
update_pull(pr_number, dry_run=dry_run)
|
||||
except BranchUpdateConflictError as exc:
|
||||
# The branch cannot be updated with main because of a real conflict
|
||||
# (HTTP 409). This is the HOL fix for issue #2352: previously the
|
||||
# 409 propagated to main() and the tick exited 0 with the PR still
|
||||
# queued, so the NEXT tick re-selected the SAME conflicted PR and
|
||||
# retried the failing update forever — head-of-line-blocking every
|
||||
# ready PR behind it. A conflict will not self-resolve; it needs a
|
||||
# human/agent rebase. So HOLD this PR (HOL guard) and advance to the
|
||||
# next candidate. Fail-closed: a held PR is skipped, never merged.
|
||||
sys.stderr.write(
|
||||
f"::error::branch-update conflict for PR #{pr_number}: {exc}\n"
|
||||
)
|
||||
hold_note = (
|
||||
"merge-queue: could not update this branch with "
|
||||
f"`{WATCH_BRANCH}` — the update returned a merge conflict "
|
||||
f"(HTTP 409) that the queue cannot auto-resolve ({exc}). "
|
||||
f"Applied `{HOLD_LABEL}` to unblock the queue (HOL guard). "
|
||||
f"Fix: rebase/merge `{WATCH_BRANCH}` into this branch and "
|
||||
f"resolve the conflicts, then remove `{HOLD_LABEL}` to requeue."
|
||||
)
|
||||
hold_pr(pr_number, hold_note, dry_run=dry_run)
|
||||
return 0
|
||||
post_comment(
|
||||
pr_number,
|
||||
(
|
||||
@@ -773,19 +849,7 @@ def process_once(*, dry_run: bool = False) -> int:
|
||||
f"Fix: grant Can-merge to the queue token, then remove "
|
||||
f"`{HOLD_LABEL}` to requeue."
|
||||
)
|
||||
try:
|
||||
add_label_by_name(pr_number, HOLD_LABEL, dry_run=dry_run)
|
||||
except ApiError as label_exc:
|
||||
# If we cannot even apply the hold label, fall back to a comment
|
||||
# so the wedge is at least visible; do NOT loop on this PR.
|
||||
sys.stderr.write(
|
||||
f"::error::could not apply HOLD_LABEL to PR #{pr_number}: {label_exc}\n"
|
||||
)
|
||||
hold_note += (
|
||||
f"\n\n(NOTE: could not apply the hold label automatically: "
|
||||
f"{label_exc}. Please add `{HOLD_LABEL}` manually.)"
|
||||
)
|
||||
post_comment(pr_number, hold_note, dry_run=dry_run)
|
||||
hold_pr(pr_number, hold_note, dry_run=dry_run)
|
||||
return 0
|
||||
return 0
|
||||
return 0
|
||||
|
||||
@@ -539,3 +539,166 @@ def test_process_once_holds_tick_when_branch_protection_unavailable(monkeypatch)
|
||||
rc = mq.process_once(dry_run=False)
|
||||
assert rc == 0
|
||||
assert merged["called"] is False
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# Fix 4 (issue #2352): a persistent 409-conflict-on-update HOLDS the PR and
|
||||
# the queue ADVANCES — it does not retry the conflicted PR forever (HOL).
|
||||
# --------------------------------------------------------------------------
|
||||
|
||||
def test_BranchUpdateConflictError_inherits_from_ApiError():
|
||||
assert issubclass(mq.BranchUpdateConflictError, mq.ApiError)
|
||||
|
||||
|
||||
def test_update_pull_raises_conflict_error_on_409(monkeypatch):
|
||||
"""A 409 from the /update endpoint becomes BranchUpdateConflictError so
|
||||
process_once can HOLD-and-advance rather than letting it propagate as a
|
||||
plain ApiError (which would leave the PR queued and re-selectable)."""
|
||||
monkeypatch.setattr(mq, "OWNER", "molecule-ai")
|
||||
monkeypatch.setattr(mq, "NAME", "molecule-core")
|
||||
|
||||
def fake_api(method, path, **kwargs):
|
||||
raise mq.ApiError("POST /pulls/1409/update -> HTTP 409: merge conflict")
|
||||
monkeypatch.setattr(mq, "api", fake_api)
|
||||
|
||||
import pytest
|
||||
with pytest.raises(mq.BranchUpdateConflictError) as exc_info:
|
||||
mq.update_pull(1409, dry_run=False)
|
||||
assert "409" in str(exc_info.value)
|
||||
|
||||
|
||||
def test_update_pull_reraises_non_409_errors(monkeypatch):
|
||||
"""Non-409 update failures (e.g. 500) are NOT conflicts; they must NOT be
|
||||
swallowed as a hold — they re-raise as the original ApiError so the tick is
|
||||
a transient no-op and the PR is retried next tick."""
|
||||
monkeypatch.setattr(mq, "OWNER", "molecule-ai")
|
||||
monkeypatch.setattr(mq, "NAME", "molecule-core")
|
||||
|
||||
def fake_api(method, path, **kwargs):
|
||||
raise mq.ApiError("POST /pulls/1409/update -> HTTP 500: server error")
|
||||
monkeypatch.setattr(mq, "api", fake_api)
|
||||
|
||||
import pytest
|
||||
with pytest.raises(mq.ApiError) as exc_info:
|
||||
mq.update_pull(1409, dry_run=False)
|
||||
# Re-raised as plain ApiError, NOT the conflict subclass.
|
||||
assert not isinstance(exc_info.value, mq.BranchUpdateConflictError)
|
||||
assert "500" in str(exc_info.value)
|
||||
|
||||
|
||||
def _stale_pr_update_409_monkeypatch(monkeypatch, queued_issues, calls):
|
||||
"""Wire process_once so the selected PR needs an update (head does NOT
|
||||
contain main) and the /update call returns a 409 conflict. Everything else
|
||||
is green. Records merge attempts and the applied hold label in `calls`."""
|
||||
monkeypatch.setattr(mq, "OWNER", "molecule-ai")
|
||||
monkeypatch.setattr(mq, "NAME", "molecule-core")
|
||||
monkeypatch.setattr(mq, "WATCH_BRANCH", "main")
|
||||
monkeypatch.setattr(mq, "QUEUE_LABEL", "merge-queue")
|
||||
monkeypatch.setattr(mq, "HOLD_LABEL", "merge-queue-hold")
|
||||
monkeypatch.setattr(mq, "REVIEWER_SET", REVIEWERS)
|
||||
monkeypatch.setattr(mq, "get_branch_protection", lambda branch: mq.BranchProtection(
|
||||
required_contexts=["CI / all-required (pull_request)"],
|
||||
required_approvals=2,
|
||||
block_on_rejected_reviews=True,
|
||||
))
|
||||
main_sha = "b" * 40
|
||||
head_sha = "a" * 40
|
||||
monkeypatch.setattr(mq, "get_branch_head", lambda branch: main_sha)
|
||||
|
||||
def fake_combined(sha):
|
||||
ctx = "CI / all-required (push)" if sha == main_sha else "CI / all-required (pull_request)"
|
||||
return {"state": "success", "statuses": [{"context": ctx, "status": "success"}]}
|
||||
monkeypatch.setattr(mq, "get_combined_status", fake_combined)
|
||||
|
||||
monkeypatch.setattr(mq, "list_queued_issues", lambda: queued_issues)
|
||||
monkeypatch.setattr(mq, "get_pull", lambda n: {
|
||||
"state": "open", "number": n, "mergeable": True,
|
||||
"base": {"ref": "main", "repo_id": 1},
|
||||
"head": {"sha": head_sha, "repo_id": 1},
|
||||
"labels": [{"name": "merge-queue"}],
|
||||
})
|
||||
# NOTE: commits do NOT contain main_sha → pr_has_current_base is False →
|
||||
# decision.action == "update".
|
||||
monkeypatch.setattr(mq, "get_pull_commits", lambda n: [{"sha": head_sha}])
|
||||
monkeypatch.setattr(mq, "get_pull_reviews", lambda n: [
|
||||
{"state": "APPROVED", "user": {"login": "agent-researcher"},
|
||||
"official": True, "stale": False, "dismissed": False, "commit_id": head_sha},
|
||||
{"state": "APPROVED", "user": {"login": "agent-reviewer-cr2"},
|
||||
"official": True, "stale": False, "dismissed": False, "commit_id": head_sha},
|
||||
])
|
||||
|
||||
def fake_update(pr_number, *, dry_run):
|
||||
calls["update_attempts"] += 1
|
||||
raise mq.BranchUpdateConflictError(
|
||||
"POST /pulls/%d/update -> HTTP 409: merge conflict" % pr_number
|
||||
)
|
||||
monkeypatch.setattr(mq, "update_pull", fake_update)
|
||||
|
||||
def fake_merge(pr_number, *, dry_run, force=False):
|
||||
calls["merge_attempts"] += 1
|
||||
monkeypatch.setattr(mq, "merge_pull", fake_merge)
|
||||
|
||||
def fake_add_label(pr_number, label_name, *, dry_run):
|
||||
calls["hold_label"] = (pr_number, label_name)
|
||||
monkeypatch.setattr(mq, "add_label_by_name", fake_add_label)
|
||||
monkeypatch.setattr(mq, "post_comment", lambda *a, **k: None)
|
||||
|
||||
|
||||
def test_process_once_holds_pr_on_409_conflict_on_update(monkeypatch):
|
||||
"""The #2352 regression: a queued PR whose /update returns 409 must get the
|
||||
HOLD_LABEL (so the queue advances) and must NOT be merged. Without the fix
|
||||
the 409 propagated, the PR stayed queued, and the next tick re-selected the
|
||||
SAME conflicted PR forever (head-of-line block)."""
|
||||
calls = {"update_attempts": 0, "merge_attempts": 0, "hold_label": None}
|
||||
_stale_pr_update_409_monkeypatch(
|
||||
monkeypatch,
|
||||
queued_issues=[
|
||||
{"number": 1409, "pull_request": {}, "labels": [{"name": "merge-queue"}],
|
||||
"created_at": "2026-06-01T00:00:00Z"},
|
||||
],
|
||||
calls=calls,
|
||||
)
|
||||
|
||||
rc = mq.process_once(dry_run=False)
|
||||
|
||||
assert rc == 0
|
||||
assert calls["update_attempts"] == 1
|
||||
# Held, not merged — fail-closed.
|
||||
assert calls["hold_label"] == (1409, "merge-queue-hold")
|
||||
assert calls["merge_attempts"] == 0
|
||||
|
||||
|
||||
def test_queue_advances_past_held_conflicted_pr(monkeypatch):
|
||||
"""End-to-end HOL proof for #2352: PR #1409 (oldest) hits a 409-on-update
|
||||
and is held; on the NEXT tick choose_next_queued_issue must SKIP the held
|
||||
PR and select the next ready PR (#1500) instead of stalling on #1409."""
|
||||
calls = {"update_attempts": 0, "merge_attempts": 0, "hold_label": None}
|
||||
conflicted = {"number": 1409, "pull_request": {},
|
||||
"labels": [{"name": "merge-queue"}],
|
||||
"created_at": "2026-06-01T00:00:00Z"}
|
||||
next_ready = {"number": 1500, "pull_request": {},
|
||||
"labels": [{"name": "merge-queue"}],
|
||||
"created_at": "2026-06-02T00:00:00Z"}
|
||||
_stale_pr_update_409_monkeypatch(
|
||||
monkeypatch,
|
||||
queued_issues=[conflicted, next_ready],
|
||||
calls=calls,
|
||||
)
|
||||
|
||||
# Tick 1: oldest (#1409) is selected, 409-on-update → held.
|
||||
rc = mq.process_once(dry_run=False)
|
||||
assert rc == 0
|
||||
assert calls["hold_label"] == (1409, "merge-queue-hold")
|
||||
|
||||
# Simulate the label now present on #1409 (as the real hold would persist).
|
||||
conflicted["labels"] = [{"name": "merge-queue"}, {"name": "merge-queue-hold"}]
|
||||
|
||||
# Tick 2: the queue must ADVANCE — choose_next_queued_issue skips the held
|
||||
# #1409 and selects the next ready candidate #1500, NOT re-select #1409.
|
||||
selected = mq.choose_next_queued_issue(
|
||||
[conflicted, next_ready],
|
||||
queue_label="merge-queue",
|
||||
hold_label="merge-queue-hold",
|
||||
)
|
||||
assert selected is not None
|
||||
assert selected["number"] == 1500
|
||||
|
||||
Reference in New Issue
Block a user