fix(merge-queue): HOLD on persistent 409-conflict-on-update (HOL guard) (#2352) #2354

Merged
devops-engineer merged 2 commits from fix/merge-queue-hold-on-409-conflict-update into main 2026-06-06 09:10:03 +00:00
2 changed files with 248 additions and 21 deletions
+85 -21
View File
@@ -31,7 +31,11 @@ Authoritative gates (fail-closed):
Head-of-line (HOL) safety: a permanent permission/4xx merge error
(403/404/405) HOLDS the PR (applies HOLD_LABEL) so the queue advances to the
next PR instead of re-selecting the same wedged PR every tick.
next PR instead of re-selecting the same wedged PR every tick. Likewise, a
persistent branch-update conflict (the /update endpoint returns HTTP 409
because the PR branch cannot be merged with main without manual rebase) HOLDS
the PR — a conflict will not self-resolve, so retrying it every tick would
HOL-block every ready PR behind it (issue #2352).
Status-fetch is fail-closed: if the combined status for a sha cannot be
fetched, the PR is skipped this tick (never treated as green).
@@ -112,6 +116,20 @@ class MergePermissionError(ApiError):
The queue should HOLD this PR and move to the next one."""
class BranchUpdateConflictError(ApiError):
"""Updating the PR branch with the base hit a merge-conflict (HTTP 409).
A true merge-conflict is NOT transient: the branch cannot be auto-updated
until a human/agent rebases it. The queue should HOLD this PR (apply
HOLD_LABEL) and advance to the next candidate, exactly like the permission
path — otherwise the conflicted PR sits at the queue head and is retried
every tick forever, head-of-line-blocking every ready PR behind it.
NOTE: distinct from mergeable=None, which is Gitea STILL COMPUTING conflict
state — that case is handled as a transient WAIT (no hold). This error is
only raised on an explicit 409 returned by the /update endpoint."""
class BranchProtectionUnavailable(ApiError):
"""Branch protection (the authoritative required-context source) could not
be enumerated. The queue must HOLD rather than merge with an unverified
@@ -584,12 +602,24 @@ def update_pull(pr_number: int, *, dry_run: bool) -> None:
print(f"::notice::updating PR #{pr_number} with base branch via style={UPDATE_STYLE}")
if dry_run:
return
api(
"POST",
f"/repos/{OWNER}/{NAME}/pulls/{pr_number}/update",
query={"style": UPDATE_STYLE},
expect_json=False,
)
try:
api(
"POST",
f"/repos/{OWNER}/{NAME}/pulls/{pr_number}/update",
query={"style": UPDATE_STYLE},
expect_json=False,
)
except ApiError as exc:
# Gitea returns HTTP 409 when the base cannot be merged into the PR
# branch because of a real conflict. The queue cannot auto-resolve a
# conflict, so re-raise as BranchUpdateConflictError; process_once HOLDs
# the PR and advances (HOL guard) instead of retrying it forever.
# Match the HTTP STATUS token ("-> HTTP 409") specifically, not a bare
# "409" substring — the PR number or path can itself contain "409"
# (e.g. /pulls/1409/update) and must not be misread as a conflict.
if "-> HTTP 409" in str(exc):
raise BranchUpdateConflictError(str(exc)) from exc
raise # re-raise other ApiErrors unchanged
def add_label_by_name(pr_number: int, label_name: str, *, dry_run: bool) -> None:
@@ -618,6 +648,29 @@ def add_label_by_name(pr_number: int, label_name: str, *, dry_run: bool) -> None
)
def hold_pr(pr_number: int, hold_note: str, *, dry_run: bool) -> None:
"""Apply HOLD_LABEL to a wedged PR so the queue advances past it.
choose_next_queued_issue skips HOLD_LABEL-bearing PRs, so this is the HOL
guard: a PR the queue cannot make progress on (permanent permission error
or unresolvable branch-update conflict) is held and a human/agent fixes it,
rather than the queue re-selecting it every tick forever. If the label
cannot be applied we still post the explanatory comment so the wedge is at
least visible — but we never loop on the PR.
"""
try:
add_label_by_name(pr_number, HOLD_LABEL, dry_run=dry_run)
except ApiError as label_exc:
sys.stderr.write(
f"::error::could not apply HOLD_LABEL to PR #{pr_number}: {label_exc}\n"
)
hold_note += (
f"\n\n(NOTE: could not apply the hold label automatically: "
f"{label_exc}. Please add `{HOLD_LABEL}` manually.)"
)
post_comment(pr_number, hold_note, dry_run=dry_run)
def merge_pull(pr_number: int, *, dry_run: bool, force: bool = False) -> None:
payload: dict[str, Any] = {
"Do": "merge",
@@ -737,7 +790,30 @@ def process_once(*, dry_run: bool = False) -> int:
print(f"::notice::PR #{pr_number} decision={decision.action}: {decision.reason}")
if decision.action == "update":
update_pull(pr_number, dry_run=dry_run)
try:
update_pull(pr_number, dry_run=dry_run)
except BranchUpdateConflictError as exc:
# The branch cannot be updated with main because of a real conflict
# (HTTP 409). This is the HOL fix for issue #2352: previously the
# 409 propagated to main() and the tick exited 0 with the PR still
# queued, so the NEXT tick re-selected the SAME conflicted PR and
# retried the failing update forever — head-of-line-blocking every
# ready PR behind it. A conflict will not self-resolve; it needs a
# human/agent rebase. So HOLD this PR (HOL guard) and advance to the
# next candidate. Fail-closed: a held PR is skipped, never merged.
sys.stderr.write(
f"::error::branch-update conflict for PR #{pr_number}: {exc}\n"
)
hold_note = (
"merge-queue: could not update this branch with "
f"`{WATCH_BRANCH}` — the update returned a merge conflict "
f"(HTTP 409) that the queue cannot auto-resolve ({exc}). "
f"Applied `{HOLD_LABEL}` to unblock the queue (HOL guard). "
f"Fix: rebase/merge `{WATCH_BRANCH}` into this branch and "
f"resolve the conflicts, then remove `{HOLD_LABEL}` to requeue."
)
hold_pr(pr_number, hold_note, dry_run=dry_run)
return 0
post_comment(
pr_number,
(
@@ -773,19 +849,7 @@ def process_once(*, dry_run: bool = False) -> int:
f"Fix: grant Can-merge to the queue token, then remove "
f"`{HOLD_LABEL}` to requeue."
)
try:
add_label_by_name(pr_number, HOLD_LABEL, dry_run=dry_run)
except ApiError as label_exc:
# If we cannot even apply the hold label, fall back to a comment
# so the wedge is at least visible; do NOT loop on this PR.
sys.stderr.write(
f"::error::could not apply HOLD_LABEL to PR #{pr_number}: {label_exc}\n"
)
hold_note += (
f"\n\n(NOTE: could not apply the hold label automatically: "
f"{label_exc}. Please add `{HOLD_LABEL}` manually.)"
)
post_comment(pr_number, hold_note, dry_run=dry_run)
hold_pr(pr_number, hold_note, dry_run=dry_run)
return 0
return 0
return 0
@@ -539,3 +539,166 @@ def test_process_once_holds_tick_when_branch_protection_unavailable(monkeypatch)
rc = mq.process_once(dry_run=False)
assert rc == 0
assert merged["called"] is False
# --------------------------------------------------------------------------
# Fix 4 (issue #2352): a persistent 409-conflict-on-update HOLDS the PR and
# the queue ADVANCES — it does not retry the conflicted PR forever (HOL).
# --------------------------------------------------------------------------
def test_BranchUpdateConflictError_inherits_from_ApiError():
assert issubclass(mq.BranchUpdateConflictError, mq.ApiError)
def test_update_pull_raises_conflict_error_on_409(monkeypatch):
"""A 409 from the /update endpoint becomes BranchUpdateConflictError so
process_once can HOLD-and-advance rather than letting it propagate as a
plain ApiError (which would leave the PR queued and re-selectable)."""
monkeypatch.setattr(mq, "OWNER", "molecule-ai")
monkeypatch.setattr(mq, "NAME", "molecule-core")
def fake_api(method, path, **kwargs):
raise mq.ApiError("POST /pulls/1409/update -> HTTP 409: merge conflict")
monkeypatch.setattr(mq, "api", fake_api)
import pytest
with pytest.raises(mq.BranchUpdateConflictError) as exc_info:
mq.update_pull(1409, dry_run=False)
assert "409" in str(exc_info.value)
def test_update_pull_reraises_non_409_errors(monkeypatch):
"""Non-409 update failures (e.g. 500) are NOT conflicts; they must NOT be
swallowed as a hold — they re-raise as the original ApiError so the tick is
a transient no-op and the PR is retried next tick."""
monkeypatch.setattr(mq, "OWNER", "molecule-ai")
monkeypatch.setattr(mq, "NAME", "molecule-core")
def fake_api(method, path, **kwargs):
raise mq.ApiError("POST /pulls/1409/update -> HTTP 500: server error")
monkeypatch.setattr(mq, "api", fake_api)
import pytest
with pytest.raises(mq.ApiError) as exc_info:
mq.update_pull(1409, dry_run=False)
# Re-raised as plain ApiError, NOT the conflict subclass.
assert not isinstance(exc_info.value, mq.BranchUpdateConflictError)
assert "500" in str(exc_info.value)
def _stale_pr_update_409_monkeypatch(monkeypatch, queued_issues, calls):
"""Wire process_once so the selected PR needs an update (head does NOT
contain main) and the /update call returns a 409 conflict. Everything else
is green. Records merge attempts and the applied hold label in `calls`."""
monkeypatch.setattr(mq, "OWNER", "molecule-ai")
monkeypatch.setattr(mq, "NAME", "molecule-core")
monkeypatch.setattr(mq, "WATCH_BRANCH", "main")
monkeypatch.setattr(mq, "QUEUE_LABEL", "merge-queue")
monkeypatch.setattr(mq, "HOLD_LABEL", "merge-queue-hold")
monkeypatch.setattr(mq, "REVIEWER_SET", REVIEWERS)
monkeypatch.setattr(mq, "get_branch_protection", lambda branch: mq.BranchProtection(
required_contexts=["CI / all-required (pull_request)"],
required_approvals=2,
block_on_rejected_reviews=True,
))
main_sha = "b" * 40
head_sha = "a" * 40
monkeypatch.setattr(mq, "get_branch_head", lambda branch: main_sha)
def fake_combined(sha):
ctx = "CI / all-required (push)" if sha == main_sha else "CI / all-required (pull_request)"
return {"state": "success", "statuses": [{"context": ctx, "status": "success"}]}
monkeypatch.setattr(mq, "get_combined_status", fake_combined)
monkeypatch.setattr(mq, "list_queued_issues", lambda: queued_issues)
monkeypatch.setattr(mq, "get_pull", lambda n: {
"state": "open", "number": n, "mergeable": True,
"base": {"ref": "main", "repo_id": 1},
"head": {"sha": head_sha, "repo_id": 1},
"labels": [{"name": "merge-queue"}],
})
# NOTE: commits do NOT contain main_sha → pr_has_current_base is False →
# decision.action == "update".
monkeypatch.setattr(mq, "get_pull_commits", lambda n: [{"sha": head_sha}])
monkeypatch.setattr(mq, "get_pull_reviews", lambda n: [
{"state": "APPROVED", "user": {"login": "agent-researcher"},
"official": True, "stale": False, "dismissed": False, "commit_id": head_sha},
{"state": "APPROVED", "user": {"login": "agent-reviewer-cr2"},
"official": True, "stale": False, "dismissed": False, "commit_id": head_sha},
])
def fake_update(pr_number, *, dry_run):
calls["update_attempts"] += 1
raise mq.BranchUpdateConflictError(
"POST /pulls/%d/update -> HTTP 409: merge conflict" % pr_number
)
monkeypatch.setattr(mq, "update_pull", fake_update)
def fake_merge(pr_number, *, dry_run, force=False):
calls["merge_attempts"] += 1
monkeypatch.setattr(mq, "merge_pull", fake_merge)
def fake_add_label(pr_number, label_name, *, dry_run):
calls["hold_label"] = (pr_number, label_name)
monkeypatch.setattr(mq, "add_label_by_name", fake_add_label)
monkeypatch.setattr(mq, "post_comment", lambda *a, **k: None)
def test_process_once_holds_pr_on_409_conflict_on_update(monkeypatch):
"""The #2352 regression: a queued PR whose /update returns 409 must get the
HOLD_LABEL (so the queue advances) and must NOT be merged. Without the fix
the 409 propagated, the PR stayed queued, and the next tick re-selected the
SAME conflicted PR forever (head-of-line block)."""
calls = {"update_attempts": 0, "merge_attempts": 0, "hold_label": None}
_stale_pr_update_409_monkeypatch(
monkeypatch,
queued_issues=[
{"number": 1409, "pull_request": {}, "labels": [{"name": "merge-queue"}],
"created_at": "2026-06-01T00:00:00Z"},
],
calls=calls,
)
rc = mq.process_once(dry_run=False)
assert rc == 0
assert calls["update_attempts"] == 1
# Held, not merged — fail-closed.
assert calls["hold_label"] == (1409, "merge-queue-hold")
assert calls["merge_attempts"] == 0
def test_queue_advances_past_held_conflicted_pr(monkeypatch):
"""End-to-end HOL proof for #2352: PR #1409 (oldest) hits a 409-on-update
and is held; on the NEXT tick choose_next_queued_issue must SKIP the held
PR and select the next ready PR (#1500) instead of stalling on #1409."""
calls = {"update_attempts": 0, "merge_attempts": 0, "hold_label": None}
conflicted = {"number": 1409, "pull_request": {},
"labels": [{"name": "merge-queue"}],
"created_at": "2026-06-01T00:00:00Z"}
next_ready = {"number": 1500, "pull_request": {},
"labels": [{"name": "merge-queue"}],
"created_at": "2026-06-02T00:00:00Z"}
_stale_pr_update_409_monkeypatch(
monkeypatch,
queued_issues=[conflicted, next_ready],
calls=calls,
)
# Tick 1: oldest (#1409) is selected, 409-on-update → held.
rc = mq.process_once(dry_run=False)
assert rc == 0
assert calls["hold_label"] == (1409, "merge-queue-hold")
# Simulate the label now present on #1409 (as the real hold would persist).
conflicted["labels"] = [{"name": "merge-queue"}, {"name": "merge-queue-hold"}]
# Tick 2: the queue must ADVANCE — choose_next_queued_issue skips the held
# #1409 and selects the next ready candidate #1500, NOT re-select #1409.
selected = mq.choose_next_queued_issue(
[conflicted, next_ready],
queue_label="merge-queue",
hold_label="merge-queue-hold",
)
assert selected is not None
assert selected["number"] == 1500