Compare commits
4 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 8ccf3a844c | |||
| 9ede993f3d | |||
| c3cfbea750 | |||
| a01d1d8f86 |
@@ -23,7 +23,6 @@ import dataclasses
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import urllib.error
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
@@ -327,43 +326,6 @@ def update_pull(pr_number: int, *, dry_run: bool) -> None:
|
||||
)
|
||||
|
||||
|
||||
def wait_for_ci(
|
||||
head_sha: str,
|
||||
contexts: list[str],
|
||||
*,
|
||||
max_wait_seconds: int = 300,
|
||||
poll_interval: int = 15,
|
||||
) -> bool:
|
||||
"""Poll CI statuses for head_sha until all required contexts are terminal.
|
||||
|
||||
Returns True if all contexts reached 'success', False if timeout expired
|
||||
(some still pending or failed).
|
||||
|
||||
Background: after a queue-triggered PR update, CI re-runs on the new head.
|
||||
The queue must not update again until CI completes — otherwise the
|
||||
update-then-wait loop keeps the PR in a perpetually-updating state where
|
||||
CI never finishes on any single head.
|
||||
"""
|
||||
deadline = time.time() + max_wait_seconds
|
||||
while time.time() < deadline:
|
||||
time.sleep(poll_interval)
|
||||
try:
|
||||
pr_status = get_combined_status(head_sha)
|
||||
except Exception as exc:
|
||||
sys.stderr.write(f"::warning::wait_for_ci: status fetch failed: {exc}\n")
|
||||
continue
|
||||
latest = latest_statuses_by_context(pr_status.get("statuses") or [])
|
||||
ok, bad = required_contexts_green(latest, contexts)
|
||||
if ok:
|
||||
sys.stderr.write(f"::notice::wait_for_ci: all contexts green after {int(time.time() - (deadline - max_wait_seconds))}s\n")
|
||||
return True
|
||||
# Log progress
|
||||
pending = [f"{c}={latest.get(c, {}).get('status', 'missing')}" for c in contexts if latest.get(c, {}).get('status') != 'success']
|
||||
sys.stderr.write(f"::notice::wait_for_ci: still waiting ({int(deadline - time.time())}s left): {', '.join(pending[:3])}\n")
|
||||
sys.stderr.write(f"::warning::wait_for_ci: timeout after {max_wait_seconds}s; proceeding with merge check\n")
|
||||
return False
|
||||
|
||||
|
||||
def merge_pull(pr_number: int, *, dry_run: bool) -> None:
|
||||
payload = {
|
||||
"Do": "merge",
|
||||
@@ -376,24 +338,7 @@ def merge_pull(pr_number: int, *, dry_run: bool) -> None:
|
||||
print(f"::notice::merging PR #{pr_number}")
|
||||
if dry_run:
|
||||
return
|
||||
# Gitea's merge endpoint returns HTTP 200 with an empty body on success.
|
||||
# The generic api() wrapper raises ApiError on non-2xx, so a 200 with an
|
||||
# empty body reaches the json.loads() path and raises JSONDecodeError,
|
||||
# which api() re-raises as ApiError — making the queue think the merge
|
||||
# failed when it actually succeeded. Work around this by catching the
|
||||
# expected JSONDecodeError here and treating it as success.
|
||||
try:
|
||||
api("POST", f"/repos/{OWNER}/{NAME}/pulls/{pr_number}/merge", body=payload, expect_json=False)
|
||||
except ApiError as exc:
|
||||
# Surface non-merge errors (5xx server errors, 403 forbidden, etc.)
|
||||
if "merge" in str(exc).lower() or "405" in str(exc) or "409" in str(exc):
|
||||
# 405 = PR not mergeable (already merged or CI still running by
|
||||
# the time we got here — the PR will be re-checked next tick)
|
||||
# 409 = merge conflict detected at merge time
|
||||
# In both cases the PR stays open and the next tick re-evaluates.
|
||||
sys.stderr.write(f"::warning::merge call returned: {exc}\n")
|
||||
else:
|
||||
raise
|
||||
api("POST", f"/repos/{OWNER}/{NAME}/pulls/{pr_number}/merge", body=payload, expect_json=False)
|
||||
|
||||
|
||||
def process_once(*, dry_run: bool = False) -> int:
|
||||
@@ -445,32 +390,6 @@ def process_once(*, dry_run: bool = False) -> int:
|
||||
print(f"::notice::PR #{pr_number} decision={decision.action}: {decision.reason}")
|
||||
if decision.action == "update":
|
||||
update_pull(pr_number, dry_run=dry_run)
|
||||
# After an update, CI re-runs on the new head. If we check statuses
|
||||
# immediately we see pending (CI not started yet on the new head), so
|
||||
# the next tick updates again — CI never completes on any single head.
|
||||
# Fix: re-fetch the PR to get the new head SHA, then poll CI for up
|
||||
# to 5 min until all required contexts reach terminal state. If CI
|
||||
# finishes in time, proceed to merge on the same tick.
|
||||
if not dry_run:
|
||||
updated_pr = get_pull(pr_number)
|
||||
new_head = updated_pr.get("head", {}).get("sha", "")
|
||||
if new_head and new_head != head_sha:
|
||||
sys.stderr.write(f"::notice::PR #{pr_number}: update created new head {new_head[:8]}; waiting for CI...\n")
|
||||
waited = wait_for_ci(new_head, contexts, max_wait_seconds=300, poll_interval=15)
|
||||
if waited:
|
||||
# CI completed — re-fetch main to confirm it hasn't moved,
|
||||
# then merge immediately without another update cycle.
|
||||
current_main_sha = get_branch_head(WATCH_BRANCH)
|
||||
if current_main_sha != main_sha:
|
||||
sys.stderr.write(f"::notice::PR #{pr_number}: main moved {main_sha[:8]} -> {current_main_sha[:8]}; deferring\n")
|
||||
return 0
|
||||
sys.stderr.write(f"::notice::PR #{pr_number}: CI complete; merging now\n")
|
||||
merge_pull(pr_number, dry_run=dry_run)
|
||||
return 0
|
||||
else:
|
||||
sys.stderr.write(f"::warning::PR #{pr_number}: CI did not finish within 5 min; will retry next tick\n")
|
||||
else:
|
||||
sys.stderr.write(f"::notice::PR #{pr_number}: update did not change head SHA; will retry\n")
|
||||
post_comment(
|
||||
pr_number,
|
||||
(
|
||||
@@ -481,13 +400,6 @@ def process_once(*, dry_run: bool = False) -> int:
|
||||
)
|
||||
return 0
|
||||
if decision.ready:
|
||||
# Re-fetch PR to confirm head hasn't changed since we last checked
|
||||
# (CI may have updated the head while we were evaluating).
|
||||
current_pr = get_pull(pr_number)
|
||||
current_head = current_pr.get("head", {}).get("sha", "")
|
||||
if current_head != head_sha:
|
||||
print(f"::notice::PR #{pr_number} head changed {head_sha[:8]} -> {current_head[:8]}; re-evaluating")
|
||||
return 0
|
||||
latest_main_sha = get_branch_head(WATCH_BRANCH)
|
||||
if latest_main_sha != main_sha:
|
||||
print(
|
||||
@@ -495,7 +407,23 @@ def process_once(*, dry_run: bool = False) -> int:
|
||||
"deferring to next tick"
|
||||
)
|
||||
return 0
|
||||
merge_pull(pr_number, dry_run=dry_run)
|
||||
try:
|
||||
merge_pull(pr_number, dry_run=dry_run)
|
||||
except ApiError as exc:
|
||||
# Merge API errors (405 permission denied, 422 hook block, etc.)
|
||||
# are NOT transient — retrying will not help. Surface the error
|
||||
# on the PR immediately so it is visible without digging into
|
||||
# workflow logs, and fail the workflow so it is distinguishable
|
||||
# from a successful-no-op tick.
|
||||
post_comment(
|
||||
pr_number,
|
||||
f"merge-queue: MERGE FAILED — {exc}. "
|
||||
"This is a non-transient error (permission or hook issue). "
|
||||
"See SEV-1 internal#487.",
|
||||
dry_run=dry_run,
|
||||
)
|
||||
sys.stderr.write(f"::error::PR #{pr_number} merge failed: {exc}\n")
|
||||
return 2 # distinct exit code so workflow run shows failure
|
||||
return 0
|
||||
return 0
|
||||
|
||||
|
||||
@@ -830,9 +830,18 @@ def main(argv: list[str] | None = None) -> int:
|
||||
# one membership lookup per team.
|
||||
team_member_cache: dict[tuple[str, int], bool | None] = {}
|
||||
|
||||
def _required_teams_for(slug: str) -> list[str] | None:
|
||||
"""Look up required_teams for a slug from checklist items OR N/A gates."""
|
||||
if slug in items_by_slug:
|
||||
return items_by_slug[slug]["required_teams"]
|
||||
if slug in na_gates:
|
||||
return na_gates[slug].get("required_teams", [])
|
||||
return None
|
||||
|
||||
def probe(slug: str, users: list[str]) -> list[str]:
|
||||
item = items_by_slug[slug]
|
||||
team_names: list[str] = item["required_teams"]
|
||||
team_names = _required_teams_for(slug)
|
||||
if team_names is None:
|
||||
raise KeyError(f"slug '{slug}' not found in items or N/A gates")
|
||||
# Resolve names → ids. NOTE: orgs/{org}/teams/search may not be
|
||||
# available — fall back to the list endpoint.
|
||||
team_ids: list[int] = []
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
import importlib.util
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch
|
||||
|
||||
|
||||
SCRIPT = Path(__file__).resolve().parents[1] / "gitea-merge-queue.py"
|
||||
@@ -118,3 +119,54 @@ def test_merge_decision_updates_stale_pr_before_merge():
|
||||
|
||||
assert decision.ready is False
|
||||
assert decision.action == "update"
|
||||
|
||||
|
||||
def test_merge_failure_returns_nonzero_and_posts_comment(monkeypatch):
|
||||
"""When merge_pull raises ApiError (e.g. HTTP 405 permission denied),
|
||||
process_once returns exit code 2 (non-zero) and posts a comment on the PR.
|
||||
This distinguishes merge-permission errors from successful-no-op ticks."""
|
||||
captured_comment = {}
|
||||
|
||||
def fake_post_comment(pr_number, body, *, dry_run):
|
||||
captured_comment["pr_number"] = pr_number
|
||||
captured_comment["body"] = body
|
||||
|
||||
# Replace functions directly on the module object so process_once()
|
||||
# (which looks them up by name at call time) picks up the fakes.
|
||||
mq.list_queued_issues = lambda: [{
|
||||
"number": 42,
|
||||
"created_at": "2026-05-17T00:00:00Z",
|
||||
"labels": [{"name": "merge-queue"}],
|
||||
"pull_request": {},
|
||||
}]
|
||||
mq.get_pull = lambda n: {
|
||||
"state": "open",
|
||||
"base": {"ref": "main", "repo_id": 1},
|
||||
"head": {"sha": "headsha", "repo_id": 1},
|
||||
"merge_base": "abc123def",
|
||||
}
|
||||
mq.get_pull_commits = lambda n: [{"sha": "headsha"}]
|
||||
mq.get_branch_head = lambda branch: "abc123def"
|
||||
mq.get_combined_status = lambda sha: {
|
||||
"state": "success",
|
||||
"statuses": [{"context": "CI / all-required (push)", "status": "success"}],
|
||||
}
|
||||
mq.latest_statuses_by_context = lambda s: {
|
||||
"CI / all-required (pull_request)": {"status": "success"},
|
||||
"sop-checklist / all-items-acked (pull_request)": {"status": "success"},
|
||||
}
|
||||
mq.required_contexts_green = lambda statuses, contexts: (True, [])
|
||||
mq.post_comment = fake_post_comment
|
||||
|
||||
# Simulate merge failing with HTTP 405 (permission denied).
|
||||
# The ApiError raised by api() is caught inside process_once().
|
||||
merge_error = mq.ApiError(
|
||||
"POST /repos/x/y/pulls/42/merge -> HTTP 405: User not allowed to merge PR"
|
||||
)
|
||||
with patch.object(mq, "merge_pull", side_effect=merge_error):
|
||||
exit_code = mq.process_once(dry_run=False)
|
||||
|
||||
assert exit_code == 2, f"Expected exit code 2, got {exit_code}"
|
||||
assert captured_comment["pr_number"] == 42
|
||||
assert "MERGE FAILED" in captured_comment["body"]
|
||||
assert "405" in captured_comment["body"]
|
||||
|
||||
@@ -603,3 +603,51 @@ class TestComputeNaState(unittest.TestCase):
|
||||
self.assertEqual(na_directives[0][0], "sop-n/a")
|
||||
self.assertEqual(na_directives[0][1], "qa-review")
|
||||
self.assertIn("no surface", na_directives[0][2])
|
||||
|
||||
|
||||
class TestProbeNaGateFallback(unittest.TestCase):
|
||||
"""Regression test: probe() must handle gate names (qa-review, security-review)
|
||||
from N/A gates without raising KeyError.
|
||||
|
||||
mc#1389: compute_na_state calls probe(gate_name, [user]) where gate_name is
|
||||
a gate name like 'qa-review' — NOT a checklist item slug. The probe must
|
||||
resolve the gate's required_teams from na_gates, not raise KeyError from
|
||||
items_by_slug lookup.
|
||||
"""
|
||||
|
||||
def test_probe_resolves_gate_name_from_na_gates(self):
|
||||
cfg = sop.load_config(CONFIG_PATH)
|
||||
items = cfg["items"]
|
||||
items_by_slug = {it["slug"]: it for it in items}
|
||||
na_gates = cfg.get("n/a_gates", {})
|
||||
|
||||
# Reconstruct the _required_teams_for helper from sop-checklist.py
|
||||
def _required_teams_for(slug):
|
||||
if slug in items_by_slug:
|
||||
return items_by_slug[slug]["required_teams"]
|
||||
if slug in na_gates:
|
||||
return na_gates[slug].get("required_teams", [])
|
||||
return None
|
||||
|
||||
# Gate names should resolve from na_gates
|
||||
self.assertEqual(
|
||||
_required_teams_for("qa-review"),
|
||||
["qa", "security", "engineers"],
|
||||
)
|
||||
self.assertEqual(
|
||||
_required_teams_for("security-review"),
|
||||
["security", "managers", "ceo"],
|
||||
)
|
||||
|
||||
# Checklist item slugs should still resolve from items_by_slug
|
||||
self.assertEqual(
|
||||
_required_teams_for("comprehensive-testing"),
|
||||
["qa", "engineers"],
|
||||
)
|
||||
self.assertEqual(
|
||||
_required_teams_for("root-cause"),
|
||||
["managers", "ceo"],
|
||||
)
|
||||
|
||||
# Unknown slug should return None (not raise KeyError)
|
||||
self.assertIsNone(_required_teams_for("nonexistent-slug"))
|
||||
|
||||
@@ -32,12 +32,6 @@ on:
|
||||
# iterating all open PRs when PR_NUMBER is empty.
|
||||
workflow_dispatch:
|
||||
|
||||
# Cancel stale runs so the 8-runner pool stays available for PR jobs.
|
||||
# Per-SHA group ensures push and cron runs at different SHAs don't cancel each other.
|
||||
concurrency:
|
||||
group: gate-check-v3-${{ github.event.pull_request.head.sha || github.sha }}
|
||||
cancel-in-progress: true
|
||||
|
||||
permissions:
|
||||
# read: contents — for checkout (base ref, not PR head for security)
|
||||
# read: pull-requests — for reading PR info via API
|
||||
|
||||
@@ -162,6 +162,7 @@ jobs:
|
||||
exit 1
|
||||
fi
|
||||
python -m twine upload \
|
||||
--verbose \
|
||||
--repository pypi \
|
||||
--username __token__ \
|
||||
--password "$PYPI_TOKEN" \
|
||||
|
||||
@@ -44,12 +44,6 @@ on:
|
||||
- ".github/scripts/lint_secret_pattern_drift.py"
|
||||
- ".githooks/pre-commit"
|
||||
|
||||
# Cancel stale runs to keep the 8-runner pool available for PR jobs.
|
||||
# Per-SHA group ensures push and scheduled runs at different SHAs don't cancel each other.
|
||||
concurrency:
|
||||
group: secret-pattern-drift-${{ github.event.pull_request.head.sha || github.sha }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
GITHUB_SERVER_URL: https://git.moleculesai.app
|
||||
|
||||
|
||||
@@ -22,11 +22,6 @@ on:
|
||||
- cron: '17 4 * * 1' # Mondays at 04:17 UTC
|
||||
workflow_dispatch:
|
||||
|
||||
# Cancel stale runs to keep the 8-runner pool available for PR jobs.
|
||||
concurrency:
|
||||
group: weekly-platform-go-${{ github.event.pull_request.head.sha || github.sha }}
|
||||
cancel-in-progress: true
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
statuses: write
|
||||
|
||||
Reference in New Issue
Block a user