fix(queue): cancel-in-progress=true + document Gitea operational quirks #1447

Open
core-devops wants to merge 3 commits from infra/queue-runbook-updates into main
6 changed files with 201 additions and 87 deletions
+83 -63
View File
@@ -44,9 +44,15 @@ REQUIRED_CONTEXTS_RAW = _env(
"REQUIRED_CONTEXTS",
default=(
"CI / all-required (pull_request),"
"sop-checklist / all-items-acked (pull_request)"
"sop-checklist / all-items-acked (pull_request),"
"E2E Chat / E2E Chat (pull_request)"
),
)
# E2E Chat is not in branch protection's status_check_contexts, but Gitea's
# merge gate evaluates the full combined status including it. Adding it here
# prevents the queue from attempting a merge that will be 405'd by Gitea when
# E2E Chat is failing (e.g. runner-stall Quirk #9 on a flaky test).
# See: mc#420 / molecule-core runbooks/gitea-operational-quirks.md Quirk #9.
# Required contexts for push (main/staging) runs. The push CI uses the same
# aggregator names with " (push)" suffix. Checking these explicitly instead of
# the combined state avoids false-pause when non-blocking jobs (e.g. Platform
@@ -153,38 +159,15 @@ def latest_statuses_by_context(statuses: list[dict]) -> dict[str, dict]:
return latest
def _is_tier_low_pending_ok(
latest_statuses: dict[str, dict],
context: str,
pr_labels: set[str],
) -> bool:
"""Return True if tier:low PR can tolerate sop-checklist pending state.
Per sop-checklist-config.yaml tier_failure_mode, tier:low uses soft-fail:
sop-checklist posts state=pending when acks are satisfied (missing
manager/ceo acks are informational only). The queue should accept
pending instead of waiting for success.
"""
if "tier:low" not in pr_labels:
return False
if "sop-checklist" not in context:
return False
status = latest_statuses.get(context) or {}
return status_state(status) == "pending"
def required_contexts_green(
latest_statuses: dict[str, dict],
contexts: list[str],
pr_labels: set[str] | None = None,
) -> tuple[bool, list[str]]:
missing_or_bad: list[str] = []
for context in contexts:
status = latest_statuses.get(context)
state = status_state(status or {})
if state != "success":
if pr_labels and _is_tier_low_pending_ok(latest_statuses, context, pr_labels):
continue # tier:low soft-fail: accept pending sop-checklist
missing_or_bad.append(f"{context}={state or 'missing'}")
return not missing_or_bad, missing_or_bad
@@ -237,7 +220,6 @@ def evaluate_merge_readiness(
pr_status: dict,
required_contexts: list[str],
pr_has_current_base: bool,
pr_labels: set[str] | None = None,
) -> MergeDecision:
# Check push-required contexts explicitly instead of combined state.
# Combined state can be "failure" due to non-blocking jobs
@@ -257,7 +239,7 @@ def evaluate_merge_readiness(
# The required_contexts list is the authoritative gate — it includes only
# the checks that actually block merges.
latest = latest_statuses_by_context(pr_status.get("statuses") or [])
ok, missing_or_bad = required_contexts_green(latest, required_contexts, pr_labels)
ok, missing_or_bad = required_contexts_green(latest, required_contexts)
if not ok:
return MergeDecision(False, "wait", "required contexts not green: " + ", ".join(missing_or_bad))
return MergeDecision(True, "merge", "ready")
@@ -282,32 +264,27 @@ def get_combined_status(sha: str) -> dict:
_, combined = api("GET", f"/repos/{OWNER}/{NAME}/commits/{sha}/status")
if not isinstance(combined, dict):
raise ApiError(f"status for {sha} response not object")
combined_statuses: list[dict] = combined.get("statuses") or []
# Fetch full statuses list; 200 covers >99% of real-world runs.
# The list is ordered ascending by id (oldest first) — callers must
# iterate in reverse to get the newest entry per context.
# Best-effort: large repos (main with 550+ statuses) may time out.
# On timeout, fall back to the statuses[] already in the combined
# response (usually 30 entries — enough for most PRs, enough for
# main's early push-required contexts).
try:
_, all_statuses_raw = api(
_, all_statuses = api(
"GET",
f"/repos/{OWNER}/{NAME}/commits/{sha}/statuses",
query={"limit": "50"},
)
if isinstance(all_statuses_raw, list):
all_statuses: list[dict] = list(all_statuses_raw)
else:
all_statuses = []
if isinstance(all_statuses, list):
combined["statuses"] = all_statuses
except (ApiError, urllib.error.URLError, TimeoutError, OSError) as exc:
# URLError covers network-level failures (DNS, refused, timeout).
# TimeoutError and OSError cover socket-level timeouts.
sys.stderr.write(f"::warning::could not fetch full statuses list for {sha[:8]}: {exc}\n")
all_statuses = []
# Build latest per context: process combined (ascending→reverse=newest
# first), then fill gaps from all_statuses (already newest-first).
latest: dict[str, dict] = {}
for status in reversed(sorted(combined_statuses, key=lambda s: s.get("id") or 0)):
ctx = status.get("context")
if isinstance(ctx, str) and ctx not in latest:
latest[ctx] = status
for status in all_statuses:
ctx = status.get("context")
if isinstance(ctx, str) and ctx not in latest:
latest[ctx] = status
combined["statuses"] = list(latest.values())
# Fall back to the statuses[] already in the combined response.
pass
return combined
@@ -348,6 +325,31 @@ def post_comment(pr_number: int, body: str, *, dry_run: bool) -> None:
api("POST", f"/repos/{OWNER}/{NAME}/issues/{pr_number}/comments", body={"body": body})
def add_hold_label(pr_number: int, *, dry_run: bool) -> None:
"""Add HOLD_LABEL to a PR if not already present."""
if not HOLD_LABEL:
return
# Check current labels first to avoid a no-op API call in dry-run.
_, current = api("GET", f"/repos/{OWNER}/{NAME}/issues/{pr_number}/labels")
current_names = {
l["name"] for l in (current if isinstance(current, list) else [])
}
if HOLD_LABEL in current_names:
print(f"::notice::PR #{pr_number} already has hold label; skipping add")
return
print(f"::notice::PR #{pr_number} adding hold label `{HOLD_LABEL}`")
if dry_run:
return
# Gitea accepts {"labels": ["label1", "label2"]} to append labels.
new_labels = list(current_names) + [HOLD_LABEL]
api(
"PATCH",
f"/repos/{OWNER}/{NAME}/issues/{pr_number}",
body={"labels": new_labels},
expect_json=False,
)
def update_pull(pr_number: int, *, dry_run: bool) -> None:
print(f"::notice::updating PR #{pr_number} with base branch via style={UPDATE_STYLE}")
if dry_run:
@@ -423,13 +425,11 @@ def process_once(*, dry_run: bool = False) -> int:
commits = get_pull_commits(pr_number)
current_base = pr_has_current_base(pr, commits, main_sha)
pr_status = get_combined_status(head_sha)
pr_labels = label_names(pr)
decision = evaluate_merge_readiness(
main_status=main_status,
pr_status=pr_status,
required_contexts=contexts,
pr_has_current_base=current_base,
pr_labels=pr_labels,
)
print(f"::notice::PR #{pr_number} decision={decision.action}: {decision.reason}")
@@ -455,22 +455,42 @@ def process_once(*, dry_run: bool = False) -> int:
try:
merge_pull(pr_number, dry_run=dry_run)
except MergePermissionError as exc:
# Permanent merge failure (HTTP 403/404/405). Post a comment so
# maintainers know why, then return 0 so this tick is done.
# The PR stays in the queue; future ticks can retry after the
# permission issue is resolved.
sys.stderr.write(f"::error::merge permission error for PR #{pr_number}: {exc}\n")
post_comment(
pr_number,
(
"merge-queue: merge failed with HTTP 405 'User not allowed to merge PR'. "
"No available token has Can-merge permission on this repo. "
"Fix: grant Can-merge to a token, or add a maintain/admin collaborator. "
"Skipping to next queued PR on next tick."
),
dry_run=dry_run,
)
return 0
msg = str(exc)
is_status_check_failure = "not all required status checks successful" in msg
if is_status_check_failure:
# Gitea's merge gate failed due to a status check that passed our
# pre-flight but is failing at Gitea's side (e.g. runner-stall Quirk
# #9, or a context not in REQUIRED_CONTEXTS). Auto-add hold so the
# queue skips this PR and processes the next one. The hold can be
# removed once CI is green again.
add_hold_label(pr_number, dry_run=dry_run)
post_comment(
pr_number,
(
"merge-queue: merge blocked by Gitea's status-check gate "
"(E2E Chat or other non-required context failing). "
"Auto-held via `merge-queue-hold`. "
"Remove the hold label to requeue once CI is green. "
"If E2E Chat is stuck (runner stall / Quirk #9), CI will "
"self-recover after ~90 min and the hold can then be removed."
),
dry_run=dry_run,
)
return 0
else:
# Genuine permission error — token lacks Can-merge.
sys.stderr.write(f"::error::merge permission error for PR #{pr_number}: {exc}\n")
post_comment(
pr_number,
(
"merge-queue: merge failed with HTTP 405 'User not allowed to merge PR'. "
"No available token has Can-merge permission on this repo. "
"Fix: grant Can-merge to a token, or add a maintain/admin collaborator. "
"Skipping to next queued PR on next tick."
),
dry_run=dry_run,
)
return 0
return 0
return 0
+1 -1
View File
@@ -57,7 +57,7 @@ permissions:
# can produce duplicate comments before the title-search dedup wins.
concurrency:
group: ci-required-drift
cancel-in-progress: false
cancel-in-progress: true
jobs:
drift:
+1 -1
View File
@@ -22,7 +22,7 @@ permissions:
concurrency:
group: gitea-merge-queue-${{ github.repository }}
cancel-in-progress: false
cancel-in-progress: true
jobs:
queue:
+5 -1
View File
@@ -56,9 +56,13 @@ permissions:
# Workflow-scoped serialisation — two simultaneous runs would race on the
# `[main-red] {SHA}` open/PATCH path. Idempotent by title, but parallel
# POSTs can produce duplicates before the title search dedup wins.
# NOTE: cancel-in-progress: true is safe here — the idempotent design means
# a cancelled run produces identical output to a completed one. This also
# prevents the Gitea scheduler freeze that occurs when a cron tick fires
# while a previous run is still executing (Quirk #8).
concurrency:
group: main-red-watchdog
cancel-in-progress: false
cancel-in-progress: true
jobs:
watchdog:
+25
View File
@@ -77,6 +77,31 @@ does not replace the queue. The queue still performs its own current-main
check immediately before merge because branch protection alone cannot
serialize two already-green PRs.
### Correct API field names (Gitea 1.22.6)
When setting branch protection via API, use these exact field names — several
intuitively-correct names are silently ignored (see `gitea-operational-quirks.md`
Quirk #7):
```json
{
"branch_name": "main",
"enable_merge_whitelist": true,
"merge_whitelist_usernames": ["devops-engineer", "hongming", "core-devops"],
"enable_status_check": true,
"status_check_contexts": ["CI / all-required"],
"required_approvals": 1,
"block_on_rejected_reviews": true
}
```
After any `POST /branch_protections`, immediately GET and verify the values
persisted — the API returns 201 even when fields are silently dropped.
If the queue returns HTTP 405 ("User not allowed to merge"), the first
diagnostic step is `GET /branch_protections/main` and checking whether
`merge_whitelist_usernames` still contains `devops-engineer`.
## Failure Handling
If `main` is not green, the queue pauses and does not merge anything.
+86 -21
View File
@@ -196,69 +196,134 @@ primary consumer of combined status and is affected.
---
## Quirk #7 — TBD
*[Placeholder — document here when a new Gitea Actions quirk is discovered.]*
## Quirk #7 — Gitea branch protection API silently ignores some field names
### Finding
*[What Gitea Actions does differently from GitHub Actions.]*
The Gitea 1.22.6 `POST /repos/{org}/{repo}/branch_protections` API accepts a
non-obvious set of field names. Several intuitively-correct names are silently
ignored — the call returns 201 but the field is dropped:
| Intended field | Correct API name | Silently ignored aliases |
|---|---|---|
| Enable merge whitelist | `enable_merge_whitelist` | `user_can_merge`, `merge_whitelist_enabled` |
| Users who can merge | `merge_whitelist_usernames` | `merge_whitelist_users`, `whitelisted_users` |
| Enable status check | `enable_status_check` | `enable_status_checks`, `require_status_checks` |
| Required status contexts | `status_check_contexts` | `required_status_checks.contexts` |
| Block on rejected reviews | `block_on_rejected_reviews` | (this one works) |
| Required approvals | `required_approvals` | `required_reviewers` |
The GET response after a POST shows the actual stored values. A naive
GET → modify → POST cycle (without using the exact GET field names) will
silently reset the merge whitelist on every call.
### Impact
*[Which workflows or operations are affected.]*
- Branch protection merge whitelist resets to empty after any API mis-invocation
- Queue AUTO_SYNC_TOKEN (`devops-engineer`) loses Can-merge permission → HTTP 405
- All queued PRs blocked until whitelist is restored
- Confirmed reset on Gitea server restart/upgrade (Gitea uses default values)
### Workaround
*[How to work around this quirk.]*
1. Always GET the current protection first and use **exact** field names from the
GET response when modifying
2. After any `POST /branch_protections`, immediately GET and verify
`enable_merge_whitelist: true` and `merge_whitelist_usernames` contains
`["devops-engineer", "hongming", "core-devops"]`
3. The queue bot should verify branch protection before each merge tick
4. For queue to work: `enable_merge_whitelist: true` +
`merge_whitelist_usernames: ["devops-engineer", "hongming", "core-devops"]` +
`enable_status_check: true` + `status_check_contexts: ["CI / all-required"]`
### References
- internal#[N]: first observation
- SEV-1 2026-05-17: 3x branch protection resets caused 405 on all queue merges
- `feedback_gitea_branch_protection_api_field_names`
---
## Quirk #8 — TBD
*[Placeholder — document here when a new Gitea Actions quirk is discovered.]*
## Quirk #8 — Scheduled workflow with `cancel-in-progress: false` causes scheduler freeze
### Finding
*[What Gitea Actions does differently from GitHub Actions.]*
When a `schedule:` workflow has `concurrency.cancel-in-progress: false`, and a
new cron tick fires while the previous run is still executing, the Gitea Actions
scheduler stops dispatching the workflow entirely. Pending entries accumulate
indefinitely — the scheduler shows the workflow as "scheduled" but never dispatches.
This is dangerous for workflows with variable execution time (e.g., workflows that
wait for downstream CI, or workflows that run on slow/degraded runners).
### Impact
*[Which workflows or operations are affected.]*
- `gitea-merge-queue.yml` with `cancel-in-progress: false` froze on 2026-05-17
starting ~16:44Z — pending runs accumulated, no new runs dispatched
- Queue appeared stalled; all 22 queued PRs blocked
- The `gitea-merge-queue` workflow itself becomes invisible to operators
### Workaround
*[How to work around this quirk.]*
**Always set `cancel-in-progress: true` on `schedule:` workflows:**
```yaml
concurrency:
group: workflow-name
cancel-in-progress: true # ← always true for schedule: workflows
```
If the freeze has already occurred: the scheduler recovers automatically after the
currently-running instance completes (Gitea dispatches the next queued tick).
### References
- internal#[N]: first observation
- SEV-1 2026-05-17: queue frozen since 16:44Z; fixed by setting `cancel-in-progress: true`
- PR #1358: `fix(scheduled-workflows): enable cancel-in-progress` (pending merge)
---
## Quirk #9 — TBD
*[Placeholder — document here when a new Gitea Actions quirk is discovered.]*
## Quirk #9 — Gitea Actions runner accepts runs but stalls (jobs never start)
### Finding
*[What Gitea Actions does differently from GitHub Actions.]*
The Gitea Actions runner on host `5.78.80.188` can enter a degraded state where:
1. It accepts new workflow runs (shows "in_progress" in the UI)
2. It never starts any jobs — pending count grows indefinitely
3. The runner shows as "online" and accepting runs
4. After ~6090 minutes, the runner self-recovers and all pending jobs start
This is distinct from a true runner crash (which would show as offline).
### Impact
*[Which workflows or operations are affected.]*
- All CI jobs for all PRs stall — no status updates posted
- Queue waits indefinitely for CI (which never posts success)
- `sop-checklist` and other workflows time out on affected PRs
- Looks like the runner is working (green in UI) but nothing executes
### How to diagnose
Add a debug step to a known-failing workflow:
```bash
# In a stalled job:
curl -s http://localhost:8088/debug/pprof/trace?seconds=5 | head
# Check runner process CPU — if near 0% while jobs are pending, runner is stalled
```
Check runner logs on the host (`/var/log/actrunner.log` or similar).
### Workaround
*[How to work around this quirk.]*
No operator workaround while stalled — the runner self-recovers. Options:
1. **Wait** — runner typically recovers within 90 minutes
2. **Restart the runner service**`systemctl restart act_runner` (requires host access)
3. **Move to a second runner** — if registered, re-route dispatch
### References
- internal#[N]: first observation
- SEV-1 2026-05-17: runner stalled; self-recovered ~21:33Z after ~90 min
- `feedback_gitea_runner_stall_accepted_jobs_no_execution`
---