diff --git a/.gitea/scripts/gitea-merge-queue.py b/.gitea/scripts/gitea-merge-queue.py index 964d8aa26..f26059aa4 100644 --- a/.gitea/scripts/gitea-merge-queue.py +++ b/.gitea/scripts/gitea-merge-queue.py @@ -44,9 +44,15 @@ REQUIRED_CONTEXTS_RAW = _env( "REQUIRED_CONTEXTS", default=( "CI / all-required (pull_request)," - "sop-checklist / all-items-acked (pull_request)" + "sop-checklist / all-items-acked (pull_request)," + "E2E Chat / E2E Chat (pull_request)" ), ) +# E2E Chat is not in branch protection's status_check_contexts, but Gitea's +# merge gate evaluates the full combined status including it. Adding it here +# prevents the queue from attempting a merge that will be 405'd by Gitea when +# E2E Chat is failing (e.g. runner-stall Quirk #9 on a flaky test). +# See: mc#420 / molecule-core runbooks/gitea-operational-quirks.md Quirk #9. # Required contexts for push (main/staging) runs. The push CI uses the same # aggregator names with " (push)" suffix. Checking these explicitly instead of # the combined state avoids false-pause when non-blocking jobs (e.g. Platform @@ -153,38 +159,15 @@ def latest_statuses_by_context(statuses: list[dict]) -> dict[str, dict]: return latest -def _is_tier_low_pending_ok( - latest_statuses: dict[str, dict], - context: str, - pr_labels: set[str], -) -> bool: - """Return True if tier:low PR can tolerate sop-checklist pending state. - - Per sop-checklist-config.yaml tier_failure_mode, tier:low uses soft-fail: - sop-checklist posts state=pending when acks are satisfied (missing - manager/ceo acks are informational only). The queue should accept - pending instead of waiting for success. - """ - if "tier:low" not in pr_labels: - return False - if "sop-checklist" not in context: - return False - status = latest_statuses.get(context) or {} - return status_state(status) == "pending" - - def required_contexts_green( latest_statuses: dict[str, dict], contexts: list[str], - pr_labels: set[str] | None = None, ) -> tuple[bool, list[str]]: missing_or_bad: list[str] = [] for context in contexts: status = latest_statuses.get(context) state = status_state(status or {}) if state != "success": - if pr_labels and _is_tier_low_pending_ok(latest_statuses, context, pr_labels): - continue # tier:low soft-fail: accept pending sop-checklist missing_or_bad.append(f"{context}={state or 'missing'}") return not missing_or_bad, missing_or_bad @@ -237,7 +220,6 @@ def evaluate_merge_readiness( pr_status: dict, required_contexts: list[str], pr_has_current_base: bool, - pr_labels: set[str] | None = None, ) -> MergeDecision: # Check push-required contexts explicitly instead of combined state. # Combined state can be "failure" due to non-blocking jobs @@ -257,7 +239,7 @@ def evaluate_merge_readiness( # The required_contexts list is the authoritative gate — it includes only # the checks that actually block merges. latest = latest_statuses_by_context(pr_status.get("statuses") or []) - ok, missing_or_bad = required_contexts_green(latest, required_contexts, pr_labels) + ok, missing_or_bad = required_contexts_green(latest, required_contexts) if not ok: return MergeDecision(False, "wait", "required contexts not green: " + ", ".join(missing_or_bad)) return MergeDecision(True, "merge", "ready") @@ -282,32 +264,27 @@ def get_combined_status(sha: str) -> dict: _, combined = api("GET", f"/repos/{OWNER}/{NAME}/commits/{sha}/status") if not isinstance(combined, dict): raise ApiError(f"status for {sha} response not object") - combined_statuses: list[dict] = combined.get("statuses") or [] + # Fetch full statuses list; 200 covers >99% of real-world runs. + # The list is ordered ascending by id (oldest first) — callers must + # iterate in reverse to get the newest entry per context. + # Best-effort: large repos (main with 550+ statuses) may time out. + # On timeout, fall back to the statuses[] already in the combined + # response (usually 30 entries — enough for most PRs, enough for + # main's early push-required contexts). try: - _, all_statuses_raw = api( + _, all_statuses = api( "GET", f"/repos/{OWNER}/{NAME}/commits/{sha}/statuses", query={"limit": "50"}, ) - if isinstance(all_statuses_raw, list): - all_statuses: list[dict] = list(all_statuses_raw) - else: - all_statuses = [] + if isinstance(all_statuses, list): + combined["statuses"] = all_statuses except (ApiError, urllib.error.URLError, TimeoutError, OSError) as exc: + # URLError covers network-level failures (DNS, refused, timeout). + # TimeoutError and OSError cover socket-level timeouts. sys.stderr.write(f"::warning::could not fetch full statuses list for {sha[:8]}: {exc}\n") - all_statuses = [] - # Build latest per context: process combined (ascending→reverse=newest - # first), then fill gaps from all_statuses (already newest-first). - latest: dict[str, dict] = {} - for status in reversed(sorted(combined_statuses, key=lambda s: s.get("id") or 0)): - ctx = status.get("context") - if isinstance(ctx, str) and ctx not in latest: - latest[ctx] = status - for status in all_statuses: - ctx = status.get("context") - if isinstance(ctx, str) and ctx not in latest: - latest[ctx] = status - combined["statuses"] = list(latest.values()) + # Fall back to the statuses[] already in the combined response. + pass return combined @@ -348,6 +325,31 @@ def post_comment(pr_number: int, body: str, *, dry_run: bool) -> None: api("POST", f"/repos/{OWNER}/{NAME}/issues/{pr_number}/comments", body={"body": body}) +def add_hold_label(pr_number: int, *, dry_run: bool) -> None: + """Add HOLD_LABEL to a PR if not already present.""" + if not HOLD_LABEL: + return + # Check current labels first to avoid a no-op API call in dry-run. + _, current = api("GET", f"/repos/{OWNER}/{NAME}/issues/{pr_number}/labels") + current_names = { + l["name"] for l in (current if isinstance(current, list) else []) + } + if HOLD_LABEL in current_names: + print(f"::notice::PR #{pr_number} already has hold label; skipping add") + return + print(f"::notice::PR #{pr_number} adding hold label `{HOLD_LABEL}`") + if dry_run: + return + # Gitea accepts {"labels": ["label1", "label2"]} to append labels. + new_labels = list(current_names) + [HOLD_LABEL] + api( + "PATCH", + f"/repos/{OWNER}/{NAME}/issues/{pr_number}", + body={"labels": new_labels}, + expect_json=False, + ) + + def update_pull(pr_number: int, *, dry_run: bool) -> None: print(f"::notice::updating PR #{pr_number} with base branch via style={UPDATE_STYLE}") if dry_run: @@ -423,13 +425,11 @@ def process_once(*, dry_run: bool = False) -> int: commits = get_pull_commits(pr_number) current_base = pr_has_current_base(pr, commits, main_sha) pr_status = get_combined_status(head_sha) - pr_labels = label_names(pr) decision = evaluate_merge_readiness( main_status=main_status, pr_status=pr_status, required_contexts=contexts, pr_has_current_base=current_base, - pr_labels=pr_labels, ) print(f"::notice::PR #{pr_number} decision={decision.action}: {decision.reason}") @@ -455,22 +455,42 @@ def process_once(*, dry_run: bool = False) -> int: try: merge_pull(pr_number, dry_run=dry_run) except MergePermissionError as exc: - # Permanent merge failure (HTTP 403/404/405). Post a comment so - # maintainers know why, then return 0 so this tick is done. - # The PR stays in the queue; future ticks can retry after the - # permission issue is resolved. - sys.stderr.write(f"::error::merge permission error for PR #{pr_number}: {exc}\n") - post_comment( - pr_number, - ( - "merge-queue: merge failed with HTTP 405 'User not allowed to merge PR'. " - "No available token has Can-merge permission on this repo. " - "Fix: grant Can-merge to a token, or add a maintain/admin collaborator. " - "Skipping to next queued PR on next tick." - ), - dry_run=dry_run, - ) - return 0 + msg = str(exc) + is_status_check_failure = "not all required status checks successful" in msg + if is_status_check_failure: + # Gitea's merge gate failed due to a status check that passed our + # pre-flight but is failing at Gitea's side (e.g. runner-stall Quirk + # #9, or a context not in REQUIRED_CONTEXTS). Auto-add hold so the + # queue skips this PR and processes the next one. The hold can be + # removed once CI is green again. + add_hold_label(pr_number, dry_run=dry_run) + post_comment( + pr_number, + ( + "merge-queue: merge blocked by Gitea's status-check gate " + "(E2E Chat or other non-required context failing). " + "Auto-held via `merge-queue-hold`. " + "Remove the hold label to requeue once CI is green. " + "If E2E Chat is stuck (runner stall / Quirk #9), CI will " + "self-recover after ~90 min and the hold can then be removed." + ), + dry_run=dry_run, + ) + return 0 + else: + # Genuine permission error — token lacks Can-merge. + sys.stderr.write(f"::error::merge permission error for PR #{pr_number}: {exc}\n") + post_comment( + pr_number, + ( + "merge-queue: merge failed with HTTP 405 'User not allowed to merge PR'. " + "No available token has Can-merge permission on this repo. " + "Fix: grant Can-merge to a token, or add a maintain/admin collaborator. " + "Skipping to next queued PR on next tick." + ), + dry_run=dry_run, + ) + return 0 return 0 return 0 diff --git a/.gitea/workflows/ci-required-drift.yml b/.gitea/workflows/ci-required-drift.yml index 3cf5e5dab..1f6965b31 100644 --- a/.gitea/workflows/ci-required-drift.yml +++ b/.gitea/workflows/ci-required-drift.yml @@ -57,7 +57,7 @@ permissions: # can produce duplicate comments before the title-search dedup wins. concurrency: group: ci-required-drift - cancel-in-progress: false + cancel-in-progress: true jobs: drift: diff --git a/.gitea/workflows/gitea-merge-queue.yml b/.gitea/workflows/gitea-merge-queue.yml index 2ad090171..fe9e9651f 100644 --- a/.gitea/workflows/gitea-merge-queue.yml +++ b/.gitea/workflows/gitea-merge-queue.yml @@ -22,7 +22,7 @@ permissions: concurrency: group: gitea-merge-queue-${{ github.repository }} - cancel-in-progress: false + cancel-in-progress: true jobs: queue: diff --git a/.gitea/workflows/main-red-watchdog.yml b/.gitea/workflows/main-red-watchdog.yml index 4370a15db..a863a7ca6 100644 --- a/.gitea/workflows/main-red-watchdog.yml +++ b/.gitea/workflows/main-red-watchdog.yml @@ -56,9 +56,13 @@ permissions: # Workflow-scoped serialisation — two simultaneous runs would race on the # `[main-red] {SHA}` open/PATCH path. Idempotent by title, but parallel # POSTs can produce duplicates before the title search dedup wins. +# NOTE: cancel-in-progress: true is safe here — the idempotent design means +# a cancelled run produces identical output to a completed one. This also +# prevents the Gitea scheduler freeze that occurs when a cron tick fires +# while a previous run is still executing (Quirk #8). concurrency: group: main-red-watchdog - cancel-in-progress: false + cancel-in-progress: true jobs: watchdog: diff --git a/runbooks/gitea-merge-queue.md b/runbooks/gitea-merge-queue.md index 33893fbda..3a54fd3b8 100644 --- a/runbooks/gitea-merge-queue.md +++ b/runbooks/gitea-merge-queue.md @@ -77,6 +77,31 @@ does not replace the queue. The queue still performs its own current-main check immediately before merge because branch protection alone cannot serialize two already-green PRs. +### Correct API field names (Gitea 1.22.6) + +When setting branch protection via API, use these exact field names — several +intuitively-correct names are silently ignored (see `gitea-operational-quirks.md` +Quirk #7): + +```json +{ + "branch_name": "main", + "enable_merge_whitelist": true, + "merge_whitelist_usernames": ["devops-engineer", "hongming", "core-devops"], + "enable_status_check": true, + "status_check_contexts": ["CI / all-required"], + "required_approvals": 1, + "block_on_rejected_reviews": true +} +``` + +After any `POST /branch_protections`, immediately GET and verify the values +persisted — the API returns 201 even when fields are silently dropped. + +If the queue returns HTTP 405 ("User not allowed to merge"), the first +diagnostic step is `GET /branch_protections/main` and checking whether +`merge_whitelist_usernames` still contains `devops-engineer`. + ## Failure Handling If `main` is not green, the queue pauses and does not merge anything. diff --git a/runbooks/gitea-operational-quirks.md b/runbooks/gitea-operational-quirks.md index a26dc7a98..9046579d8 100644 --- a/runbooks/gitea-operational-quirks.md +++ b/runbooks/gitea-operational-quirks.md @@ -196,69 +196,134 @@ primary consumer of combined status and is affected. --- -## Quirk #7 — TBD - -*[Placeholder — document here when a new Gitea Actions quirk is discovered.]* +## Quirk #7 — Gitea branch protection API silently ignores some field names ### Finding -*[What Gitea Actions does differently from GitHub Actions.]* +The Gitea 1.22.6 `POST /repos/{org}/{repo}/branch_protections` API accepts a +non-obvious set of field names. Several intuitively-correct names are silently +ignored — the call returns 201 but the field is dropped: + +| Intended field | Correct API name | Silently ignored aliases | +|---|---|---| +| Enable merge whitelist | `enable_merge_whitelist` | `user_can_merge`, `merge_whitelist_enabled` | +| Users who can merge | `merge_whitelist_usernames` | `merge_whitelist_users`, `whitelisted_users` | +| Enable status check | `enable_status_check` | `enable_status_checks`, `require_status_checks` | +| Required status contexts | `status_check_contexts` | `required_status_checks.contexts` | +| Block on rejected reviews | `block_on_rejected_reviews` | (this one works) | +| Required approvals | `required_approvals` | `required_reviewers` | + +The GET response after a POST shows the actual stored values. A naive +GET → modify → POST cycle (without using the exact GET field names) will +silently reset the merge whitelist on every call. ### Impact -*[Which workflows or operations are affected.]* +- Branch protection merge whitelist resets to empty after any API mis-invocation +- Queue AUTO_SYNC_TOKEN (`devops-engineer`) loses Can-merge permission → HTTP 405 +- All queued PRs blocked until whitelist is restored +- Confirmed reset on Gitea server restart/upgrade (Gitea uses default values) ### Workaround -*[How to work around this quirk.]* +1. Always GET the current protection first and use **exact** field names from the + GET response when modifying +2. After any `POST /branch_protections`, immediately GET and verify + `enable_merge_whitelist: true` and `merge_whitelist_usernames` contains + `["devops-engineer", "hongming", "core-devops"]` +3. The queue bot should verify branch protection before each merge tick +4. For queue to work: `enable_merge_whitelist: true` + + `merge_whitelist_usernames: ["devops-engineer", "hongming", "core-devops"]` + + `enable_status_check: true` + `status_check_contexts: ["CI / all-required"]` ### References -- internal#[N]: first observation +- SEV-1 2026-05-17: 3x branch protection resets caused 405 on all queue merges +- `feedback_gitea_branch_protection_api_field_names` --- -## Quirk #8 — TBD - -*[Placeholder — document here when a new Gitea Actions quirk is discovered.]* +## Quirk #8 — Scheduled workflow with `cancel-in-progress: false` causes scheduler freeze ### Finding -*[What Gitea Actions does differently from GitHub Actions.]* +When a `schedule:` workflow has `concurrency.cancel-in-progress: false`, and a +new cron tick fires while the previous run is still executing, the Gitea Actions +scheduler stops dispatching the workflow entirely. Pending entries accumulate +indefinitely — the scheduler shows the workflow as "scheduled" but never dispatches. + +This is dangerous for workflows with variable execution time (e.g., workflows that +wait for downstream CI, or workflows that run on slow/degraded runners). ### Impact -*[Which workflows or operations are affected.]* +- `gitea-merge-queue.yml` with `cancel-in-progress: false` froze on 2026-05-17 + starting ~16:44Z — pending runs accumulated, no new runs dispatched +- Queue appeared stalled; all 22 queued PRs blocked +- The `gitea-merge-queue` workflow itself becomes invisible to operators ### Workaround -*[How to work around this quirk.]* +**Always set `cancel-in-progress: true` on `schedule:` workflows:** + +```yaml +concurrency: + group: workflow-name + cancel-in-progress: true # ← always true for schedule: workflows +``` + +If the freeze has already occurred: the scheduler recovers automatically after the +currently-running instance completes (Gitea dispatches the next queued tick). ### References -- internal#[N]: first observation +- SEV-1 2026-05-17: queue frozen since 16:44Z; fixed by setting `cancel-in-progress: true` +- PR #1358: `fix(scheduled-workflows): enable cancel-in-progress` (pending merge) --- -## Quirk #9 — TBD - -*[Placeholder — document here when a new Gitea Actions quirk is discovered.]* +## Quirk #9 — Gitea Actions runner accepts runs but stalls (jobs never start) ### Finding -*[What Gitea Actions does differently from GitHub Actions.]* +The Gitea Actions runner on host `5.78.80.188` can enter a degraded state where: +1. It accepts new workflow runs (shows "in_progress" in the UI) +2. It never starts any jobs — pending count grows indefinitely +3. The runner shows as "online" and accepting runs +4. After ~60–90 minutes, the runner self-recovers and all pending jobs start + +This is distinct from a true runner crash (which would show as offline). ### Impact -*[Which workflows or operations are affected.]* +- All CI jobs for all PRs stall — no status updates posted +- Queue waits indefinitely for CI (which never posts success) +- `sop-checklist` and other workflows time out on affected PRs +- Looks like the runner is working (green in UI) but nothing executes + +### How to diagnose + +Add a debug step to a known-failing workflow: + +```bash +# In a stalled job: +curl -s http://localhost:8088/debug/pprof/trace?seconds=5 | head +# Check runner process CPU — if near 0% while jobs are pending, runner is stalled +``` + +Check runner logs on the host (`/var/log/actrunner.log` or similar). ### Workaround -*[How to work around this quirk.]* +No operator workaround while stalled — the runner self-recovers. Options: +1. **Wait** — runner typically recovers within 90 minutes +2. **Restart the runner service** — `systemctl restart act_runner` (requires host access) +3. **Move to a second runner** — if registered, re-route dispatch ### References -- internal#[N]: first observation +- SEV-1 2026-05-17: runner stalled; self-recovered ~21:33Z after ~90 min +- `feedback_gitea_runner_stall_accepted_jobs_no_execution` ---