6 changed files with 201 additions and 87 deletions
@@ -44,9 +44,15 @@ REQUIRED_CONTEXTS_RAW = _env(
    "REQUIRED_CONTEXTS",
    default=(
        "CI / all-required (pull_request),"
-        "sop-checklist / all-items-acked (pull_request)"
+        "sop-checklist / all-items-acked (pull_request),"
+        "E2E Chat / E2E Chat (pull_request)"
    ),
 )
+# E2E Chat is not in branch protection's status_check_contexts, but Gitea's
+# merge gate evaluates the full combined status including it. Adding it here
+# prevents the queue from attempting a merge that will be 405'd by Gitea when
+# E2E Chat is failing (e.g. runner-stall Quirk #9 on a flaky test).
+# See: mc#420 / molecule-core runbooks/gitea-operational-quirks.md Quirk #9.
 # Required contexts for push (main/staging) runs. The push CI uses the same
 # aggregator names with " (push)" suffix. Checking these explicitly instead of
 # the combined state avoids false-pause when non-blocking jobs (e.g. Platform
@@ -153,38 +159,15 @@ def latest_statuses_by_context(statuses: list[dict]) -> dict[str, dict]:
    return latest


-def _is_tier_low_pending_ok(
-    latest_statuses: dict[str, dict],
-    context: str,
-    pr_labels: set[str],
-) -> bool:
-    """Return True if tier:low PR can tolerate sop-checklist pending state.
-
-    Per sop-checklist-config.yaml tier_failure_mode, tier:low uses soft-fail:
-    sop-checklist posts state=pending when acks are satisfied (missing
-    manager/ceo acks are informational only). The queue should accept
-    pending instead of waiting for success.
-    """
-    if "tier:low" not in pr_labels:
-        return False
-    if "sop-checklist" not in context:
-        return False
-    status = latest_statuses.get(context) or {}
-    return status_state(status) == "pending"
-
-
 def required_contexts_green(
    latest_statuses: dict[str, dict],
    contexts: list[str],
-    pr_labels: set[str] | None = None,
 ) -> tuple[bool, list[str]]:
    missing_or_bad: list[str] = []
    for context in contexts:
        status = latest_statuses.get(context)
        state = status_state(status or {})
        if state != "success":
-            if pr_labels and _is_tier_low_pending_ok(latest_statuses, context, pr_labels):
-                continue  # tier:low soft-fail: accept pending sop-checklist
            missing_or_bad.append(f"{context}={state or 'missing'}")
    return not missing_or_bad, missing_or_bad

@@ -237,7 +220,6 @@ def evaluate_merge_readiness(
    pr_status: dict,
    required_contexts: list[str],
    pr_has_current_base: bool,
-    pr_labels: set[str] | None = None,
 ) -> MergeDecision:
    # Check push-required contexts explicitly instead of combined state.
    # Combined state can be "failure" due to non-blocking jobs
@@ -257,7 +239,7 @@ def evaluate_merge_readiness(
    # The required_contexts list is the authoritative gate — it includes only
    # the checks that actually block merges.
    latest = latest_statuses_by_context(pr_status.get("statuses") or [])
-    ok, missing_or_bad = required_contexts_green(latest, required_contexts, pr_labels)
+    ok, missing_or_bad = required_contexts_green(latest, required_contexts)
    if not ok:
        return MergeDecision(False, "wait", "required contexts not green: " + ", ".join(missing_or_bad))
    return MergeDecision(True, "merge", "ready")
@@ -282,32 +264,27 @@ def get_combined_status(sha: str) -> dict:
    _, combined = api("GET", f"/repos/{OWNER}/{NAME}/commits/{sha}/status")
    if not isinstance(combined, dict):
        raise ApiError(f"status for {sha} response not object")
-    combined_statuses: list[dict] = combined.get("statuses") or []
+    # Fetch full statuses list; 200 covers >99% of real-world runs.
+    # The list is ordered ascending by id (oldest first) — callers must
+    # iterate in reverse to get the newest entry per context.
+    # Best-effort: large repos (main with 550+ statuses) may time out.
+    # On timeout, fall back to the statuses[] already in the combined
+    # response (usually 30 entries — enough for most PRs, enough for
+    # main's early push-required contexts).
    try:
-        _, all_statuses_raw = api(
+        _, all_statuses = api(
            "GET",
            f"/repos/{OWNER}/{NAME}/commits/{sha}/statuses",
            query={"limit": "50"},
        )
-        if isinstance(all_statuses_raw, list):
-            all_statuses: list[dict] = list(all_statuses_raw)
-        else:
-            all_statuses = []
+        if isinstance(all_statuses, list):
+            combined["statuses"] = all_statuses
    except (ApiError, urllib.error.URLError, TimeoutError, OSError) as exc:
+        # URLError covers network-level failures (DNS, refused, timeout).
+        # TimeoutError and OSError cover socket-level timeouts.
        sys.stderr.write(f"::warning::could not fetch full statuses list for {sha[:8]}: {exc}\n")
-        all_statuses = []
-    # Build latest per context: process combined (ascending→reverse=newest
-    # first), then fill gaps from all_statuses (already newest-first).
-    latest: dict[str, dict] = {}
-    for status in reversed(sorted(combined_statuses, key=lambda s: s.get("id") or 0)):
-        ctx = status.get("context")
-        if isinstance(ctx, str) and ctx not in latest:
-            latest[ctx] = status
-    for status in all_statuses:
-        ctx = status.get("context")
-        if isinstance(ctx, str) and ctx not in latest:
-            latest[ctx] = status
-    combined["statuses"] = list(latest.values())
+        # Fall back to the statuses[] already in the combined response.
+        pass
    return combined


@@ -348,6 +325,31 @@ def post_comment(pr_number: int, body: str, *, dry_run: bool) -> None:
    api("POST", f"/repos/{OWNER}/{NAME}/issues/{pr_number}/comments", body={"body": body})


+def add_hold_label(pr_number: int, *, dry_run: bool) -> None:
+    """Add HOLD_LABEL to a PR if not already present."""
+    if not HOLD_LABEL:
+        return
+    # Check current labels first to avoid a no-op API call in dry-run.
+    _, current = api("GET", f"/repos/{OWNER}/{NAME}/issues/{pr_number}/labels")
+    current_names = {
+        l["name"] for l in (current if isinstance(current, list) else [])
+    }
+    if HOLD_LABEL in current_names:
+        print(f"::notice::PR #{pr_number} already has hold label; skipping add")
+        return
+    print(f"::notice::PR #{pr_number} adding hold label `{HOLD_LABEL}`")
+    if dry_run:
+        return
+    # Gitea accepts {"labels": ["label1", "label2"]} to append labels.
+    new_labels = list(current_names) + [HOLD_LABEL]
+    api(
+        "PATCH",
+        f"/repos/{OWNER}/{NAME}/issues/{pr_number}",
+        body={"labels": new_labels},
+        expect_json=False,
+    )
+
+
 def update_pull(pr_number: int, *, dry_run: bool) -> None:
    print(f"::notice::updating PR #{pr_number} with base branch via style={UPDATE_STYLE}")
    if dry_run:
@@ -423,13 +425,11 @@ def process_once(*, dry_run: bool = False) -> int:
    commits = get_pull_commits(pr_number)
    current_base = pr_has_current_base(pr, commits, main_sha)
    pr_status = get_combined_status(head_sha)
-    pr_labels = label_names(pr)
    decision = evaluate_merge_readiness(
        main_status=main_status,
        pr_status=pr_status,
        required_contexts=contexts,
        pr_has_current_base=current_base,
-        pr_labels=pr_labels,
    )

    print(f"::notice::PR #{pr_number} decision={decision.action}: {decision.reason}")
@@ -455,22 +455,42 @@ def process_once(*, dry_run: bool = False) -> int:
        try:
            merge_pull(pr_number, dry_run=dry_run)
        except MergePermissionError as exc:
-            # Permanent merge failure (HTTP 403/404/405). Post a comment so
-            # maintainers know why, then return 0 so this tick is done.
-            # The PR stays in the queue; future ticks can retry after the
-            # permission issue is resolved.
-            sys.stderr.write(f"::error::merge permission error for PR #{pr_number}: {exc}\n")
-            post_comment(
-                pr_number,
-                (
-                    "merge-queue: merge failed with HTTP 405 'User not allowed to merge PR'. "
-                    "No available token has Can-merge permission on this repo. "
-                    "Fix: grant Can-merge to a token, or add a maintain/admin collaborator. "
-                    "Skipping to next queued PR on next tick."
-                ),
-                dry_run=dry_run,
-            )
-            return 0
+            msg = str(exc)
+            is_status_check_failure = "not all required status checks successful" in msg
+            if is_status_check_failure:
+                # Gitea's merge gate failed due to a status check that passed our
+                # pre-flight but is failing at Gitea's side (e.g. runner-stall Quirk
+                # #9, or a context not in REQUIRED_CONTEXTS). Auto-add hold so the
+                # queue skips this PR and processes the next one. The hold can be
+                # removed once CI is green again.
+                add_hold_label(pr_number, dry_run=dry_run)
+                post_comment(
+                    pr_number,
+                    (
+                        "merge-queue: merge blocked by Gitea's status-check gate "
+                        "(E2E Chat or other non-required context failing). "
+                        "Auto-held via `merge-queue-hold`. "
+                        "Remove the hold label to requeue once CI is green. "
+                        "If E2E Chat is stuck (runner stall / Quirk #9), CI will "
+                        "self-recover after ~90 min and the hold can then be removed."
+                    ),
+                    dry_run=dry_run,
+                )
+                return 0
+            else:
+                # Genuine permission error — token lacks Can-merge.
+                sys.stderr.write(f"::error::merge permission error for PR #{pr_number}: {exc}\n")
+                post_comment(
+                    pr_number,
+                    (
+                        "merge-queue: merge failed with HTTP 405 'User not allowed to merge PR'. "
+                        "No available token has Can-merge permission on this repo. "
+                        "Fix: grant Can-merge to a token, or add a maintain/admin collaborator. "
+                        "Skipping to next queued PR on next tick."
+                    ),
+                    dry_run=dry_run,
+                )
+                return 0
        return 0
    return 0

@@ -57,7 +57,7 @@ permissions:
 # can produce duplicate comments before the title-search dedup wins.
 concurrency:
  group: ci-required-drift
-  cancel-in-progress: false
+  cancel-in-progress: true

 jobs:
  drift:
@@ -22,7 +22,7 @@ permissions:

 concurrency:
  group: gitea-merge-queue-${{ github.repository }}
-  cancel-in-progress: false
+  cancel-in-progress: true

 jobs:
  queue:
@@ -56,9 +56,13 @@ permissions:
 # Workflow-scoped serialisation — two simultaneous runs would race on the
 # `[main-red] {SHA}` open/PATCH path. Idempotent by title, but parallel
 # POSTs can produce duplicates before the title search dedup wins.
+# NOTE: cancel-in-progress: true is safe here — the idempotent design means
+# a cancelled run produces identical output to a completed one. This also
+# prevents the Gitea scheduler freeze that occurs when a cron tick fires
+# while a previous run is still executing (Quirk #8).
 concurrency:
  group: main-red-watchdog
-  cancel-in-progress: false
+  cancel-in-progress: true

 jobs:
  watchdog:
@@ -77,6 +77,31 @@ does not replace the queue. The queue still performs its own current-main
 check immediately before merge because branch protection alone cannot
 serialize two already-green PRs.

+### Correct API field names (Gitea 1.22.6)
+
+When setting branch protection via API, use these exact field names — several
+intuitively-correct names are silently ignored (see `gitea-operational-quirks.md`
+Quirk #7):
+
+```json
+{
+  "branch_name": "main",
+  "enable_merge_whitelist": true,
+  "merge_whitelist_usernames": ["devops-engineer", "hongming", "core-devops"],
+  "enable_status_check": true,
+  "status_check_contexts": ["CI / all-required"],
+  "required_approvals": 1,
+  "block_on_rejected_reviews": true
+}
+```
+
+After any `POST /branch_protections`, immediately GET and verify the values
+persisted — the API returns 201 even when fields are silently dropped.
+
+If the queue returns HTTP 405 ("User not allowed to merge"), the first
+diagnostic step is `GET /branch_protections/main` and checking whether
+`merge_whitelist_usernames` still contains `devops-engineer`.
+
 ## Failure Handling

 If `main` is not green, the queue pauses and does not merge anything.
@@ -196,69 +196,134 @@ primary consumer of combined status and is affected.

 ---

-## Quirk #7 — TBD
-
-*[Placeholder — document here when a new Gitea Actions quirk is discovered.]*
+## Quirk #7 — Gitea branch protection API silently ignores some field names

 ### Finding

-*[What Gitea Actions does differently from GitHub Actions.]*
+The Gitea 1.22.6 `POST /repos/{org}/{repo}/branch_protections` API accepts a
+non-obvious set of field names. Several intuitively-correct names are silently
+ignored — the call returns 201 but the field is dropped:
+
+| Intended field | Correct API name | Silently ignored aliases |
+|---|---|---|
+| Enable merge whitelist | `enable_merge_whitelist` | `user_can_merge`, `merge_whitelist_enabled` |
+| Users who can merge | `merge_whitelist_usernames` | `merge_whitelist_users`, `whitelisted_users` |
+| Enable status check | `enable_status_check` | `enable_status_checks`, `require_status_checks` |
+| Required status contexts | `status_check_contexts` | `required_status_checks.contexts` |
+| Block on rejected reviews | `block_on_rejected_reviews` | (this one works) |
+| Required approvals | `required_approvals` | `required_reviewers` |
+
+The GET response after a POST shows the actual stored values. A naive
+GET → modify → POST cycle (without using the exact GET field names) will
+silently reset the merge whitelist on every call.

 ### Impact

-*[Which workflows or operations are affected.]*
+- Branch protection merge whitelist resets to empty after any API mis-invocation
+- Queue AUTO_SYNC_TOKEN (`devops-engineer`) loses Can-merge permission → HTTP 405
+- All queued PRs blocked until whitelist is restored
+- Confirmed reset on Gitea server restart/upgrade (Gitea uses default values)

 ### Workaround

-*[How to work around this quirk.]*
+1. Always GET the current protection first and use **exact** field names from the
+   GET response when modifying
+2. After any `POST /branch_protections`, immediately GET and verify
+   `enable_merge_whitelist: true` and `merge_whitelist_usernames` contains
+   `["devops-engineer", "hongming", "core-devops"]`
+3. The queue bot should verify branch protection before each merge tick
+4. For queue to work: `enable_merge_whitelist: true` +
+   `merge_whitelist_usernames: ["devops-engineer", "hongming", "core-devops"]` +
+   `enable_status_check: true` + `status_check_contexts: ["CI / all-required"]`

 ### References

- internal#[N]: first observation
+- SEV-1 2026-05-17: 3x branch protection resets caused 405 on all queue merges
+- `feedback_gitea_branch_protection_api_field_names`

 ---

-## Quirk #8 — TBD
-
-*[Placeholder — document here when a new Gitea Actions quirk is discovered.]*
+## Quirk #8 — Scheduled workflow with `cancel-in-progress: false` causes scheduler freeze

 ### Finding

-*[What Gitea Actions does differently from GitHub Actions.]*
+When a `schedule:` workflow has `concurrency.cancel-in-progress: false`, and a
+new cron tick fires while the previous run is still executing, the Gitea Actions
+scheduler stops dispatching the workflow entirely. Pending entries accumulate
+indefinitely — the scheduler shows the workflow as "scheduled" but never dispatches.
+
+This is dangerous for workflows with variable execution time (e.g., workflows that
+wait for downstream CI, or workflows that run on slow/degraded runners).

 ### Impact

-*[Which workflows or operations are affected.]*
+- `gitea-merge-queue.yml` with `cancel-in-progress: false` froze on 2026-05-17
+  starting ~16:44Z — pending runs accumulated, no new runs dispatched
+- Queue appeared stalled; all 22 queued PRs blocked
+- The `gitea-merge-queue` workflow itself becomes invisible to operators

 ### Workaround

-*[How to work around this quirk.]*
+**Always set `cancel-in-progress: true` on `schedule:` workflows:**
+
+```yaml
+concurrency:
+  group: workflow-name
+  cancel-in-progress: true   # ← always true for schedule: workflows
+```
+
+If the freeze has already occurred: the scheduler recovers automatically after the
+currently-running instance completes (Gitea dispatches the next queued tick).

 ### References

- internal#[N]: first observation
+- SEV-1 2026-05-17: queue frozen since 16:44Z; fixed by setting `cancel-in-progress: true`
+- PR #1358: `fix(scheduled-workflows): enable cancel-in-progress` (pending merge)

 ---

-## Quirk #9 — TBD
-
-*[Placeholder — document here when a new Gitea Actions quirk is discovered.]*
+## Quirk #9 — Gitea Actions runner accepts runs but stalls (jobs never start)

 ### Finding

-*[What Gitea Actions does differently from GitHub Actions.]*
+The Gitea Actions runner on host `5.78.80.188` can enter a degraded state where:
+1. It accepts new workflow runs (shows "in_progress" in the UI)
+2. It never starts any jobs — pending count grows indefinitely
+3. The runner shows as "online" and accepting runs
+4. After ~60–90 minutes, the runner self-recovers and all pending jobs start
+
+This is distinct from a true runner crash (which would show as offline).

 ### Impact

-*[Which workflows or operations are affected.]*
+- All CI jobs for all PRs stall — no status updates posted
+- Queue waits indefinitely for CI (which never posts success)
+- `sop-checklist` and other workflows time out on affected PRs
+- Looks like the runner is working (green in UI) but nothing executes
+
+### How to diagnose
+
+Add a debug step to a known-failing workflow:
+
+```bash
+# In a stalled job:
+curl -s http://localhost:8088/debug/pprof/trace?seconds=5 | head
+# Check runner process CPU — if near 0% while jobs are pending, runner is stalled
+```
+
+Check runner logs on the host (`/var/log/actrunner.log` or similar).

 ### Workaround

-*[How to work around this quirk.]*
+No operator workaround while stalled — the runner self-recovers. Options:
+1. **Wait** — runner typically recovers within 90 minutes
+2. **Restart the runner service** — `systemctl restart act_runner` (requires host access)
+3. **Move to a second runner** — if registered, re-route dispatch

 ### References

- internal#[N]: first observation
+- SEV-1 2026-05-17: runner stalled; self-recovered ~21:33Z after ~90 min
+- `feedback_gitea_runner_stall_accepted_jobs_no_execution`

 ---